# Essay gap

Solution author: Asandei Stefan-Alexandru

In [None]:
import os

import pandas as pd
import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [2]:
# Data directory. Set the ESSAY_GAP_ROOT environment variable to point the
# notebook at another location; the original author's path stays the default
# so existing setups keep working unchanged.
root_path = os.environ.get("ESSAY_GAP_ROOT", "/home/stefan/ioai-prep/kits/essay-gap")

# Run on the GPU when one is visible to torch, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Seed torch's global RNG so that runs are repeatable.
seed = 42
torch.manual_seed(seed)

<torch._C.Generator at 0x7fe79f938950>

# Model

In [3]:
# Load the pretrained checkpoint together with its matching tokenizer. The
# sequence-classification head is created with 4 output units and moved to
# the selected device.
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=4,
)
model = model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Dataset

In [16]:
# Read the training split and preview its first rows.
train_csv_path = f"{root_path}/train.csv"
train_df = pd.read_csv(train_csv_path)

train_df.head()

Unnamed: 0,sampleID,before,after,opt_0,opt_1,opt_2,opt_3,label
0,0,The life cycle of a Christmas tree from the se...,One issue that farmers face is the destruction...,The remaining development of the tree greatly ...,The belief in the divinity of Jesus leads to t...,Essentially the recipe brings together what tr...,It is a matter of some debate as to which was ...,0
1,1,Slopes flatter than 25 degrees or steeper than...,The rule of thumb is: A slope that is flat eno...,In her 1850 book The First Christmas in New En...,"In Latin America and the Iberian Peninsula, th...","On steeper slopes, this can occur with as litt...",When the incidence of human triggered avalanch...,3
2,2,"Most workplaces conduct a ""Christmas Party"" so...","Likewise, schools, TAFE (vocational training),...",As many people take their holidays between Chr...,The frequency with which avalanches form in a ...,"In doing so, they employ on-the-ground physica...",The area in and around the basilica begins to ...,0
3,3,The Chronography of 354 illuminated manuscript...,By around 385 the feast for the birth of Jesus...,The eastern inland region where the country is...,In a sermon delivered in Antioch on December 2...,This remains one of the most extensive such ma...,"A cold front, the leading edge of a cooler mas...",1
4,4,English personifications of Christmas were fir...,His character was maintained during the late 1...,"In a sermon in 386, Gregory of Nyssa specifica...",The first evidence of decorated trees associat...,"Following the Restoration in 1660, Father Chri...","In 614, the Persian Sassanid Empire, supported...",2


In [12]:
class EssayDataset(Dataset):
    """Multiple-choice gap-filling dataset.

    Each row of ``df`` provides a ``before`` passage, an ``after`` passage and
    ``num_options`` candidate fillers in the columns ``opt_0`` ...
    ``opt_{num_options - 1}``. Every candidate is rendered as
    ``"{before} [SEP] {option} [SEP] {after}"`` and tokenized to exactly
    ``max_len`` positions (padded or truncated).

    Args:
        df: DataFrame holding the columns described above. An optional
            ``label`` column carries the index of the correct option.
        tokenizer: Callable following the Hugging Face tokenizer call
            convention; it receives a list of strings plus ``max_length``,
            ``padding``, ``truncation`` and ``return_tensors`` and returns a
            mapping with ``input_ids`` and ``attention_mask`` tensors.
        max_len: Fixed sequence length of every encoded candidate.
        num_options: Number of ``opt_*`` columns to read per row.
    """

    def __init__(self, df, tokenizer, max_len=512, num_options=4):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.num_options = num_options

    def __len__(self):
        """Return the number of rows (questions) in the dataset."""
        return len(self.df)

    def __getitem__(self, idx):
        """Encode all candidates of row ``idx``.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` of shape
            ``(num_options, max_len)`` and, when the frame has a ``label``
            column, ``label`` as a scalar ``torch.long`` tensor.
        """
        row = self.df.iloc[idx]
        before = row["before"]
        after = row["after"]

        options = [row[f"opt_{i}"] for i in range(self.num_options)]
        texts = [f"{before} [SEP] {option} [SEP] {after}" for option in options]

        # One batched call yields (num_options, max_len) tensors directly, so
        # no squeeze/stack is needed. A bare `.squeeze()` would also drop the
        # sequence axis when max_len == 1.
        enc = self.tokenizer(
            texts,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

        item = {
            "input_ids": enc["input_ids"],
            "attention_mask": enc["attention_mask"],
        }
        # Labels exist only for the training split; inference rows omit them.
        if "label" in row.index:
            item["label"] = torch.tensor(row["label"], dtype=torch.long)
        return item

In [7]:
# Wrap the training frame in the dataset and iterate it in shuffled batches.
batch_size = 8
train_dataset = EssayDataset(df=train_df, tokenizer=tokenizer)
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=batch_size)

# Training

In [8]:
# Training hyper-parameters.
lr = 2e-5
epochs = 3

# Cross-entropy over the per-option scores; AdamW updates every model weight.
loss_fn = nn.CrossEntropyLoss()
optimizer = AdamW(params=model.parameters(), lr=lr)

In [9]:
# Fine-tune the model as a multiple-choice scorer: every (before, option, after)
# sequence is passed through the model separately, logit 0 of its output is
# used as that option's score, and cross-entropy over the options of one
# question is minimised against the index of the correct option.
for epoch in range(epochs):
    model.train()
    total_loss = 0.0

    for batch in tqdm(train_loader, desc=f"Epoch {epoch + 1}"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        # Shape: (rows, options, seq_len). The row count gets a loop-local
        # name on purpose: unpacking into `batch_size` would overwrite the
        # notebook-level setting that later DataLoaders are built from.
        n_rows, num_options, seq_len = input_ids.shape

        # Flatten to (rows * options, seq_len) so each candidate is one sequence.
        input_ids_flat = input_ids.view(n_rows * num_options, seq_len)
        attention_mask_flat = attention_mask.view(n_rows * num_options, seq_len)

        outputs = model(input_ids=input_ids_flat, attention_mask=attention_mask_flat)

        # Regroup per question and keep only logit 0 as the option score,
        # giving a (rows, options) matrix for the loss.
        option_scores = outputs.logits.view(n_rows, num_options, -1)[:, :, 0]

        loss = loss_fn(option_scores, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean of the per-batch losses for this epoch.
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1} - Loss: {avg_loss:.4f}")

Epoch 1: 100%|██████████| 40/40 [00:29<00:00,  1.38it/s]


Epoch 1 - Loss: 1.2285


Epoch 2: 100%|██████████| 40/40 [00:29<00:00,  1.37it/s]


Epoch 2 - Loss: 0.6299


Epoch 3: 100%|██████████| 40/40 [00:28<00:00,  1.39it/s]

Epoch 3 - Loss: 0.1954





# Submission

In [None]:
# Read the test split from the same data directory.
test_csv_path = f"{root_path}/test.csv"
test_df = pd.read_csv(test_csv_path)

In [13]:
# Score every option of every test question and keep the index of the
# highest-scoring option as the prediction.
model.eval()
predictions = []

test_dataset = EssayDataset(test_df, tokenizer)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Predicting"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        # Loop-local row count: reusing the name `batch_size` here would
        # overwrite the notebook-level setting, so re-running this cell would
        # build the DataLoader with the size of the last (possibly partial) batch.
        n_rows, num_options, seq_len = input_ids.shape
        input_ids_flat = input_ids.view(n_rows * num_options, seq_len)
        attention_mask_flat = attention_mask.view(n_rows * num_options, seq_len)

        outputs = model(input_ids=input_ids_flat, attention_mask=attention_mask_flat)
        # Same scoring rule as in training: logit 0 of each option's output.
        option_scores = outputs.logits.view(n_rows, num_options, -1)[:, :, 0]

        preds = torch.argmax(option_scores, dim=1)
        # Collect plain Python ints rather than NumPy scalars.
        predictions.extend(preds.cpu().tolist())

Predicting: 100%|██████████| 10/10 [00:02<00:00,  3.70it/s]


In [14]:
# Pair each test sample ID with its predicted option index and preview it.
submission_columns = {"sampleID": test_df["sampleID"], "answer": predictions}
submission = pd.DataFrame(submission_columns)
submission.head()

Unnamed: 0,sampleID,answer
0,100,0
1,101,0
2,102,2
3,103,0
4,104,3


In [15]:
# Write the predictions to disk without the DataFrame index column.
submission.to_csv(path_or_buf="submission.csv", index=False)