In [None]:
import importlib
import subprocess
import sys

def ensure_package(pkg):
    try:
        importlib.import_module(pkg)
        print(f"{pkg} is already installed")
    except ImportError:
        print(f"Installing {pkg}...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", pkg])

for package in ["protobuf", "tiktoken", "sentencepiece"]:
    ensure_package(package)

⬇️ Installing protobuf...
✅ tiktoken is already installed
✅ sentencepiece is already installed


In [None]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset
from transformers import (
    AutoTokenizer, 
    AutoModelForMultipleChoice, 
    TrainingArguments, 
    Trainer,
    DataCollatorForMultipleChoice,
)
from sklearn.model_selection import train_test_split

In [None]:
root_path = "./essay-gap"
MODEL_NAME = "microsoft/deberta-v3-large"
MAX_LEN = 256
BATCH_SIZE = 2
GRADIENT_ACCUMULATION_STEPS = 4 # 2x4=8 effective batch size
EPOCHS = 5
LEARNING_RATE = 5e-6
WEIGHT_DECAY = 0.01
EPOCHS = 4

# Dataset

In [None]:
class EssayGapDataset(Dataset):
    def __init__(self, df, tokenizer, is_test=False):
        self.df = df
        self.tokenizer = tokenizer
        self.is_test = is_test

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
                
        first_sentences = [str(row['before'])] * 4
        second_sentences = [
            str(row[f'opt_{i}']) + " " + str(row['after']) for i in range(4)
        ]

        tokenized_examples = self.tokenizer(
            first_sentences, 
            second_sentences, 
            truncation=True, 
            max_length=MAX_LEN, 
            padding="max_length"
        )
        
        batch = {k: v for k, v in tokenized_examples.items()}
        
        if not self.is_test:
            batch['label'] = int(row['label'])
        else:
            batch['label'] = 0
            
        return batch

In [24]:
train_df_full = pd.read_csv(f"{root_path}/train.csv")
test_df = pd.read_csv(f"{root_path}/test.csv")
train_df, val_df = train_test_split(train_df_full, test_size=0.1, random_state=42)
len(train_df), len(val_df), len(test_df)

(288, 32, 80)

# Model

In [25]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForMultipleChoice.from_pretrained(MODEL_NAME)

train_dataset = EssayGapDataset(train_df, tokenizer)
val_dataset = EssayGapDataset(val_df, tokenizer)
test_dataset = EssayGapDataset(test_df, tokenizer, is_test=True)

Some weights of DebertaV2ForMultipleChoice were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    per_device_eval_batch_size=BATCH_SIZE * 2,
    num_train_epochs=EPOCHS,
    weight_decay=WEIGHT_DECAY,
    fp16=True,
    report_to="none",
    load_best_model_at_end=True,
    save_total_limit=1,
    warmup_ratio=0.1,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=DataCollatorForMultipleChoice(tokenizer),
)

print("Starting training...")
trainer.train()

  trainer = Trainer(


Starting training...


Epoch,Training Loss,Validation Loss
1,No log,0.355305
2,No log,0.437507
3,No log,0.122964
4,No log,0.153565
5,No log,0.135892


TrainOutput(global_step=360, training_loss=0.025969401995340983, metrics={'train_runtime': 102.3168, 'train_samples_per_second': 14.074, 'train_steps_per_second': 3.518, 'total_flos': 381512894054400.0, 'train_loss': 0.025969401995340983, 'epoch': 5.0})

In [31]:
print("Predicting on test set...")
predictions = trainer.predict(test_dataset)
preds_indices = np.argmax(predictions.predictions, axis=1)

Predicting on test set...


# Submission

In [32]:
submission = pd.DataFrame({"sampleID": test_df["sampleID"], "answer": preds_indices})
submission.head()

Unnamed: 0,sampleID,answer
0,100,0
1,101,0
2,102,2
3,103,0
4,104,3


In [33]:
submission.to_csv("submission.csv", index=False)