In [1]:
import pandas as pd
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset
from torch.nn.functional import softmax
from sklearn.metrics import accuracy_score, f1_score

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Read data/train.csv into a DataFrame
train_df = pd.read_csv(filepath_or_buffer="data/train.csv")

# Add a column with the labels
def get_label(row):
    """Convert the winner indicator columns of one row into a class id.

    Args:
        row: mapping-like object indexable by column name (e.g. a DataFrame
            row) exposing 'winner_model_a' and 'winner_model_b'.

    Returns:
        0 if 'winner_model_a' equals 1, otherwise 1 if 'winner_model_b'
        equals 1, otherwise 2.
    """
    # Columns are checked in priority order; a later column is only read
    # when every earlier one did not match.
    for class_id, column in enumerate(('winner_model_a', 'winner_model_b')):
        if row[column] == 1:
            return class_id
    # Neither model is flagged as the winner.
    return 2

# Row-wise: derive the integer class id for every training example
train_df['label'] = train_df.apply(func=get_label, axis="columns")

In [None]:
# Load the tokenizer published under the "distilbert-base-uncased" checkpoint name
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# Build the model input text: prompt, response A and response B,
# joined in that order by the literal " [SEP] " marker
train_df['text'] = train_df.apply(
    lambda row: f"{row['prompt']} [SEP] {row['response_a']} [SEP] {row['response_b']}",
    axis="columns",
)

In [None]:
# PyTorch Dataset wrapper that tokenizes one text per lookup
class KaggleDataset(Dataset):
    """Map-style dataset pairing raw texts with integer class labels.

    Nothing is tokenized up front: each ``__getitem__`` call runs the
    tokenizer on a single text, padded/truncated to ``max_len``.

    Args:
        texts: indexable, sized collection of input strings.
        labels: indexable collection of class ids; ``labels[i]`` is the label
            of ``texts[i]``.
        tokenizer: callable invoked as ``tokenizer(text, **options)`` whose
            result is indexable by 'input_ids' and 'attention_mask'.
        max_len: value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, texts, labels, tokenizer, max_len=512):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        # The dataset size is driven by the texts only.
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized text at ``idx`` together with its label.

        Returns:
            dict with 'input_ids' and 'attention_mask' (each flattened to 1-D)
            and 'labels' (a ``torch.long`` tensor built from the stored label).
        """
        sample_text, sample_label = self.texts[idx], self.labels[idx]

        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",   # pad every sample to max_length
            truncation=True,        # cut anything longer than max_length
            return_tensors="pt",
        )
        encoded = self.tokenizer(sample_text, **tokenizer_options)

        # Flatten each encoded field to 1-D
        # (presumably the tokenizer returns shape (1, max_len) -- TODO confirm).
        item = {
            field: encoded[field].flatten()
            for field in ('input_ids', 'attention_mask')
        }
        item['labels'] = torch.tensor(sample_label, dtype=torch.long)
        return item

# Split the data into train (80%) and validation (20%) with a fixed seed.
# train_test_split returns the two halves of each input array back to back:
# (texts_train, texts_val, labels_train, labels_val). The names below follow
# that order, so every X_* holds texts and every y_* holds labels.
X_train, X_test, y_train, y_test = train_test_split(
    train_df['text'],
    train_df['label'],
    test_size=0.2,
    random_state=42,
)

# Create the datasets: texts first, then their matching labels
train_dataset = KaggleDataset(X_train.tolist(), y_train.tolist(), tokenizer)
val_dataset = KaggleDataset(X_test.tolist(), y_test.tolist(), tokenizer)


In [None]:
# Load the DistilBERT checkpoint with a 3-output classification head
# (one output per label value 0 / 1 / 2)
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=3,
)

# Training configuration
training_args = TrainingArguments(
    # Output locations
    output_dir='./results',
    logging_dir='./logs',
    # Batch sizes (per device)
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Optimisation schedule
    num_train_epochs=3,
    warmup_steps=500,
    weight_decay=0.01,
    # Run validation at the end of every epoch
    eval_strategy="epoch",
)

# Evaluation metrics reported by the trainer
def compute_metrics(p):
    """Score predictions with accuracy and weighted F1.

    Args:
        p: object exposing ``predictions`` (per-class scores; the arg-max
            along axis 1 is taken as the predicted class) and ``label_ids``
            (the reference labels).

    Returns:
        dict with keys "accuracy" and "f1" (F1 uses ``average="weighted"``).
    """
    y_true = p.label_ids
    y_pred = p.predictions.argmax(axis=1)
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": f1_score(y_true, y_pred, average="weighted"),
    }

# Wire model, configuration, metrics and both datasets into the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Run fine-tuning; keep the returned summary and echo it as the cell result
train_output = trainer.train()
train_output

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,1.0996,1.09884,0.341771,0.174109
2,1.0883,1.087868,0.376131,0.261923
3,1.0825,1.082209,0.386134,0.295979


TrainOutput(global_step=17244, training_loss=1.0931815189652696, metrics={'train_runtime': 3521.5242, 'train_samples_per_second': 39.171, 'train_steps_per_second': 4.897, 'total_flos': 1.827327624493363e+16, 'train_loss': 1.0931815189652696, 'epoch': 3.0})

In [None]:
# Read data/test.csv into a DataFrame
test_df = pd.read_csv(filepath_or_buffer="data/test.csv")

# Build the input text exactly as for training:
# prompt, response A and response B joined by the literal " [SEP] " marker
test_df['text'] = test_df.apply(
    lambda row: f"{row['prompt']} [SEP] {row['response_a']} [SEP] {row['response_b']}",
    axis="columns",
)

# Wrap the texts for PyTorch. KaggleDataset needs one label per text,
# so every test row gets a placeholder label of 0.
test_dataset = KaggleDataset(
    test_df['text'].tolist(),
    [0 for _ in test_df.index],
    tokenizer,
)

In [None]:
# Run inference on the test set and keep the raw per-class scores
predictions = trainer.predict(test_dataset)
logits = predictions.predictions

# Softmax along dim 1 converts each row of logits into probabilities
logits_tensor = torch.tensor(logits)
probs = torch.softmax(logits_tensor, dim=1).numpy()

In [None]:
# Assemble the submission: the test ids plus one probability column per class.
# Column k of `probs` corresponds to label k (0 -> model A, 1 -> model B, 2 -> tie).
submission_df = pd.DataFrame({
    "id": test_df["id"],
    **{
        column: probs[:, class_id]
        for class_id, column in enumerate(
            ("winner_model_a", "winner_model_b", "winner_tie")
        )
    },
})

# Write the submission without the DataFrame index
submission_df.to_csv("submission.csv", index=False)