In [1]:
from datasets import load_dataset, Dataset

In [2]:
train_ = load_dataset('csv', data_files = '/kaggle/input/lmsys-chatbot-arena/train.csv')
test_ = load_dataset('csv', data_files = '/kaggle/input/lmsys-chatbot-arena/test.csv')

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [3]:
train_, test_

(DatasetDict({
     train: Dataset({
         features: ['id', 'model_a', 'model_b', 'prompt', 'response_a', 'response_b', 'winner_model_a', 'winner_model_b', 'winner_tie'],
         num_rows: 57477
     })
 }),
 DatasetDict({
     train: Dataset({
         features: ['id', 'prompt', 'response_a', 'response_b'],
         num_rows: 3
     })
 }))

In [4]:
import numpy as np

def process_text(text):
    text = ' '.join(text.split('"')[1:-1])
    return text

def prepare_data(sample):
    sample['pa'] = '<prompt> ' + process_text(sample['prompt']) + ' <response> ' + process_text(sample['response_a'])
    sample['pb'] = '<prompt> ' + process_text(sample['prompt']) + ' <response> ' + process_text(sample['response_b'])
    return sample

def prepare_labels(sample):
    sample['label'] = np.argmax([sample['winner_model_a'],sample['winner_tie'], sample['winner_model_b']])
    return sample

In [5]:
train_test_split = train_['train'].map(prepare_labels).train_test_split(test_size = 0.2)

Map:   0%|          | 0/57477 [00:00<?, ? examples/s]

In [6]:
train = train_test_split['train'].map(prepare_data).remove_columns(['id','model_a','model_b','prompt','response_a','response_b','winner_model_a','winner_tie','winner_model_b'])
val = train_test_split['test'].map(prepare_data).remove_columns(['id','model_a','model_b','prompt','response_a','response_b','winner_model_a','winner_tie','winner_model_b'])
test = test_['train'].map(prepare_data).remove_columns(['id','prompt','response_a','response_b'])

Map:   0%|          | 0/45981 [00:00<?, ? examples/s]

Map:   0%|          | 0/11496 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

In [7]:
train, val, test

(Dataset({
     features: ['label', 'pa', 'pb'],
     num_rows: 45981
 }),
 Dataset({
     features: ['label', 'pa', 'pb'],
     num_rows: 11496
 }),
 Dataset({
     features: ['pa', 'pb'],
     num_rows: 3
 }))

In [8]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

model_ckpt = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def get_new_model():
    return AutoModelForSequenceClassification.from_pretrained(model_ckpt, num_labels = 3).to(device)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



In [9]:
def process(batch):
    return tokenizer(batch['pa'], batch['pb'], truncation = True, padding = 'max_length', max_length = 512)


train_ds = train.map(process, batched = True)
val_ds = val.map(process, batched = True)
test_ds = test.map(process, batched = True)

Map:   0%|          | 0/45981 [00:00<?, ? examples/s]

Map:   0%|          | 0/11496 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

In [10]:
train_ds, val_ds, test_ds

(Dataset({
     features: ['label', 'pa', 'pb', 'input_ids', 'attention_mask'],
     num_rows: 45981
 }),
 Dataset({
     features: ['label', 'pa', 'pb', 'input_ids', 'attention_mask'],
     num_rows: 11496
 }),
 Dataset({
     features: ['pa', 'pb', 'input_ids', 'attention_mask'],
     num_rows: 3
 }))

In [11]:
import os 

os.environ['WANDB_DISABLED'] = 'true'

In [12]:
from sklearn.metrics import log_loss, f1_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions
    logloss = log_loss(labels, preds)
    return {'log_loss' : logloss}

In [13]:
from transformers import TrainingArguments, Trainer

In [14]:
output_dir = '/kaggle/working/roberta_model'
batch_size = 32

args = TrainingArguments(
    output_dir = output_dir,
    learning_rate = 2e-5,
    weight_decay = 0.01,
    num_train_epochs = 2,
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size = batch_size,
    logging_steps = 100,
    eval_strategy = 'epoch',
    save_strategy = 'epoch',
    load_best_model_at_end = True,
    overwrite_output_dir = True,
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [15]:
torch.cuda.empty_cache()

In [16]:
trainer =  Trainer(
    model = get_new_model(),
    args = args,
    compute_metrics = compute_metrics,
    train_dataset = train_ds,
    eval_dataset = val_ds,
    tokenizer = tokenizer
)

trainer.train()

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Log Loss,Runtime,Samples Per Second,Steps Per Second
1,1.0684,1.063112,6.11923,112.4477,102.234,3.201
2,1.0173,1.036951,6.683072,112.193,102.466,3.209


TrainOutput(global_step=2874, training_loss=1.0575802828922152, metrics={'train_runtime': 2710.3359, 'train_samples_per_second': 33.93, 'train_steps_per_second': 1.06, 'total_flos': 1.2182184163289088e+16, 'train_loss': 1.0575802828922152, 'epoch': 2.0})

In [17]:
import shutil

shutil.rmtree(output_dir)
trainer.save_model('/kaggle/working/')