In [1]:
from datasets import load_dataset, Dataset

In [2]:
# Read the competition test set through the generic CSV loader. With a bare
# `data_files` path the rows are exposed under the 'train' split key, which is
# how later cells access them.
test_csv_path = '/kaggle/input/lmsys-chatbot-arena/test.csv'
test_ = load_dataset('csv', data_files=test_csv_path)

Generating train split: 0 examples [00:00, ? examples/s]

In [3]:
import numpy as np

def process_text(text):
    """Flatten a double-quoted string into space-separated fragments.

    The input is split on every double-quote character; the first and last
    fragments (whatever precedes the first quote and follows the last one)
    are dropped, and the remaining fragments are joined with single spaces.
    Fragments that sit *between* quoted items (for example a ',' separator)
    are kept as-is, so '["a","b"]' becomes 'a , b'.

    Args:
        text: String to flatten.

    Returns:
        The joined inner fragments, or '' when `text` contains fewer than
        two double quotes.
    """
    fragments = text.split('"')
    inner_fragments = fragments[1:-1]
    return ' '.join(inner_fragments)

def prepare_data(sample):
    """Add the two model input strings ('pa' and 'pb') to a dataset row.

    Each string has the form
    '<prompt> {flattened prompt} <response> {flattened response}', built from
    the row's 'prompt' field and, respectively, 'response_a' / 'response_b'.
    Flattening is done by `process_text`.

    Args:
        sample: Mapping with 'prompt', 'response_a' and 'response_b' strings.
            It is updated in place.

    Returns:
        The same `sample` object with 'pa' and 'pb' set.
    """
    # The prompt prefix is identical for both candidates, so build it once.
    prompt_prefix = '<prompt> ' + process_text(sample['prompt'])
    for target_key, response_key in (('pa', 'response_a'), ('pb', 'response_b')):
        sample[target_key] = prompt_prefix + ' <response> ' + process_text(sample[response_key])
    return sample

In [4]:
# Derive the 'pa' / 'pb' text columns for every row, then drop the raw columns
# the model does not consume. The ids are read again from `test_` when the
# submission is built.
raw_columns = ['id', 'prompt', 'response_a', 'response_b']
test = test_['train'].map(prepare_data).remove_columns(raw_columns)

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

In [5]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Directory of the checkpoint; both the tokenizer and the classifier weights
# are loaded from it.
model_ckpt = '/kaggle/input/lmsys-train'

# Run on the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

def get_new_model():
    """Load a fresh 3-label sequence classifier from `model_ckpt` onto `device`.

    Returns:
        The model returned by `AutoModelForSequenceClassification.from_pretrained`
        after being moved to the module-level `device`.
    """
    classifier = AutoModelForSequenceClassification.from_pretrained(model_ckpt, num_labels=3)
    return classifier.to(device)

In [6]:
def process(batch):
    """Tokenize a batch of ('pa', 'pb') text pairs as two-segment inputs.

    Every encoded pair is truncated and padded to exactly 512 tokens, so all
    rows have the same length and can later be stacked into one tensor.

    Args:
        batch: Mapping with parallel 'pa' and 'pb' sequences of strings.

    Returns:
        Whatever the module-level `tokenizer` returns for the batch.
    """
    encode_options = {
        'truncation': True,
        'padding': 'max_length',
        'max_length': 512,
    }
    return tokenizer(batch['pa'], batch['pb'], **encode_options)

# Tokenize the whole test set in batches; the tokenizer's output columns
# (input_ids and attention_mask are the ones read during inference) are added
# alongside the existing 'pa' / 'pb' columns.
test_ds = test.map(process, batched = True)

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

In [7]:
# Instantiate the classifier once; the same instance scores every batch in the
# inference loop.
model = get_new_model()

In [8]:
# Ask PyTorch's caching allocator to release unused cached GPU memory before
# inference starts.
torch.cuda.empty_cache()

In [9]:
%%time

batch_size = 32
outputs = []  # one 1-D logits tensor per test example, in dataset order
num_examples = len(test_ds)

# Inference only: no autograd graph is needed for any of the forward passes.
with torch.no_grad():
    for start in range(0, num_examples, batch_size):
        end = min(start + batch_size, num_examples)
        batch = test_ds[start:end]

        input_ids = torch.tensor(batch['input_ids']).to(device)
        attention_mask = torch.tensor(batch['attention_mask']).to(device)

        output = model(input_ids=input_ids, attention_mask=attention_mask)
        # Extending a list with a 2-D tensor appends its rows one by one.
        outputs.extend(output.logits)

CPU times: user 146 ms, sys: 116 ms, total: 262 ms
Wall time: 771 ms


In [10]:
# Stack the per-example logits into one 2-D tensor (one row per example) and
# normalise each row into class probabilities.
logits = torch.stack(outputs)
preds = logits.softmax(dim=-1)

preds

tensor([[0.3027, 0.3894, 0.3079],
        [0.3694, 0.3471, 0.2834],
        [0.2679, 0.2996, 0.4325]], device='cuda:0')

In [11]:
import pandas as pd

# Reorder the model's probability columns into the submission layout: indexing
# with [0, 2, 1] puts model column 0 under winner_model_a, column 2 under
# winner_model_b and column 1 under winner_tie.
# NOTE(review): this presumes the checkpoint's label ids are 0 = model A,
# 1 = tie, 2 = model B — confirm against the training notebook.
#
# Convert to a NumPy array explicitly instead of handing a torch.Tensor to
# pandas: the implicit conversion is version-dependent and can yield object
# cells holding 0-d tensors, which would be written to the CSV as
# 'tensor(...)' strings.
probabilities = preds[:, [0, 2, 1]].detach().cpu().numpy()
sub = pd.DataFrame(probabilities, columns=['winner_model_a', 'winner_model_b', 'winner_tie'])

# Materialise the ids as a plain list so pandas receives an ordinary sequence
# regardless of what column type the datasets library returns.
sub.insert(0, 'id', list(test_['train']['id']))

sub.to_csv('submission.csv', index=False)