In [None]:
# Mount point handed to drive.mount() in the next cell. Left blank here; set it before running.
drive_mount = ''
# Directory the notebook chdir()s into before reading "train.csv". Left blank here; set it before running.
data_dir=''

In [None]:
# Colab-specific: mount Google Drive at the configured mount point.
from google.colab import drive
drive.mount(drive_mount)

In [None]:
import os
# Work from the data directory so later relative paths (e.g. "train.csv") resolve against it.
os.chdir(data_dir)

In [None]:
# !pwd

In [None]:

import json

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# Raise the pandas display limit to 500 rows for the inspection cells below.
pd.set_option('display.max_rows', 500)

In [None]:
# Load the raw training table; the path is relative to the working directory set earlier.
train_df = pd.read_csv("train.csv")

In [None]:
# train_df.head()

In [None]:
# (rows, columns) of the raw table, before any parsing or exploding.
train_df.shape

In [None]:
# Raw prompt cell of the first row, still a string at this point (it is decoded in the next cell).
train_df.iloc[0]['prompt']

In [None]:
# Decode the first prompt cell to see its structure. json.loads is used instead of
# eval so that text coming from the CSV is parsed as data and never executed as code.
# NOTE(review): assumes the column is JSON-encoded (the later "null" handling suggests so).
json.loads(train_df.iloc[0]['prompt'])

In [None]:
# Raw response_a cell of the first row (not yet decoded).
train_df.iloc[0]['response_a']

In [None]:
# Raw response_b cell of the first row (not yet decoded).
train_df.iloc[0]['response_b']

In [None]:
## Categorize the winner model result
# Column name holding the row-wise maximum among the three outcome flags.
outcome_columns = ['winner_model_a','winner_model_b','winner_tie']
train_df['winner_result'] = train_df[outcome_columns].idxmax(axis=1)

a_won = train_df['winner_result'] == 'winner_model_a'
b_won = train_df['winner_result'] == 'winner_model_b'

# Name of the winning model, or the literal 'winner_tie' when neither side won.
train_df['winner_model_name'] = np.select(
    [a_won, b_won],
    [train_df['model_a'], train_df['model_b']],
    default='winner_tie',
)
# Integer class label: 0 = model A won, 1 = model B won, 2 = everything else (tie).
train_df['label'] = np.select([a_won, b_won], [0, 1], default=2)

In [None]:
# Check the derived winner_result / winner_model_name / label columns.
train_df.head()

In [None]:
def _parse_json_list(raw):
    """Decode one JSON-encoded list cell, mapping JSON ``null`` elements to ``np.nan``.

    ``json.loads`` replaces the previous ``eval`` for two reasons:
    * ``eval`` executes whatever text the CSV contains as Python code;
    * the old ``x.replace("null", "np.nan")`` rewrote every occurrence of the
      substring "null", including ones inside the response text itself.
    Missing turns are still represented by ``np.nan`` so the ``isnull()`` checks
    later in the notebook keep behaving the same way.
    """
    return [np.nan if item is None else item for item in json.loads(raw)]


# Decode the three parallel per-turn lists (prompt, response A, response B).
train_df['prompt'] = train_df['prompt'].apply(_parse_json_list)
train_df['response_a'] = train_df['response_a'].apply(_parse_json_list)
train_df['response_b'] = train_df['response_b'].apply(_parse_json_list)

In [None]:
# Explode the three parallel lists together so each conversation turn becomes its own row.
# NOTE(review): explode keeps the source row's index label on every produced row, so index
# labels are no longer unique after this cell — prefer boolean masks over label-based selection.
train_df = train_df.explode(['prompt', 'response_a','response_b'])

In [None]:
# Confirm the frame now holds one row per turn.
train_df.head()

# EDA
* Encode failure: some models fail in the middle of a prompt session but are still recorded as the winning model.


In [None]:
# 1-based position of each row within its conversation id (the turn number after exploding).
train_df['RowNum'] = train_df.groupby('id').cumcount() + 1
# Per-turn key of the form "<id>_<RowNum>".
train_df['session'] = train_df['id'].astype(str) + '_' + train_df['RowNum'].astype(str)
train_df.head()

In [None]:
# Show a few turns where model B's response is missing.
train_df.loc[train_df['response_b'].isnull()].head()

In [None]:
# Inspect every turn of one hard-coded conversation id (presumably one containing a missing response — verify).
train_df.loc[train_df['id'] == 57180984]

In [None]:
# Inspect every turn of a second hard-coded conversation id.
train_df.loc[train_df['id'] == 134445396]

In [None]:
# 1 where the model's response for the turn is missing, 0 otherwise.
# Converted to plain arrays so the assignment is purely positional.
train_df['encode_fail_a'] = train_df['response_a'].isnull().astype(int).to_numpy()
train_df['encode_fail_b'] = train_df['response_b'].isnull().astype(int).to_numpy()

In [None]:
# train_df.groupby('RowNum')[['encode_fail_a','encode_fail_b']].value_counts() ## Does not seems like longer session result in more failure

In [None]:
# Number of turns in each conversation (largest RowNum per id), repeated on every row of that id.
train_df['max_session'] = train_df.groupby('id')['RowNum'].transform('max')

In [None]:
## Some model fail in the middle session but still won
# train_df.query("encode_fail_a == 1 or encode_fail_b == 1 and max_session>RowNum") #.groupby('winner_model_name').count()

In [None]:
## Which model is more likely to fail?
a_failed = train_df['encode_fail_a'] == 1
a_ok = train_df['encode_fail_a'] == 0
b_failed = train_df['encode_fail_b'] == 1
b_ok = train_df['encode_fail_b'] == 0

# First matching condition wins: both failed -> 'both_fail'; exactly one failed -> that
# model's name; anything else -> 'none_fail'.
train_df['fail_model'] = np.select(
    [a_failed & b_failed, a_failed & b_ok, a_ok & b_failed],
    ['both_fail', train_df['model_a'], train_df['model_b']],
    default='none_fail',
)




In [None]:
# Share of all turns attributed to each failure category. 'none_fail' is removed by
# label instead of by position (.iloc[1:]), so the chart stays correct even if
# 'none_fail' is not the most frequent category.
fail_share = train_df['fail_model'].value_counts(normalize=True).drop('none_fail', errors='ignore')
ax = fail_share.plot(kind='bar', title='Share of turns with an encode failure, by failing model')
ax.set_ylabel('Share of all turns')

In [None]:
## Which two models are more likely to be compared together?

In [None]:
## Let's remove the turns where both models failed and the result is not a tie.
# A boolean mask is used instead of drop(index=...): the index labels repeat across
# the turns of one conversation (they come from the pre-explode rows), so dropping by
# label would also remove that conversation's healthy turns.
both_failed_with_winner = (train_df['fail_model'] == 'both_fail') & (train_df['label'] != 2)
# .copy() so later column assignments act on an independent frame, not a slice.
train_df = train_df.loc[~both_failed_with_winner].copy()
train_df.head()

In [None]:
# train_df.loc[train_df['response_a'].isnull()]

In [None]:
# Spot-check response_a for every turn of one hard-coded conversation id.
train_df.loc[train_df['id']==16350735][['response_a']]

# We can treat the problem like a multiple choice classification problem

In [None]:

def get_options(row):
  """Build the three answer candidates for one turn.

  Args:
    row: mapping-like object (e.g. a DataFrame row) with 'response_a' and
      'response_b' entries.

  Returns:
    ``[response_a, response_b, 'both']`` where a missing response is replaced by
    an empty string.
  """
  def _text_or_blank(response):
    # Missing responses are stored as NaN/None, not as the string 'NaN', so the
    # original string comparison never matched and "nan" leaked into the model
    # input. The 'NaN' sentinel is still honoured for backward compatibility.
    if isinstance(response, str):
      return '' if response == 'NaN' else response
    # is_scalar guard keeps pd.isna from returning an array for list-like values.
    if pd.api.types.is_scalar(response) and pd.isna(response):
      return ''
    return response

  return [_text_or_blank(row['response_a']), _text_or_blank(row['response_b']), 'both']

# One candidate list per turn, built row by row.
train_df['options'] = train_df.apply(get_options, axis=1)

In [None]:
# Check the new 'options' column.
train_df.head()

# Train a model for the multiple choice classification

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForMultipleChoice, get_scheduler
# from transformers.optimization import AdamW
from torch.optim import AdamW
from tqdm import tqdm
import torch.nn.functional as F

In [None]:
# Load tokenizer and model
# Both are initialised from the same "bert-base-uncased" checkpoint name; the model
# uses the multiple-choice head, scoring each (prompt + option) sequence.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForMultipleChoice.from_pretrained("bert-base-uncased")

In [None]:
# One dict per turn with the fields the dataset class reads (prompt, options, label) plus the session key.
train_examples = train_df[['session','prompt','options','label']].to_dict(orient='records')

In [None]:




# Dataset class
class MultipleChoiceDataset(Dataset):
    """Torch dataset that tokenises one multiple-choice example per item.

    Each example is a mapping with "prompt", "options" (an iterable of candidate
    answers) and "label" (index of the correct candidate). Every candidate is
    encoded as the single string "<prompt> <option>".
    """

    def __init__(self, examples, tokenizer, max_length=128):
        self.examples, self.tokenizer, self.max_length = examples, tokenizer, max_length

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        record = self.examples[idx]
        question = record["prompt"]
        candidates = record["options"]
        target = record["label"]

        # One text per candidate: the prompt followed by that candidate.
        candidate_texts = []
        for candidate in candidates:
            candidate_texts.append(str(question) + " " + str(candidate))

        # Pad/truncate every candidate to the same fixed length so they stack
        # into (num_choices, seq_len) tensors.
        encoded = self.tokenizer(
            candidate_texts,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors="pt",
        )

        sample = {key: encoded[key] for key in ("input_ids", "attention_mask")}
        sample["labels"] = torch.tensor(target)
        return sample





In [None]:
# Wrap the per-turn records in the tokenising dataset (uses the class default max_length).
dataset = MultipleChoiceDataset(train_examples, tokenizer)
# shuffle=True: the records are in file order with consecutive turns of the same
# conversation (and therefore the same label) adjacent, so unshuffled batches are
# highly correlated. The seeded generator keeps the batch order reproducible.
dataloader = DataLoader(dataset, batch_size=20, shuffle=True, generator=torch.Generator().manual_seed(42))

In [None]:
# Look at the first raw (untokenised) record held by the dataset.
dataset.examples[0]

In [None]:
# Create dataset and dataloader


# Optimizer and scheduler
# Single source of truth for the epoch count: it sizes the scheduler's decay horizon
# AND drives the loop below, so the two can no longer drift apart.
NUM_EPOCHS = 3
optimizer = AdamW(model.parameters(), lr=5e-5)
num_training_steps = len(dataloader) * NUM_EPOCHS
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# Training loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

model.train()
for epoch in range(NUM_EPOCHS):
    loop = tqdm(dataloader, leave=True)
    # The description is constant within an epoch, so set it once rather than per batch.
    loop.set_description(f"Epoch {epoch}")
    for batch in loop:
        input_ids = batch["input_ids"].to(device)       # shape: (batch_size, num_choices, seq_len)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # Passing labels makes the model return the loss alongside the logits.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()

        # One optimizer step and one scheduler step per batch, then clear the
        # gradients for the next batch.
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        loop.set_postfix(loss=loss.item())


In [None]:
# Persist the whole model object (not just its weights) into the working directory.
# NOTE(review): saving a full module this way relies on pickling; loading it back presumably
# needs the same class definitions importable — consider save_pretrained()/state_dict() instead.
torch.save(model, './entire_model.pth')

In [None]:
# dataset.examples['prompt']

In [None]:
# Switch the model to evaluation mode and make sure it sits on the inference device
# (GPU when available, otherwise CPU).
model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

In [None]:
def predict_with_probs(prompt, options, max_length=128):
    """Score every option for one prompt with the notebook-level model.

    Uses the notebook-level ``tokenizer``, ``model`` and ``device``.

    Args:
        prompt: the question text (converted with ``str``).
        options: sequence of candidate answers (each converted with ``str``).
        max_length: token length each "<prompt> <option>" text is padded/truncated
            to. Defaults to 128, the value previously hard-coded here; keep it equal
            to the length used when the training dataset was built.

    Returns:
        dict with
        * "predicted_index": index of the highest-probability option,
        * "predicted_choice": that option,
        * "probabilities": list of ``(option, probability)`` pairs in input order.
    """
    # Same "<prompt> <option>" text format the training dataset produces.
    texts = [str(prompt) + " " + str(option) for option in options]
    inputs = tokenizer(
        texts,
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors="pt"
    )

    # Reshape to (1, num_choices, seq_len): a batch holding this single example.
    input_ids = inputs["input_ids"].unsqueeze(0).to(device)
    attention_mask = inputs["attention_mask"].unsqueeze(0).to(device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits  # shape: (1, num_choices)
        probs = F.softmax(logits, dim=1)  # softmax over choices
        predicted_index = torch.argmax(probs, dim=1).item()

    prob_values = probs[0].cpu().tolist()

    # Return choice index, answer text, and probabilities
    return {
        "predicted_index": predicted_index,
        "predicted_choice": options[predicted_index],
        "probabilities": list(zip(options, prob_values))
    }


In [None]:
# Smoke-test inference on the first turn of the training frame.
prompt = train_df.iloc[0]['prompt']
options= train_df.iloc[0]['options']

result = predict_with_probs( prompt, options)

print(f"Predicted choice: {result['predicted_choice']} (index {result['predicted_index']})")
print("Probabilities:")
# Each entry is an (option text, probability) pair, in the same order as `options`.
for i, (choice, prob) in enumerate(result["probabilities"]):
    print(f"option {i}: {prob:.4f} \n {choice}")

In [None]:
# Probability assigned to the third option ('both') for the example above.
result['probabilities'][2][1]

In [None]:
def get_result(row):
  """Run the classifier on one row and flatten the prediction into a dict.

  Reads row['prompt'] and row['options']. The returned keys, in order, are the
  predicted label, the predicted option text, and the probabilities of the first,
  second and third option.
  """
  prediction = predict_with_probs(row['prompt'], row['options'])
  # Keep only the probability from each (option, probability) pair.
  option_probs = [probability for _, probability in prediction['probabilities']]

  flattened = {}
  flattened['predicted_label'] = prediction['predicted_index']
  flattened['predicted_prefered_response'] = prediction['predicted_choice']
  flattened['predicted_prob_a'] = option_probs[0]
  flattened['predicted_prob_b'] = option_probs[1]
  flattened['predicted_prob_both'] = option_probs[2]
  return flattened

In [None]:
# Run inference row by row; result_type='expand' turns each returned dict into columns,
# which are stored under the five pred_* names listed on the left.
# NOTE(review): these are the same rows the model was trained on, so any metric computed
# from them is a training-set score, not a held-out estimate.
train_df[['pred_label','pred_response','pred_a','pred_b','pred_both']] = train_df.apply(lambda x:get_result(x), axis=1, result_type='expand')


In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, ConfusionMatrixDisplay, classification_report
def get_performance(pred, truth):
  """Print a classification report, plot the confusion matrix and return the metrics.

  Args:
    pred: predicted class ids (0 = response_a, 1 = response_b, 2 = tied).
    truth: true class ids, same encoding.

  Returns:
    ``(cm, accuracy, precision, recall, f1)`` where ``cm`` is the 3x3 confusion
    matrix and precision/recall/f1 are support-weighted averages.
  """
  class_ids = [0, 1, 2]
  target_names = ['response_a', 'response_b', 'tied']
  # Pin the class ids explicitly: without labels=, a class missing from both truth
  # and pred shrinks the matrix/report to two classes and no longer matches the
  # three display names.
  cm = confusion_matrix(truth, pred, labels=class_ids)
  accuracy = accuracy_score(truth, pred)
  precision = precision_score(truth, pred, average='weighted')
  recall = recall_score(truth, pred, average='weighted')
  f1 = f1_score(truth, pred, average='weighted')
  print(classification_report(truth, pred, labels=class_ids, target_names=target_names))
  disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=target_names)
  disp.plot()
  plt.show()
  return cm, accuracy, precision, recall, f1


In [None]:
# Compare the predicted labels against the true labels of the same (training) rows.
cm, accuracy, precision, recall, f1 = get_performance(train_df['pred_label'],train_df['label'])

In [None]:
# Overall accuracy returned by get_performance above.
accuracy