# Packages

In [1]:
from typing import Optional, Union
import pandas as pd
import numpy as np
import torch
from datasets import Dataset
from dataclasses import dataclass
from transformers import AutoTokenizer
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer, AutoModel
from torch.utils.data import DataLoader

# Kaggle input directory that the tokenizer is loaded from further down (AutoTokenizer.from_pretrained).
deberta_v3_large = '/kaggle/input/deberta-v3-large-hf-weights'

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


## Preparing the dataset

In [2]:
# Two-way lookup between the answer letters 'A'..'E' and their integer class ids 0..4.
option_to_index = dict(zip('ABCDE', range(len('ABCDE'))))
index_to_option = dict(enumerate('ABCDE'))

def preprocess(example):
    """Tokenize one multiple-choice example as five (prompt, option) pairs.

    Args:
        example: mapping with the keys 'prompt', 'A'..'E' and 'answer'.

    Returns:
        Whatever the module-level `tokenizer` returns for the five pairs
        (called with truncation disabled), with an added 'label' entry holding
        the integer id of `example['answer']` looked up in `option_to_index`.
    """
    # The same prompt is paired with each of the five candidate answers.
    prompts = [example['prompt'] for _ in range(5)]
    candidates = [example[letter] for letter in 'ABCDE']

    encoded = tokenizer(prompts, candidates, truncation=False)
    encoded['label'] = option_to_index[example['answer']]
    return encoded

@dataclass
class DataCollatorForMultipleChoice:
    """Pad a list of tokenized multiple-choice examples into batch tensors.

    Every feature is a mapping whose tokenizer fields (it must contain
    'input_ids') each hold one sequence per answer choice, plus a 'label'
    (or 'labels') entry with the integer class id. All choices of all examples
    are flattened, padded together via ``tokenizer.pad`` and then reshaped to
    ``(batch_size, num_choices, -1)``.

    The input features are left untouched, so the same feature dicts can be
    collated more than once.
    """
    # Object whose ``pad`` method performs the padding of the flattened choices.
    tokenizer: PreTrainedTokenizerBase
    # Forwarded unchanged to ``tokenizer.pad(padding=...)``.
    padding: Union[bool, str, PaddingStrategy] = True
    # Forwarded unchanged to ``tokenizer.pad(max_length=...)``.
    max_length: Optional[int] = None
    # Forwarded unchanged to ``tokenizer.pad(pad_to_multiple_of=...)``.
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        """Collate `features` into a dict of tensors.

        Args:
            features: non-empty list of per-example mappings as described on
                the class.

        Returns:
            dict with every padded tokenizer field viewed as
            ``(batch_size, num_choices, -1)`` and a 'labels' int64 tensor with
            one entry per example.
        """
        label_name = 'label' if 'label' in features[0].keys() else 'labels'
        # Read the labels instead of popping them: popping would mutate the
        # caller's dicts and make a second pass over the same features fail.
        labels = [feature[label_name] for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]['input_ids'])

        # One dict per (example, choice) pair, example-major, built in a single
        # pass; the label entry is skipped because it is not a per-choice field.
        flattened_features = [
            {k: v[i] for k, v in feature.items() if k != label_name}
            for feature in features
            for i in range(num_choices)
        ]

        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors='pt',
        )
        # Un-flatten: restore the choice dimension for every padded field.
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        batch['labels'] = torch.tensor(labels, dtype=torch.int64)
        return batch

In [3]:
tokenizer = AutoTokenizer.from_pretrained(deberta_v3_large)

# The test set carries no ground truth. A dummy answer lets it go through the
# same `preprocess` function that works for the train set.
test_df = pd.read_csv('/kaggle/input/kaggle-llm-science-exam/test.csv')
test_df['answer'] = 'A'

# Raw text columns are dropped once they have been tokenized.
raw_text_columns = ['prompt', 'A', 'B', 'C', 'D', 'E', 'answer']
test_dataset = Dataset.from_pandas(test_df.drop(columns=['id']))
tokenized_test_dataset = test_dataset.map(preprocess, remove_columns=raw_text_columns)

data_collator = DataCollatorForMultipleChoice(tokenizer=tokenizer)
# shuffle=False keeps the batches in the same order as the rows of `test_df`.
test_dataloader = DataLoader(
    tokenized_test_dataset,
    batch_size=10,
    shuffle=False,
    collate_fn=data_collator,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


  0%|          | 0/200 [00:00<?, ?ex/s]

# Predicting on the test set

We will do 3 iterations through the test set, at every iteration loading model weights I uploaded from a different training run and performing inference as we go.

In [4]:
%%time

# Collect one logits tensor per training run, then stack them along a new
# leading "run" dimension.
per_run_logits = []
for i in range(3):
    model = AutoModelForMultipleChoice.from_pretrained(
        f'/kaggle/input/science-exam-trained-model-weights/run_{i}'
    ).cuda()
    model.eval()

    batch_logits = []
    for batch in test_dataloader:
        # Move every tensor of the batch to the GPU before the forward pass.
        gpu_batch = {name: tensor.cuda() for name, tensor in batch.items()}
        with torch.no_grad():
            outputs = model(**gpu_batch)
        batch_logits.append(outputs.logits.cpu().detach())

    per_run_logits.append(torch.cat(batch_logits))

all_preds_my_runs = torch.stack(per_run_logits)

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


CPU times: user 48.1 s, sys: 7.06 s, total: 55.2 s
Wall time: 1min 39s


Now let us predict using the weights shared by `Hyc`.

In [5]:
# Same inference loop as for my own runs, but with the weights shared by Hyc.
model = AutoModelForMultipleChoice.from_pretrained('/kaggle/input/2023kagglellm-deberta-v3-large-model1').cuda()
model.eval()

hyc_batch_logits = []
for batch in test_dataloader:
    # Move every tensor of the batch to the GPU before the forward pass.
    gpu_batch = {name: tensor.cuda() for name, tensor in batch.items()}
    with torch.no_grad():
        outputs = model(**gpu_batch)
    hyc_batch_logits.append(outputs.logits.cpu().detach())

hyc_preds = torch.cat(hyc_batch_logits)

In [6]:
# Sanity check: show the shape of the stacked logits from my runs next to the shape of the Hyc-model logits.
all_preds_my_runs.shape, hyc_preds.shape

(torch.Size([3, 200, 5]), torch.Size([200, 5]))

# The Voting Ensemble

Let us now combine the predictions with the `voting ensemble` approach.

`hyc_preds` achieve a higher score on the LB. I will want my models to overrule `hyc_preds` only if they all agree.

In [7]:
from collections import defaultdict

# row index -> list of top-3 choice indices, one entry appended per model.
voting_ensemble = defaultdict(list)

for run_logits in all_preds_my_runs:
    for row, row_logits in enumerate(run_logits):
        # Choice indices ordered from highest to lowest logit, cut to the best three.
        top3 = row_logits.argsort(descending=True)[:3]
        voting_ensemble[row].append(top3)

In [8]:
# Append the Hyc model's top-3 choice indices after the entries already stored for each row.
for row, row_logits in enumerate(hyc_preds):
    top3 = row_logits.argsort(descending=True)[:3]
    voting_ensemble[row].append(top3)

For each example in the test set, we now have the top 3 predicted answers from each of our models.

Let us now combine the votes, giving more weight to the predictions from the weights shared by `Hyc` (`hyc_preds`).

In [9]:
# Weighted vote per test row. Each of the first three rankings stored for a row
# contributes 3/2/1 points to its 1st/2nd/3rd choice; the fourth ranking (the
# Hyc model) contributes 9.3/5.8/2.9 points.
predictions = []
for row in range(all_preds_my_runs.shape[1]):
    votes = defaultdict(int)

    for ranking in voting_ensemble[row][:3]:
        for points, choice in zip((3, 2, 1), ranking):
            votes[choice.item()] += points

    # Use a dedicated local name here: rebinding `hyc_preds` would replace the
    # full Hyc logits tensor with a single row's ranking and break any cell
    # that is (re-)run afterwards and still expects the logits.
    hyc_ranking = voting_ensemble[row][3]
    # 3 * 3.1 = 9.3 exceeds the 9 points of three first-place votes from my
    # runs, so those alone cannot unseat the Hyc top pick. A choice that is
    # also ranked 2nd or 3rd by the Hyc model (9 + 5.8 or 9 + 2.9) still can.
    for points, choice in zip((3 * 3.1, 2 * 2.9, 1 * 2.9), hyc_ranking):
        votes[choice.item()] += points

    # Highest score first; the sort is stable, so ties keep the order in which
    # the choices first received a vote.
    ranked_choices = sorted(votes, key=votes.get, reverse=True)
    predictions.append(ranked_choices[:3])

Now that we have carried out the "voting", let us combine the predictions from the `voting ensemble` into a submission.

In [10]:
# Peek at the ensembled top-3 choice indices of the first five test rows.
predictions[:5]

[[3, 1, 2], [0, 2, 3], [0, 2, 3], [2, 1, 0], [1, 0, 3]]

# Creating the submission

Let us now assign a letter corresponding to each predicted id (0 -> 'A', 1 -> 'B', etc). 

In [11]:
# Index an array of the five letters with the id lists to translate every id in one step.
answer_letters = np.array(['A', 'B', 'C', 'D', 'E'])
predictions_as_answer_letters = answer_letters[predictions]
predictions_as_answer_letters[:3]

array([['D', 'B', 'C'],
       ['A', 'C', 'D'],
       ['A', 'C', 'D']], dtype='<U1')

And let us now go from this representation to outputting a string with the 3 highest rated answers separated by a space.

In [12]:
# Join the three letters of every row with spaces, then store the result both
# as a plain list and as a new `prediction` column on `test_df`.
top3_letters = predictions_as_answer_letters[:, :3]
predictions_as_string = list(map(' '.join, top3_letters))
test_df['prediction'] = predictions_as_string
predictions_as_string[:3]

['D B C', 'A C D', 'A C D']

And we are done! 🥳

Let us now output our submission.

In [13]:
# Keep only the two submission columns and write them without the DataFrame index.
submission_columns = ['id', 'prediction']
submission = test_df[submission_columns]
submission.to_csv('submission.csv', index=False)

# Read the file back to confirm what was actually written.
pd.read_csv('submission.csv').head()

Unnamed: 0,id,prediction
0,0,D B C
1,1,A C D
2,2,A C D
3,3,C B A
4,4,B A D


I hope you enjoyed this notebook!

**If you found this useful, please upvote 👉 [dataset where I share the weights from the trained models](https://www.kaggle.com/datasets/radek1/science-exam-trained-model-weights) 👈**

Thank you, appreciate your help! 🙏😊

Thank you for reading and happy Kaggling!