https://www.kaggle.com/code/eishkaran/kaggle-llm-science-exam

In [14]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import torch
from transformers import AutoTokenizer, T5ForConditionalGeneration, AutoModelForMultipleChoice,TrainingArguments,Trainer 
from typing import Optional, Union
import torch
from datasets import Dataset
from dataclasses import dataclass
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy

In [16]:
# The labelled competition questions are used as the validation set for the
# baseline score computed further down.
valid_csv_path = 'data/train.csv'
df_valid = pd.read_csv(valid_csv_path)
df_valid.head(3)

Unnamed: 0,id,prompt,A,B,C,D,E,answer
0,0,Which of the following statements accurately d...,MOND is a theory that reduces the observed mis...,MOND is a theory that increases the discrepanc...,MOND is a theory that explains the missing bar...,MOND is a theory that reduces the discrepancy ...,MOND is a theory that eliminates the observed ...,D
1,1,Which of the following is an accurate definiti...,Dynamic scaling refers to the evolution of sel...,Dynamic scaling refers to the non-evolution of...,Dynamic scaling refers to the evolution of sel...,Dynamic scaling refers to the non-evolution of...,Dynamic scaling refers to the evolution of sel...,A
2,2,Which of the following statements accurately d...,The triskeles symbol was reconstructed as a fe...,The triskeles symbol is a representation of th...,The triskeles symbol is a representation of a ...,The triskeles symbol represents three interloc...,The triskeles symbol is a representation of th...,A


In [None]:
from transformers import T5ForConditionalGeneration, T5Model, T5Tokenizer

# The scoring loops call the model with `labels=` and read `.loss`. Only the
# conditional-generation head accepts labels and returns a loss; the bare
# `T5Model` does not, so it cannot be used for scoring.
tokenizer = T5Tokenizer.from_pretrained("t5-large")
model = T5ForConditionalGeneration.from_pretrained("t5-large")

# Place the weights on the GPU when one is available so that they share a
# device with the inputs built by the scoring loops.
if torch.cuda.is_available():
    model = model.cuda()

In [6]:
# Offline (Kaggle) alternative to the hub download:
# model_path = '/kaggle/input/transformers/t5-large'
# model      = T5ForConditionalGeneration.from_pretrained(model_path).cuda()
# tokenizer  = AutoTokenizer.from_pretrained(model_path)

## checking baseline score
# For every question, each of the five options is scored by the T5 loss of
# producing "<extra_id_0> <option>" given "<prompt> <extra_id_0>".  The three
# options with the lowest loss form the prediction, which is scored with
# credit 1, 1/2, 1/3 for a correct answer at rank 1, 2, 3.
option_letters = list("ABCDE")
rank_credit = [1, 1 / 2, 1 / 3]

valid_score = 0
model.eval()
# Send inputs wherever the model's weights are instead of assuming CUDA.
device = model.device
for _, row in tqdm(df_valid.iterrows(), total=df_valid.shape[0]):
    input_ids = tokenizer(row['prompt'] + " <extra_id_0>", return_tensors="pt").input_ids.to(device)
    labels = tokenizer(
        ["<extra_id_0> " + row[letter] for letter in option_letters],
        return_tensors="pt",
        padding=True,
    ).input_ids
    # `padding=True` pads all five options to one common length, so pad tokens
    # would otherwise be counted in the loss of every shorter option.  -100 is
    # the label value the loss ignores.
    labels[labels == tokenizer.pad_token_id] = -100

    scores = []
    for option_labels in labels:
        with torch.no_grad():
            loss = model(input_ids=input_ids, labels=option_labels.unsqueeze(0).to(device)).loss
        scores.append(loss.item())

    # Ascending argsort: lowest loss first.
    predict = np.array(option_letters)[np.argsort(scores)][:3].tolist()
    if row['answer'] in predict:
        valid_score += rank_credit[predict.index(row['answer'])]
valid_score /= df_valid.shape[0]
print(f'score = {valid_score}')
# output recorded from the original run (padding still counted in the loss): score = 0.5883

In [None]:
# Score the test questions with the T5 model.
# `submit_preds` collects, per question, the five per-option losses in A..E
# order (lower loss = preferred option); `submit_ids` collects the question ids.

df_test = pd.read_csv('/kaggle/input/kaggle-llm-science-exam/test.csv')
option_letters = list("ABCDE")

model.eval()
# Send inputs wherever the model's weights are instead of assuming CUDA.
device = model.device
submit_ids, submit_preds = [], []
for _, row in tqdm(df_test.iterrows(), total=df_test.shape[0]):
    input_ids = tokenizer(row['prompt'] + " <extra_id_0>", return_tensors="pt").input_ids.to(device)
    labels = tokenizer(
        ["<extra_id_0> " + row[letter] for letter in option_letters],
        return_tensors="pt",
        padding=True,
    ).input_ids
    # `padding=True` pads all five options to one common length, so pad tokens
    # would otherwise be counted in the loss of every shorter option.  -100 is
    # the label value the loss ignores.
    labels[labels == tokenizer.pad_token_id] = -100

    scores = []
    for option_labels in labels:
        with torch.no_grad():
            loss = model(input_ids=input_ids, labels=option_labels.unsqueeze(0).to(device)).loss
        scores.append(loss.item())

    submit_ids.append(row['id'])
    submit_preds.append(scores)

In [None]:
# Second model: DeBERTa v3, whose predictions are merged with the T5 scores.

# Answer letters and their integer positions, plus lookups in both directions.
options = 'ABCDE'
indices = list(range(5))

option_to_index = dict(zip(options, indices))
index_to_option = dict(zip(indices, options))

def preprocess(example):
    """Tokenize one question as five (prompt, option) sentence pairs.

    `example` is a dictionary with keys 'prompt', 'A', 'B', 'C', 'D', 'E' and
    'answer'.  The module-level `tokenizer` that is bound when this function is
    called does the encoding.  The returned encoding additionally carries
    'label': the integer position of `example['answer']` within `options`.
    """
    # One candidate answer per option letter, in A..E order.
    choice_texts = [example[letter] for letter in options]
    # A multiple-choice model consumes question/answer pairs, so the prompt is
    # repeated once for each of the five candidates.
    prompt_copies = [example['prompt']] * 5

    encoded = tokenizer(prompt_copies, choice_texts, truncation=True)
    encoded['label'] = option_to_index[example['answer']]
    return encoded

@dataclass
class DataCollatorForMultipleChoice:
    """Pad and batch multiple-choice features.

    Each feature is a mapping whose tokenizer fields (e.g. 'input_ids') hold one
    entry per answer choice, plus a 'label' (or 'labels') entry.  Calling the
    collator returns a dict of tensors viewed as (batch_size, num_choices, -1),
    with the labels gathered under 'labels' as an int64 tensor.
    """
    # Tokenizer whose `pad` method pads the flattened choices to a common length.
    tokenizer: PreTrainedTokenizerBase
    # The three options below are forwarded unchanged to `tokenizer.pad`.
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        """Collate `features` (a non-empty list of feature mappings) into one batch."""
        label_name = 'label' if 'label' in features[0].keys() else 'labels'
        # Read the labels without popping them, so the caller's feature dicts are
        # left intact and the same features can be collated more than once.
        labels = [feature[label_name] for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]['input_ids'])

        # Flatten to one dict per (feature, choice), feature-major, skipping the
        # label entry; a single comprehension avoids repeated list concatenation.
        flattened_features = [
            {k: v[i] for k, v in feature.items() if k != label_name}
            for feature in features
            for i in range(num_choices)
        ]

        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors='pt',
        )
        # Restore the (batch, choice) structure that was flattened for padding.
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        batch['labels'] = torch.tensor(labels, dtype=torch.int64)
        return batch
        

# The Trainer below is used for prediction only (no `train()` call and no
# evaluation dataset), so only the arguments that affect inference are set.
# Training and evaluation-schedule options are left at their defaults.
training_args = TrainingArguments(
    output_dir='./',
    per_device_eval_batch_size=1,
    report_to='none'
)

# Fine-tuned multiple-choice checkpoint.
# NOTE: this rebinds the notebook-level `tokenizer` and `model`; `preprocess`
# picks up this tokenizer because it reads the module-level name at call time.
model_dir = '/kaggle/input/llm-sci-exam-deberta-large-run01'
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForMultipleChoice.from_pretrained(model_dir)

trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
)

test_df = pd.read_csv('/kaggle/input/kaggle-llm-science-exam/test.csv')

# `preprocess` requires an 'answer' key, which test.csv is given here as a
# constant placeholder; it only produces dummy labels and does not influence
# the predicted scores used later.
test_df['answer'] = 'A'

# Tokenize the test questions with `preprocess` and drop the raw text columns.
test_ds = Dataset.from_pandas(test_df)
tokenized_test_ds = test_ds.map(preprocess, batched=False, remove_columns=['prompt', 'A', 'B', 'C', 'D', 'E', 'answer'])

# Here we'll generate our "real" predictions on the test set
test_predictions = trainer.predict(tokenized_test_ds)

In [None]:
from sklearn.preprocessing import normalize

# Blend the two models with equal weight.  Both parts are oriented so that a
# smaller value means a better option: the T5 part is a loss already, and the
# DeBERTa predictions are negated before normalisation.
t5_part = normalize(submit_preds) * 0.5
deberta_part = normalize(-test_predictions.predictions) * 0.5
final_predictions = t5_part + deberta_part

# For each question keep the three best letters (ascending sort), space-separated.
letters = np.array(list("ABCDE"))
final_preds = []
for s in final_predictions:
    top_three = letters[np.argsort(s)][:3].tolist()
    final_preds.append(' '.join(top_three))

submission = pd.DataFrame({'id': submit_ids, 'prediction': final_preds})
submission.to_csv('submission.csv', index=False)