In [None]:
import json
import pandas as pd

def make_dataset(dir):
    """Build ``train.csv`` and ``validation.csv`` from the reranked JSON files.

    The training split is the concatenation of the two ``working_6k_*`` files;
    the validation split comes from ``working/reranked.json``. Only the columns
    listed in ``columns`` are kept, in that order.

    Args:
        dir: Output directory. It is created if it does not exist yet.

    Raises:
        FileNotFoundError: If one of the input JSON files is missing.
        KeyError: If a loaded file lacks one of the expected columns.
    """
    import os  # local import: `os` is only imported in a later cell

    # f1 = "working_60k/reranked.json"
    f1 = "working_6k_sci/reranked.json"
    f2 = "working_6k_nonsci/reranked.json"
    f3 = "working/reranked.json"

    columns = ['prompt', 'A', 'C', 'B', 'D', 'E', 'answer', 'tier_2_passages']

    def _load_split(paths):
        """Read every JSON file in `paths` and return one frame with `columns`."""
        frames = []
        for path in paths:
            # Context manager so the file handle is closed deterministically.
            with open(path) as handle:
                frames.append(pd.DataFrame.from_records(json.load(handle)))
        return pd.concat(frames)[columns]

    # Without this, to_csv fails on a fresh checkout where `dir` is absent.
    if dir:
        os.makedirs(dir, exist_ok=True)

    train_df = _load_split([f1, f2])
    train_df.to_csv(f"{dir}/train.csv", index=False)

    val_df = _load_split([f3])
    # val_df['wikipedia_excerpt'] = None
    val_df.to_csv(f"{dir}/validation.csv", index=False)

# Write train.csv / validation.csv under kaggle_sci_qa/, the directory the
# next cell passes to load_dataset.
make_dataset("kaggle_sci_qa/")

In [None]:
from datasets import load_dataset
# Load the CSVs written by make_dataset. Later cells index the result by
# "train" and "validation". The name `swag` is a leftover from the SWAG
# tutorial this notebook was adapted from; the data is the science-QA set.
swag = load_dataset("kaggle_sci_qa")

In [None]:
import os
# Pin the process to a single GPU, numbering devices by PCI bus id.
# NOTE(review): presumably this has to run before anything initialises CUDA
# (torch is imported in a later cell) — confirm when reordering cells.
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"]="0"

In [None]:
# Checkpoint passed to both AutoTokenizer.from_pretrained and
# AutoModelForMultipleChoice.from_pretrained in the cells below.
# The commented alternatives are a local fine-tuned checkpoint and a smaller
# base model.
model_path = "sileod/deberta-v3-large-tasksource-nli"
# model_path = "deberta_ft/checkpoint-3005/"
# model_path = "microsoft/deberta-v3-base"

In [None]:
# Display one training record to see which fields are available.
swag["train"][10]

While it looks like there are a lot of fields here, it is actually pretty straightforward:

- `prompt`: the question text.
- `A`, `B`, `C`, `D`, `E`: the five candidate answers, only one of which is correct.
- `answer`: the letter (`A`–`E`) of the correct option; it is converted to an integer `label` during preprocessing.
- `tier_2_passages`: a stringified list of supporting passages, each with a `title` and a `passage`, from which the `context` for every question is built.

## Preprocess

The next step is to load the tokenizer that matches `model_path` (a DeBERTa-v3 checkpoint) to process the context together with the question and each of the five candidate answers:

In [None]:
from transformers import AutoTokenizer
# Tokenizer matching the chosen checkpoint. It is used as a module-level
# global by make_context and preprocess_function, and by the data collator.
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [None]:
from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer
# Load the checkpoint with a multiple-choice head.
# NOTE(review): ignore_mismatched_sizes=True presumably lets the head be
# re-initialised when the checkpoint's head shape differs (the checkpoint name
# suggests it was trained for NLI) — confirm from the loading warnings.
model = AutoModelForMultipleChoice.from_pretrained(model_path, ignore_mismatched_sizes=True)

In [None]:
import random
# Answer letters in label order: make_labels uses the position of the answer
# letter in this string as the integer label.
ABCDE = "ABCDE"
# Column names "0".."4": make_labels copies the options A..E into these
# columns and preprocess_function reads the options back from them.
ending_names = [str(i) for i in range(5)]


def preprocess_function(examples):
    """Tokenize a batch of questions as (context, "prompt option") pairs.

    For every example the context is paired with each answer option, giving
    ``len(ending_names)`` sequence pairs per example. The pairs are tokenized
    in one flat call and then regrouped per example.

    Args:
        examples: Batched mapping of column name to list of values. Must have
            "context", "prompt" and one column per entry of ``ending_names``.

    Returns:
        dict mapping each tokenizer output key to a list with one entry per
        example, each entry holding the per-option values.
    """
    num_choices = len(ending_names)

    # Build the flat lists directly. The previous sum(list_of_lists, [])
    # flattening copies the accumulator on every step (quadratic in batch size).
    first_sentences = [
        context for context in examples["context"] for _ in range(num_choices)
    ]
    second_sentences = [
        f"{header} {examples[end][i]}"
        for i, header in enumerate(examples["prompt"])
        for end in ending_names
    ]

    # NOTE(review): no max_length is passed, so truncation falls back to the
    # tokenizer's configured maximum — confirm it is finite for this checkpoint.
    tokenized_examples = tokenizer(first_sentences, second_sentences, truncation=True)

    # Regroup the flat outputs into chunks of num_choices, one chunk per example.
    return {
        k: [v[i : i + num_choices] for i in range(0, len(v), num_choices)]
        for k, v in tokenized_examples.items()
    }

# def keep_only_context(example):
#     return len(example['support']) > 0

def make_labels(example):
    """Add an integer ``label`` and positional option columns to one example.

    ``label`` is the position of the ``answer`` letter inside ``ABCDE``. The
    option texts stored under the letters are copied to the string keys
    "0".."4" in the same order. The example is modified in place and returned.
    """
    example['label'] = ABCDE.index(example['answer'])
    for position, letter in enumerate(ABCDE):
        example[str(position)] = example[letter]
    return example

def is_bad_passage(passage):
    """Return True when a passage looks like a table or list, not prose.

    A passage is rejected when it contains more than ten ``|`` characters, or
    when it has at least five lines and averages at most five
    space-separated words per line.
    """
    if passage.count("|") > 10:
        return True

    line_count = len(passage.split("\n"))
    if line_count < 5:
        return False

    word_count = len(passage.split(" "))
    return word_count / line_count <= 5

def make_context(example, max_openbook_len=1024, max_passage_len=512):
    """Assemble the ``context`` string for one example from its passages.

    Passages are taken in their stored order and appended as
    ``"<title>: <passage>\\n"`` (newlines inside a passage become spaces) until
    the token budget is used up. A passage is skipped when ``is_bad_passage``
    rejects it, when it is longer than ``max_passage_len`` tokens on its own
    (it is printed in that case), or when adding it would exceed
    ``max_openbook_len``. Iteration stops once the context exceeds the budget.

    Args:
        example: Mapping with a ``tier_2_passages`` entry holding the string
            form of a list of dicts with ``title`` and ``passage`` keys.
        max_openbook_len: Token budget for the whole context.
        max_passage_len: Maximum token length of a single passage.

    Returns:
        The same ``example`` object with ``example['context']`` set.
    """
    # SECURITY NOTE(review): eval() executes arbitrary Python found in the
    # column. It is kept so existing data parses exactly as before; if the
    # values are plain literals, ast.literal_eval is the safer choice.
    contexts = eval(example["tier_2_passages"])

    # An optional `wikipedia_excerpt` passage used to be prepended to
    # `contexts` here; that experiment is currently disabled.

    openbook = ""
    # Token length of the current openbook. It is refreshed only after an
    # append, not re-encoded on every loop iteration.
    openbook_len = len(tokenizer.encode(openbook))
    for context in contexts:
        if openbook_len > max_openbook_len:
            break
        passage = context['passage']
        if is_bad_passage(passage):
            continue
        title = context['title']
        passage = passage.replace("\n", " ")
        lpassage = len(tokenizer.encode(passage))
        if lpassage > max_passage_len:
            # Over-long passages are dropped; printing shows what was lost.
            print(passage)
            continue
        if lpassage + openbook_len > max_openbook_len:
            # Does not fit, but a later, shorter passage still might.
            continue
        openbook += f"""{title}: {passage}\n"""
        openbook_len = len(tokenizer.encode(openbook))

    example['context'] = openbook
    return example

In [None]:
# Three passes over every split:
#   1. make_context        -> adds the "context" string built from the passages
#   2. make_labels         -> adds the integer "label" and option columns "0".."4"
#   3. preprocess_function -> tokenizes (context, prompt + option) pairs in batches
swag_context = swag.map(make_context)
swag_clean = swag_context.map(make_labels)
tokenized_swag = swag_clean.map(preprocess_function, batched=True)

In [None]:
# Sanity check: find the longest tokenized sequence in the training split.
# `badguy` keeps the last example that has a sequence longer than 3000 tokens
# (None if there is none) so it can be inspected by hand.
maxlen = 0
badguy = None
for elem in tokenized_swag['train']:
    # elem['input_ids'] holds one token-id list per answer option.
    for ids in elem['input_ids']:
        maxlen = max(len(ids), maxlen)
        if len(ids) > 3000:
            badguy = elem
print(maxlen)

In [None]:
from dataclasses import dataclass
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from typing import Optional, Union
import torch


@dataclass
class DataCollatorForMultipleChoice:
    """
    Collator for multiple-choice batches: pads every (example, choice) sequence
    to a common length and returns tensors shaped (batch, num_choices, length).
    """

    # Tokenizer whose `pad` method performs the padding.
    tokenizer: PreTrainedTokenizerBase
    # The three fields below are forwarded unchanged to `tokenizer.pad`.
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        """Collate a list of per-example feature dicts into one padded batch.

        The label entry ("label" or "labels") is popped from every feature
        dict, so the dicts passed in are modified.
        """
        label_key = "label" if "label" in features[0].keys() else "labels"
        targets = [item.pop(label_key) for item in features]

        n_examples = len(features)
        n_choices = len(features[0]["input_ids"])

        # One row per (example, choice) pair so the tokenizer can pad them all
        # to the same length in a single call.
        rows = []
        for item in features:
            for choice_idx in range(n_choices):
                rows.append({key: values[choice_idx] for key, values in item.items()})

        padded = self.tokenizer.pad(
            rows,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        # Restore the (batch, choice, sequence) layout.
        batch = {
            key: tensor.view(n_examples, n_choices, -1) for key, tensor in padded.items()
        }
        batch["labels"] = torch.tensor(targets, dtype=torch.int64)
        return batch

In [None]:
# import evaluate

# accuracy = evaluate.load("accuracy")

In [None]:
import numpy as np

def map_at_k(predictions, labels, k):
    """Mean average precision at ``k`` for single-answer questions.

    Args:
        predictions: Per-example score rows (2-D, one score per choice);
            a higher score means a better-ranked choice.
        labels: Index of the correct choice for each example.
        k: Number of top-ranked choices that can earn credit.

    Returns:
        The mean over examples of ``1 / rank`` of the correct choice when it
        is among the top ``k``, and 0 for that example otherwise.
    """
    # Negate the scores so argsort ranks them in descending order.
    ranked = np.argsort(-1 * np.array(predictions), axis=1)[:, :k]

    total = 0
    for top_choices, gold in zip(ranked, labels):
        credit = [
            1 / rank if gold == choice else 0
            for rank, choice in enumerate(top_choices, start=1)
        ]
        total += np.sum(credit)
    return total / len(predictions)

def compute_metrics(eval_pred):
    """Metric callback for the Trainer: report MAP@3 and MAP@1.

    Args:
        eval_pred: Object exposing ``predictions`` and ``label_ids``, both of
            which must support ``.tolist()``.

    Returns:
        dict with the keys "map@3" and "map@1".
    """
    scores = eval_pred.predictions.tolist()
    gold = eval_pred.label_ids.tolist()

    map_at_3 = map_at_k(scores, gold, 3)
    map_at_1 = map_at_k(scores, gold, 1)
    return {"map@3": map_at_3, "map@1": map_at_1}

In [None]:
# Freeze backbone parameters: walk model.deberta's parameters in order and keep
# each one frozen until a parameter whose third dotted name component is a
# layer index greater than 23 is reached. The flag is sticky, so every
# parameter visited after that point is trainable as well.
# NOTE(review): if the checkpoint has 24 encoder layers (indices 0-23), as
# deberta-v3-large does, `layer_number > 23` is never true and the whole
# backbone stays frozen — confirm this is the intended threshold.
requires_grad = False
layer_number = 0
for name, param in model.deberta.named_parameters():
    try:
        layer_number = name.split(".")[2]
        layer_number = int(layer_number)
        if layer_number > 23:
            requires_grad = True
    except (IndexError, ValueError):
        # The name has no numeric third component (not a numbered layer
        # parameter), so the current flag is kept. Only these two errors are
        # expected; a bare `except` would also swallow KeyboardInterrupt.
        pass

    param.requires_grad = requires_grad

    # print(name, layer_number, "trainable:", param.requires_grad, param.numel())

In [None]:
def print_trainable_parameters(model):
    """
    Print how many of the model's parameter elements are trainable
    (``requires_grad`` is set) next to the total element count.
    """
    total = 0
    trainable = 0
    for _, tensor in model.named_parameters():
        count = tensor.numel()
        total += count
        if tensor.requires_grad:
            trainable += count
    print(f"trainable params: {trainable} || all params: {total} || ")

# Report how much of the model is still trainable after the freezing cell.
print_trainable_parameters(model)

In [None]:
# Fine-tuning configuration. With per_device_train_batch_size=1 and
# gradient_accumulation_steps=4, each optimizer step covers 4 examples per device.
training_args = TrainingArguments(
    output_dir="deberta_ft2",
    save_strategy="epoch",
    # optim='adamw_bnb_8bit',
    # max_grad_norm=0.3,
    warmup_ratio=0.03,
    # load_best_model_at_end=True,
    # gradient_checkpointing=True,
    learning_rate=3e-5,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=1,
    logging_steps=50,
    eval_steps=100,
    # NOTE(review): newer transformers releases rename this argument to
    # `eval_strategy` — confirm against the installed version.
    evaluation_strategy='steps',
    max_steps=61000,
    weight_decay=0.01,
    lr_scheduler_type='cosine',
    push_to_hub=False,
    fp16=True,
    tf32=True,
    report_to="none"
)

# NOTE(review): shuffle() is called without a seed and on the whole object
# (presumably a DatasetDict), so the order differs between runs and the
# validation split is reordered as well. A later cell calls trainer.predict on
# tokenized_swag["validation"]; its output will not line up with the row order
# of validation.csv — confirm this is intended.
tokenized_swag = tokenized_swag.shuffle()

# The collator pads each batch dynamically; compute_metrics reports MAP@3 and MAP@1.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_swag["train"],
    eval_dataset=tokenized_swag["validation"],
    tokenizer=tokenizer,
    data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning with the arguments configured above.
trainer.train()

In [None]:
# Evaluate on the validation split passed as eval_dataset; the cell output
# includes the map@3 / map@1 values from compute_metrics.
trainer.evaluate()

In [None]:
# Scratch check: tokenize two hand-written candidate strings as one batch.
# padding=True keeps the tensor conversion from failing when the two strings
# tokenize to different lengths; it changes nothing when the lengths match.
tokens = tokenizer(
    ["capital of paris is france. what is capital of france? paris", "capital of paris is france. what is capital of france? delhi"],
    padding=True,
    return_tensors='pt',
)

In [None]:
# Score the validation split, rank the options of every question by descending
# score, map the ranked indices to letters, and keep the top three letters as a
# space-separated string.
test_predictions = trainer.predict(tokenized_swag["validation"]).predictions
predictions_as_ids = np.argsort(-test_predictions, 1)
predictions_as_answer_letters = np.array(list('ABCDE'))[predictions_as_ids]
# NOTE(review): `test_df` is not defined in any cell visible in this notebook,
# so this chained assignment raises NameError on a fresh kernel unless the
# frame is created elsewhere — confirm where it should come from.
predictions_as_string = test_df['prediction'] = [
    ' '.join(row) for row in predictions_as_answer_letters[:, :3]
]

In [None]:
# Display the ranked answer letters, best first, one row per question.
predictions_as_answer_letters

In [None]:
# Reload the raw validation CSV written by make_dataset for manual inspection.
val = pd.read_csv('kaggle_sci_qa/validation.csv')

In [None]:
# val.to_dict(orient='records')

In [None]:
# Show the context string that make_context assembled for the first validation example.
print(swag_clean['validation'][0]['context'])