### Imports

In [70]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [71]:
import datasets
from functools import partial
import pandas as pd
from transformers import GPT2Tokenizer, GPTNeoForSequenceClassification, AutoTokenizer, OPTForCausalLM
import torch
import numpy as np

In [72]:
import os
import sys

# Make the project's `../src` directory (resolved relative to the notebook's
# working directory) importable, so the project-local imports in later cells resolve.
module_path = os.path.abspath(os.path.join('../src'))
# Guard against appending a duplicate entry when this cell is re-run.
if module_path not in sys.path:
    sys.path.append(module_path) 

In [73]:
from models.judge import train_judge_for_babi

In [74]:
# Use the GPU when PyTorch reports CUDA as available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [75]:
from utils import set_seed
set_seed(62)

In [76]:
from constants import FALSE_LABEL_STR, TRUE_LABEL_STR

# Class index <-> label string lookup tables (index 0 is the "false" label,
# index 1 the "true" label). The inverse map is derived from the forward map
# so the two can never drift apart.
id2label = dict(enumerate((FALSE_LABEL_STR, TRUE_LABEL_STR)))
label2id = {label: idx for idx, label in id2label.items()}

# Load Dataset

In [77]:
# Load the `eraser_multi_rc` dataset via Hugging Face `datasets`;
# the returned object is indexed by split name in the cells below.
multirc = datasets.load_dataset('eraser_multi_rc')

Using the latest cached version of the module from /home/felix/.cache/huggingface/modules/datasets_modules/datasets/eraser_multi_rc/88c60ad5598eeedb78e952696a3735bf7e6efe34ac06577c4f20c0eccef78a33 (last modified on Tue Aug 15 10:26:08 2023) since it couldn't be found locally at eraser_multi_rc., or remotely on the Hugging Face Hub.
Found cached dataset eraser_multi_rc (/home/felix/.cache/huggingface/datasets/eraser_multi_rc/default/0.1.1/88c60ad5598eeedb78e952696a3735bf7e6efe34ac06577c4f20c0eccef78a33)


  0%|          | 0/3 [00:00<?, ?it/s]

In [78]:
# Materialise the "train" split as a pandas DataFrame for row-wise processing.
multirc_train = pd.DataFrame(multirc["train"])

In [79]:
len(multirc_train)

24029

In [57]:
# Materialise the "validation" split as a pandas DataFrame for row-wise processing.
multirc_val = pd.DataFrame(multirc["validation"])

In [58]:
import re

def remove_superfluous_spaces(sentence):
    """Strip whitespace that directly precedes a punctuation mark.

    Every run of whitespace followed by one of ``, . ' " ?`` is collapsed so
    that only the punctuation character remains (e.g. ``"Hi , there ."``
    becomes ``"Hi, there."``). Whitespace elsewhere is left untouched.

    Args:
        sentence: The text to clean.

    Returns:
        The cleaned text as a new string.
    """
    # The capture group holds the punctuation mark; the whitespace before it
    # is matched but dropped by the replacement.
    return re.sub(r'\s+([,.\'\"\?])', lambda match: match.group(1), sentence)

In [60]:
def generate_prompt(context, question, answer):
    """Render a judge prompt with Context, Question and Answer sections.

    Each section consists of a header line followed by its content, and the
    sections are separated by a blank line.

    Args:
        context: Text placed under the ``Context:`` header.
        question: Text placed under the ``Question:`` header.
        answer: Text placed under the ``Answer:`` header.

    Returns:
        The assembled prompt string.
    """
    template = "Context:\n{context}\n\nQuestion:\n{question}\n\nAnswer:\n{answer}"
    return template.format(context=context, question=question, answer=answer)

In [61]:
def get_easy_prompt_from_row(row):
    """Build the "easy" prompt for one row, using only its evidence sentences.

    The context is the row's ``evidences`` entries, each cleaned of stray
    spaces before punctuation and joined with single spaces. The question and
    answer come from ``query_and_answer``, which is split on ``" || "``.

    Args:
        row: A mapping-like record with ``evidences`` (iterable of strings)
            and ``query_and_answer`` (string) entries.

    Returns:
        The prompt string produced by ``generate_prompt``.

    Raises:
        ValueError: If ``query_and_answer`` does not split into exactly two parts.
    """
    context = ' '.join(map(remove_superfluous_spaces, row['evidences']))
    raw_question, raw_answer = row["query_and_answer"].split(" || ")
    question = remove_superfluous_spaces(raw_question)
    answer = remove_superfluous_spaces(raw_answer)
    return generate_prompt(context, question, answer)

In [62]:
def generate_easy_multi_rc_data(data):
    """Create the "easy" dataset: evidence-only prompts paired with labels.

    Args:
        data: DataFrame holding at least the columns read by
            ``get_easy_prompt_from_row`` plus a ``label`` column.

    Returns:
        A new DataFrame with exactly the columns ``prompt`` and ``label``;
        the input frame is not modified.
    """
    prompts = data.apply(get_easy_prompt_from_row, axis=1)
    # `assign` works on a copy, so the caller's frame stays untouched.
    return data.assign(prompt=prompts)[["prompt", "label"]]

In [63]:
def get_hard_prompt_from_row(row):
    """Build the "hard" prompt for one row, using the full passage as context.

    The context is the row's ``passage`` cleaned of stray spaces before
    punctuation. The question and answer come from ``query_and_answer``,
    which is split on ``" || "``.

    Args:
        row: A mapping-like record with ``passage`` and ``query_and_answer``
            string entries.

    Returns:
        The prompt string produced by ``generate_prompt``.

    Raises:
        ValueError: If ``query_and_answer`` does not split into exactly two parts.
    """
    context = remove_superfluous_spaces(row["passage"])
    raw_question, raw_answer = row["query_and_answer"].split(" || ")
    question = remove_superfluous_spaces(raw_question)
    answer = remove_superfluous_spaces(raw_answer)
    return generate_prompt(context, question, answer)

In [64]:
def generate_hard_multi_rc_data(data):
    """Create the "hard" dataset: full-passage prompts added to the input rows.

    Unlike ``generate_easy_multi_rc_data``, all original columns are kept and
    ``prompt`` is added alongside them.

    Args:
        data: DataFrame holding at least the columns read by
            ``get_hard_prompt_from_row``.

    Returns:
        A new DataFrame with every input column plus ``prompt``; the input
        frame is not modified.
    """
    prompts = data.apply(get_hard_prompt_from_row, axis=1)
    # `assign` works on a copy, so the caller's frame stays untouched.
    return data.assign(prompt=prompts)

In [67]:
# Build both prompt variants for each split:
#   easy -> context is only the evidence sentences; result keeps just prompt + label
#   hard -> context is the full passage; result keeps all original columns + prompt
easy_mrc_train = generate_easy_multi_rc_data(multirc_train)
easy_mrc_val = generate_easy_multi_rc_data(multirc_val)
hard_mrc_train = generate_hard_multi_rc_data(multirc_train)
hard_mrc_val = generate_hard_multi_rc_data(multirc_val)

In [None]:
# Persist the processed splits as CSV files (without the DataFrame index).
# `to_csv` does not create missing parent directories, so make sure the output
# directory exists first; otherwise this cell fails on a fresh checkout.
os.makedirs("../data/processed", exist_ok=True)
easy_mrc_train.to_csv("../data/processed/easy_mrc_train.csv", index=False)
easy_mrc_val.to_csv("../data/processed/easy_mrc_val.csv", index=False)
hard_mrc_train.to_csv("../data/processed/hard_mrc_train.csv", index=False)
hard_mrc_val.to_csv("../data/processed/hard_mrc_val.csv", index=False)

# Train Judge

In [None]:
# Training-mode switches. `int8_training` is also passed as `load_in_8bit` /
# `low_cpu_mem_usage` when the model is loaded; all three are forwarded to the training call.
# NOTE(review): the inline comments below look attached to the wrong flags — the linked
# blog post is about automatic mixed precision (autocast), while "quantized weights"
# describes 8-bit loading. Also, both flags are True even though the last comment says
# to use autocast only when int8 is unsupported — confirm the intended combination.
int8_training = True  # https://pytorch.org/blog/accelerating-training-on-nvidia-gpus-with-pytorch-automatic-mixed-precision/
lora_training = True  # https://github.com/microsoft/LoRA
autocast_training = True  # Trains with quantized weights. Only use if your hardware doesn't support int8_training

In [None]:
# Load the GPT-Neo 1.3B tokenizer and a two-class sequence-classification model
# whose config carries the id2label / label2id mappings defined earlier.
model_name = "EleutherAI/gpt-neo-1.3B"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
# `int8_training` toggles both 8-bit weight loading and low-CPU-memory loading.
model = GPTNeoForSequenceClassification.from_pretrained(model_name, num_labels=2, id2label=id2label, 
            label2id=label2id, load_in_8bit=int8_training, low_cpu_mem_usage=int8_training)


In [None]:
# Register "<PAD>" as the tokenizer's padding token, tell the model config which
# token id is used for padding, and resize the embedding matrix to the tokenizer's
# current vocabulary size so the added token has an embedding row.
tokenizer.add_special_tokens({"pad_token": "<PAD>"})
model.config.pad_token_id = tokenizer.pad_token_id
model.resize_token_embeddings(len(tokenizer))

In [None]:
# Run identification and checkpoint handling; all values are forwarded to the training call.
run_name = "gpt-neo-1.3B"
project_name = "multiRC-Judge"
store_locally = False  # Set False if you want to delete any config + checkpoint files in models/ (doesn't delete from subdirectories)

# Optimisation hyper-parameters.
batch_size = 16
lr = 5e-5
lr_scheduler = None  # "cosine-annealing" | None

# Schedule lengths / intervals.
# NOTE(review): the exact meaning of `acc_every_batch` (and the units of the
# `*_every_*` intervals) is defined by the training function, which is not
# visible in this notebook — confirm against its documentation.
epochs = 10
acc_every_batch = 250
eval_every_batch = 250
save_every_epoch = 1

In [None]:
# The notebook only imports `train_judge_for_babi`, so calling
# `train_judge_for_multirc` raised NameError on a fresh kernel. Import it here.
# NOTE(review): assumes the function lives in `models.judge` next to
# `train_judge_for_babi` — confirm, and consider moving this import to the
# import cell at the top of the notebook.
from models.judge import train_judge_for_multirc

# Launch judge training with the model, tokenizer and settings defined in the cells above.
train_judge_for_multirc(
    model=model,
    tokenizer=tokenizer,
    model_name=model_name,
    run_name=run_name,
    project_name=project_name,
    device=device,
    lr=lr,
    lr_scheduler=lr_scheduler,
    autocast_training=autocast_training,
    int8_training=int8_training,
    lora_training=lora_training,
    batch_size=batch_size,
    store_locally=store_locally,
    epochs=epochs,
    acc_every_batch=acc_every_batch,
    eval_every_batch=eval_every_batch,
    save_every_epoch=save_every_epoch,
)