# Imports

In [1]:
# Enable IPython's autoreload extension in mode 2 so that imported modules are
# reloaded before each cell executes (edits to local source files take effect
# without restarting the kernel).
%load_ext autoreload
%autoreload 2

In [2]:
import datasets
from functools import partial
import pandas as pd
from transformers import GPT2Tokenizer, GPTNeoForSequenceClassification, AutoTokenizer, OPTForCausalLM
import torch
import numpy as np

[2023-08-15 09:40:27,541] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [3]:
import os
import sys

# Put the sibling `src` directory on the import path (once) so the project
# modules imported in the following cells can be resolved.
module_path = os.path.abspath(os.path.join(os.pardir, 'src'))
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [4]:
from models.judge import train_judge_for_babi

In [5]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [6]:
from utils import set_seed
# Fix the seed before any data is generated: the answer-variation helpers
# below draw from np.random.
# NOTE(review): presumably set_seed seeds NumPy (and torch) - confirm in utils.
set_seed(62)

In [7]:
from constants import FALSE_LABEL_STR, TRUE_LABEL_STR

# Class index -> label string, and the inverse lookup. Both are handed to the
# sequence-classification model when it is loaded.
id2label = dict(enumerate((FALSE_LABEL_STR, TRUE_LABEL_STR)))
label2id = {label: idx for idx, label in id2label.items()}

# Load Dataset

In [8]:
# Load the 'Muennighoff/babi' dataset through the `datasets` library.
babi = datasets.load_dataset('Muennighoff/babi')

Repo card metadata block was not found. Setting CardData to empty.


In [9]:
# Materialize the train split as a DataFrame so it can be filtered by task.
babi_train = pd.DataFrame(babi["train"])

In [10]:
# Same conversion for the validation split.
babi_val = pd.DataFrame(babi["validation"])

In [11]:
# Preview the first rows (columns: passage, question, answer, task).
babi_train.head()

Unnamed: 0,passage,question,answer,task
0,Mary moved to the bathroom.\nJohn went to the ...,Where is Mary?,bathroom,1
1,Mary moved to the bathroom.\nJohn went to the ...,Where is Daniel?,hallway,1
2,Mary moved to the bathroom.\nJohn went to the ...,Where is Daniel?,hallway,1
3,Mary moved to the bathroom.\nJohn went to the ...,Where is Daniel?,office,1
4,Mary moved to the bathroom.\nJohn went to the ...,Where is Sandra?,bathroom,1


In [12]:
def get_task_string(task_n, idx=0, with_answer=False):
    """Render one training example of a given task as a prompt string.

    Reads from the notebook-level ``babi_train`` DataFrame.

    Args:
        task_n: Value of the ``task`` column to select.
        idx: Positional index of the example within that task's rows.
        with_answer: When true, the example's answer is appended after the
            ``Answer:`` header; otherwise the prompt ends right after it.

    Returns:
        The formatted prompt string.
    """
    # Select the row once and reuse it for every field.
    example = babi_train[babi_train["task"] == task_n].iloc[idx]
    pieces = [
        f"Passage:\n{example['passage']}\nQuestion:\n{example['question']}\n\nAnswer:\n"
    ]
    if with_answer:
        pieces.append(f"{example['answer']}")
    return "".join(pieces)

# Preprocess Data

In [13]:
def generate_task_1_variations(question, correct_answer, possible_names=[]):
    """Build three correct and three incorrect answer phrasings for a task-1 question.

    Task-1 questions have the form "Where is <name>?" and are answered with a
    location.

    Args:
        question: The question text; the name is taken from its third
            whitespace-separated token (with the "?" stripped).
        correct_answer: The true location. Must be one of the six locations
            hard-coded below, otherwise ``list.remove`` raises ``ValueError``.
        possible_names: Names that may be substituted to build a "wrong person"
            statement. The list is never mutated. When it offers no name other
            than the one in the question, the "wrong person" variant is
            replaced by the "In the <wrong location>" variant instead of
            raising.

    Returns:
        ``(correct_answers, incorrect_answers)``, two lists of three strings.
        The last incorrect answer is, depending on a random draw, a short
        wrong-location phrase, a wrong-name statement, or the empty string.
    """
    t1_possible_answers = ['hallway', 'garden', 'office', 'kitchen', 'bedroom', 'bathroom']

    # "Where is Mary?" -> "Mary"
    name = question.split(" ")[2].split("?")[0]
    correct_answers = [
        correct_answer,
        f"{name} is in the {correct_answer}",
        f"In the {correct_answer}",
    ]

    possible_incorrect_answers = list(t1_possible_answers)
    possible_incorrect_answers.remove(correct_answer)
    incorrect_answer = np.random.choice(possible_incorrect_answers, size=1)[0]

    incorrect_answers = [
        incorrect_answer,
        f"{name} is in the {incorrect_answer}",
    ]

    # The order of RNG draws (choice, random, optional choice) is kept as-is so
    # that seeded runs with a valid name list produce the same output as before.
    r = np.random.random()

    # Filtering (rather than .remove) tolerates a missing name and guarantees
    # that the substituted name really differs from the one in the question.
    possible_false_names = [candidate for candidate in possible_names if candidate != name]

    if 0.75 <= r < 0.95 and possible_false_names:
        false_name = np.random.choice(possible_false_names, size=1)[0]
        incorrect_answers.append(f"{false_name} is in the {correct_answer}")
    elif r < 0.95:
        incorrect_answers.append(f"In the {incorrect_answer}")
    else:
        incorrect_answers.append("")

    return correct_answers, incorrect_answers


In [14]:
def generate_task_2_variations(question, correct_answer, possible_answers=[], possible_obj=[]):
    """Build three correct and three incorrect answer phrasings for a task-2 question.

    Task-2 questions have the form "Where is the <object>?".

    The full-sentence correct answer now starts with "The ", matching the
    phrasing of the incorrect sentences. Previously only incorrect sentences
    carried that prefix, so the surface form alone revealed the label.

    Args:
        question: The question text; the object is taken from its fourth
            whitespace-separated token (with the "?" stripped).
        correct_answer: The true location. Must be contained in
            ``possible_answers``, otherwise ``list.remove`` raises ``ValueError``.
        possible_answers: Candidate locations from which a wrong one is drawn.
        possible_obj: Objects that may be substituted to build a "wrong object"
            statement. When no object other than the one asked about is
            available, the "In the <wrong location>" variant is used instead of
            raising. Neither list is mutated.

    Returns:
        ``(correct_answers, incorrect_answers)``, two lists of three strings.
    """
    # "Where is the football?" -> "football"
    obj = question.split(" ")[3].split("?")[0]
    correct_answers = [
        correct_answer,
        f"The {obj} is in the {correct_answer}",
        f"In the {correct_answer}",
    ]

    possible_incorrect_answers = list(possible_answers)
    possible_incorrect_answers.remove(correct_answer)
    incorrect_answer = np.random.choice(possible_incorrect_answers, size=1)[0]

    incorrect_answers = [
        incorrect_answer,
        f"The {obj} is in the {incorrect_answer}",
    ]

    # RNG draw order (choice, random, optional choice) is unchanged.
    r = np.random.random()

    # Filter instead of .remove so a missing object does not raise.
    possible_false_obj = [candidate for candidate in possible_obj if candidate != obj]

    if 0.75 <= r < 0.95 and possible_false_obj:
        false_obj = np.random.choice(possible_false_obj, size=1)[0]
        incorrect_answers.append(f"The {false_obj} is in the {correct_answer}")
    elif r < 0.95:
        incorrect_answers.append(f"In the {incorrect_answer}")
    else:
        incorrect_answers.append("")

    return correct_answers, incorrect_answers


In [15]:
def generate_task_3_variations(question, correct_answer, possible_answers=[], possible_obj=[]):
    """Build three correct and three incorrect answer phrasings for a task-3 question.

    Task-3 questions have the form "Where was the <object> before the <place>?".

    The full-sentence correct answer now starts with "The ", matching the
    phrasing of the incorrect sentences. Previously only incorrect sentences
    carried that prefix, so the surface form alone revealed the label.

    Args:
        question: The question text; the object is the text between the first
            "the " and " before".
        correct_answer: The true location. Must be contained in
            ``possible_answers``, otherwise ``list.remove`` raises ``ValueError``.
        possible_answers: Candidate locations from which a wrong one is drawn.
        possible_obj: Objects that may be substituted to build a "wrong object"
            statement. When no object other than the one asked about is
            available, the "In the <wrong location>" variant is used instead of
            raising. Neither list is mutated.

    Returns:
        ``(correct_answers, incorrect_answers)``, two lists of three strings.
    """
    # "Where was the football before the kitchen?" -> "football"
    obj = question.split("the ")[1].split(" before")[0]
    correct_answers = [
        correct_answer,
        f"The {obj} was in the {correct_answer}",
        f"In the {correct_answer}",
    ]

    possible_incorrect_answers = list(possible_answers)
    possible_incorrect_answers.remove(correct_answer)
    incorrect_answer = np.random.choice(possible_incorrect_answers, size=1)[0]

    incorrect_answers = [
        incorrect_answer,
        f"The {obj} was in the {incorrect_answer}",
    ]

    # RNG draw order (choice, random, optional choice) is unchanged.
    r = np.random.random()

    # Filter instead of .remove so a missing object does not raise.
    possible_false_obj = [candidate for candidate in possible_obj if candidate != obj]

    if 0.75 <= r < 0.95 and possible_false_obj:
        false_obj = np.random.choice(possible_false_obj, size=1)[0]
        incorrect_answers.append(f"The {false_obj} was in the {correct_answer}")
    elif r < 0.95:
        incorrect_answers.append(f"In the {incorrect_answer}")
    else:
        incorrect_answers.append("")

    return correct_answers, incorrect_answers


In [16]:
def generate_task_4_variations(question, correct_answer, possible_answers=[]):
    """Build three correct and three incorrect answer phrasings for a task-4 question.

    Two question shapes are handled:

    * "What is the <place> <direction> of?": the place is the subject, so the
      true statement is "The <place> is <direction> of <answer>".
    * "What is <direction> of the <place>?": the answer is the subject, so the
      true statement is "The <answer> is <direction> of the <place>".

    The incorrect full sentence is the true statement with subject and object
    swapped.

    The question shape is decided by comparing the final word with "of"
    exactly. The previous substring test (``"of" in last_token``) also matched
    questions ending in "office?", which swapped the true and false statements
    for every such question.

    Args:
        question: The question text. Must mention one of the four compass
            directions and one entry of ``possible_answers``.
        correct_answer: The true answer. Must be contained in
            ``possible_answers``, otherwise ``list.remove`` raises ``ValueError``.
        possible_answers: Known places. Used both to find the place mentioned
            in the question and to draw a wrong answer. Not mutated.

    Returns:
        ``(correct_answers, incorrect_answers)``, two lists of three strings.
        The last incorrect answer is "The <wrong place>" or, with probability
        0.05, the empty string.
    """
    t4_directions = ["north", "east", "south", "west"]

    direction = [d for d in t4_directions if d in question][0]
    place = [p for p in possible_answers if p in question][0]

    # Exact match on the last word, ignoring the trailing "?" and whitespace.
    ends_with_of = question.rstrip(" ?\n").split(" ")[-1] == "of"

    # NOTE(review): this phrasing has no article before the answer ("... of kitchen").
    # Kept byte-for-byte; confirm whether that is intended.
    place_is_subject = f"The {place} is {direction} of {correct_answer}"
    answer_is_subject = f"The {correct_answer} is {direction} of the {place}"

    if ends_with_of:
        true_statement, false_statement = place_is_subject, answer_is_subject
    else:
        true_statement, false_statement = answer_is_subject, place_is_subject

    correct_answers = [correct_answer, f"The {correct_answer}", true_statement]

    possible_incorrect_answers = list(possible_answers)
    possible_incorrect_answers.remove(correct_answer)
    incorrect_answer = np.random.choice(possible_incorrect_answers, size=1)[0]

    incorrect_answers = [incorrect_answer, false_statement]

    r = np.random.random()

    if r < 0.95:
        incorrect_answers.append(f"The {incorrect_answer}")
    else:
        incorrect_answers.append("")

    return correct_answers, incorrect_answers

In [17]:
def get_prompt(passage, question, answer):
    """Format a passage, a question and a candidate answer as one prompt string.

    The layout is a "Passage:" section, a "Question:" section followed by a
    blank line, and an "Answer:" section containing ``answer``.
    """
    sections = (
        f"Passage:\n{passage}",
        f"Question:\n{question}\n",
        f"Answer:\n{answer}",
    )
    return "\n".join(sections)

In [18]:
def get_babi_df(data):
    """Expand raw bAbI rows (tasks 1-4) into a labelled answer-judging dataset.

    For every row of tasks 1 to 4, the matching ``generate_task_<n>_variations``
    helper produces correct and incorrect answer phrasings. Each phrasing
    becomes one output row, labelled 1 (correct) or 0 (incorrect).

    The object lists for tasks 2 and 3 are sorted. They were previously built
    with ``list(set(...))``, whose order depends on string hashing and can
    differ between interpreter sessions, so the seeded random draws made from
    those lists were not reproducible.

    Args:
        data: DataFrame with at least the columns ``passage``, ``question``,
            ``answer`` and ``task``.

    Returns:
        DataFrame with the columns ``passage``, ``question``, ``answer``,
        ``task_type``, ``prompt`` and ``label``. Rows are ordered by task, then
        by source row, with correct phrasings before incorrect ones.
    """
    passages = []
    questions = []
    answers = []
    prompts = []
    labels = []
    task_types = []

    # Candidate names / objects / answers are derived from the data itself.
    t1_questions = data[data["task"] == 1]["question"].value_counts().index.to_list()
    t1_names = [question.split(" ")[2].split("?")[0] for question in t1_questions]

    t2_questions = data[data["task"] == 2]["question"].value_counts().index.to_list()
    t2_objects = sorted({question.split(" ")[3].split("?")[0] for question in t2_questions})
    t2_possible_answers = data[data["task"] == 2]["answer"].value_counts().index.to_list()

    t3_questions = data[data["task"] == 3]["question"].value_counts().index.to_list()
    t3_objects = sorted({question.split("the ")[1].split(" before")[0] for question in t3_questions})
    t3_possible_answers = data[data["task"] == 3]["answer"].value_counts().index.to_list()

    t4_possible_answers = data[data["task"] == 4]["answer"].value_counts().index.to_list()

    # Built once; iteration follows insertion order, i.e. tasks 1, 2, 3, 4.
    generators = {
        1: partial(generate_task_1_variations, possible_names=t1_names),
        2: partial(generate_task_2_variations, possible_answers=t2_possible_answers, possible_obj=t2_objects),
        3: partial(generate_task_3_variations, possible_answers=t3_possible_answers, possible_obj=t3_objects),
        4: partial(generate_task_4_variations, possible_answers=t4_possible_answers),
    }

    for task, fn in generators.items():
        task_rows = data[data["task"] == task].reset_index(drop=True)
        for _, row in task_rows.iterrows():
            correct_answers, incorrect_answers = fn(row["question"], row["answer"])
            row_answers = list(correct_answers) + list(incorrect_answers)
            n_variations = len(row_answers)

            # Counts come from the generator output so every column stays the
            # same length even if a generator returns a different number of
            # variations.
            passages.extend([row["passage"]] * n_variations)
            questions.extend([row["question"]] * n_variations)
            answers.extend(row_answers)
            labels.extend([1] * len(correct_answers))
            labels.extend([0] * len(incorrect_answers))
            prompts.extend(
                get_prompt(row["passage"], row["question"], answer)
                for answer in row_answers
            )
            task_types.extend([task] * n_variations)

    df = pd.DataFrame({
        "passage": passages,
        "question": questions,
        "answer": answers,
        "task_type": task_types,
        "prompt": prompts,
        "label": labels
    })
    return df
        

In [19]:
# Expand the train split into labelled (prompt, label) rows.
data_train = get_babi_df(babi_train)

In [20]:
# Write the processed train data to disk without the DataFrame index.
data_train.to_csv("../data/processed/babi_data_small_train.csv", index=False)

In [21]:
# Expand the validation split the same way as the train split.
data_val = get_babi_df(babi_val)

In [22]:
# Write the processed validation data to disk without the DataFrame index.
data_val.to_csv("../data/processed/babi_data_small_val.csv", index=False)

# Train Judge

In [8]:
# Training-mode switches; all three are forwarded to train_judge_for_babi below.
# NOTE(review): the comments on int8_training and autocast_training appeared to be
# swapped (the AMP link sat on the int8 flag, "quantized weights" on the autocast flag).
int8_training = True  # Load the model with 8-bit quantized weights (passed as `load_in_8bit` when the model is created)
lora_training = True  # https://github.com/microsoft/LoRA
autocast_training = True  # Automatic mixed precision: https://pytorch.org/blog/accelerating-training-on-nvidia-gpus-with-pytorch-automatic-mixed-precision/ -- original note: only use if your hardware doesn't support int8_training (both are enabled here; confirm intent)

In [10]:
# Load the tokenizer and a GPT-Neo sequence classifier with two output labels.
# The label mappings defined earlier are stored in the model config.
model_name = "xhyi/PT_GPTNEO350_ATG"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPTNeoForSequenceClassification.from_pretrained(model_name, num_labels=2, id2label=id2label, 
            label2id=label2id, load_in_8bit=int8_training, low_cpu_mem_usage=int8_training)

# Only move the model explicitly when it was not loaded in 8-bit.
# NOTE(review): presumably 8-bit loading already places the weights on the GPU - confirm.
if not int8_training:
    model = model.to(device)

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/435 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/561 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.30k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.47G [00:00<?, ?B/s]

Some weights of GPTNeoForSequenceClassification were not initialized from the model checkpoint at xhyi/PT_GPTNEO350_ATG and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# Register a dedicated "<PAD>" token, point the model config at its id, and
# resize the embedding matrix to the new vocabulary size.
tokenizer.add_special_tokens({"pad_token": "<PAD>"})
model.config.pad_token_id = tokenizer.pad_token_id
model.resize_token_embeddings(len(tokenizer))

Embedding(50258, 1024)

In [13]:
# Run identification and checkpoint handling (forwarded to train_judge_for_babi).
run_name = "gpt-neo-350M"
project_name = "bAbi-Judge"
store_locally = False  # Set False if you want to delete any config + checkpoint files in models/ (doesn't delete from subdirectories)

# Optimization hyperparameters.
batch_size = 16
lr = 5e-5
lr_scheduler = None  # "cosine-annealing" | None

# Schedule: number of epochs plus logging / evaluation / checkpoint intervals.
# NOTE(review): the *_every_batch values are presumably counted in batches and
# save_every_epoch in epochs - confirm in train_judge_for_babi.
epochs = 5
acc_every_batch = 250
eval_every_batch = 250
save_every_epoch = 1

In [14]:
# Train the judge classifier with the model, tokenizer and settings defined in
# the preceding cells.
train_judge_for_babi(
    model=model,
    tokenizer=tokenizer,
    model_name=model_name,
    run_name=run_name,
    project_name=project_name,
    device=device,
    lr=lr,
    lr_scheduler=lr_scheduler,
    autocast_training=autocast_training,
    int8_training=int8_training,
    lora_training=lora_training,
    batch_size=batch_size,
    store_locally=store_locally,
    epochs=epochs,
    acc_every_batch=acc_every_batch,
    eval_every_batch=eval_every_batch,
    save_every_epoch=save_every_epoch,
)

Memory used before creating dataloaders: 0.614098944
Memory used after creating dataloaders: 0.614098944
Memory used before optimization: 0.614098944
Memory used after preparing for int8 training: 0.614098944


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Memory used after lora: 0.614098944


[34m[1mwandb[0m: Currently logged in as: [33mfelixahofstaetter[0m ([33mdetecting-and-mitigating-deception[0m). Use [1m`wandb login --relogin`[0m to force relogin


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


In [None]:
import wandb
# Mark the active Weights & Biases run as finished.
wandb.finish()