In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import AutoModelForCausalLM, DataCollatorForLanguageModeling
import sys
import logging
logging.getLogger().setLevel(logging.ERROR)
logging.disable(sys.maxsize)

from torch.utils.data import *
from transformers import *
sys.path.insert(0, "..")

from models import *
from logic import *
from my_datasets import *

# from utils import *
import numpy as np
from tqdm import tqdm
import evaluate

from datasets import Dataset
import os

import wandb


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Weights & Biases project that this run is logged under.
os.environ["WANDB_PROJECT"] = "transformer_friends"

# Save the trained model checkpoint to W&B.
# This variable must be set exactly once: a later assignment silently replaces
# an earlier one. Use "checkpoint" here instead to log all model checkpoints.
os.environ["WANDB_LOG_MODEL"] = "true"

# Turn off watch to log faster.
os.environ["WANDB_WATCH"] = "false"

In [3]:
# Problem size: `n` is passed to the dataset as num_vars, `r` as num_rules.
# Settings tried so far: (n, r) = (5, 8) and (20, 5).
n = 5
r = 8

# Sampling probabilities handed to the dataset constructor:
#   ap -> ante_prob, bp -> conseq_prob, sp -> state_prob.
# `tp` is not referenced by any cell visible in this notebook.
ap = 0.2
bp = 0.2
tp = 0.4
sp = 0.1

# Passed to the dataset as num_steps.
nars = 3

# Split sizes and training schedule.
train_len, test_len = 2500, 500
num_epochs = 10
seed = 42
# test_is_train = True

In [4]:
# Both splits share one generator configuration; only the number of samples
# differs between them.
# NOTE(review): train and eval are built with the same `seed`. If the dataset
# derives its samples deterministically from the seed, the eval split may
# overlap the training split -- TODO confirm against AutoRegKStepsEmbedsDataset.
shared_dataset_kwargs = dict(
    num_rules=r,
    num_vars=n,
    num_steps=nars,
    ante_prob=ap,
    conseq_prob=bp,
    state_prob=sp,
    seed=seed,
)

train_dataset = AutoRegKStepsEmbedsDataset(dataset_len=train_len, **shared_dataset_kwargs)

eval_dataset = AutoRegKStepsEmbedsDataset(dataset_len=test_len, **shared_dataset_kwargs)

In [5]:
# Look at one raw sample: a dict with a 'rules' matrix, the initial 'state'
# vector, and 'labels' holding one row per autoregressive step.
train_dataset[0]

{'rules': tensor([[0, 1, 1, 0, 1, 0, 0, 0, 0, 0],
         [0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 1, 0, 0, 0, 0, 0, 0, 1],
         [0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
         [0, 0, 0, 0, 1, 0, 0, 1, 1, 0],
         [0, 1, 0, 1, 1, 0, 1, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
         [1, 0, 0, 0, 0, 1, 1, 1, 1, 0]]),
 'state': tensor([0, 0, 0, 0, 0]),
 'labels': tensor([[0, 0, 1, 0, 0],
         [0, 0, 1, 0, 1],
         [0, 0, 1, 1, 1]])}

In [6]:
def stringify_rule(rule, var_sep_token):
    """
    Render one rule vector as text of the form "xi , xj , ... -> xa , xb".

    The first half of `rule` flags the antecedent variables and the second
    half flags the consequent variables (layout [<ants>, <cons>]). Flagged
    positions become "x<index>" names joined by `var_sep_token`; a side with
    no flagged variable is written as the literal word "empty".
    """
    half = len(rule) // 2

    def render_side(offset):
        # Collect the names of the variables flagged in this half of the rule.
        names = [f"x{k}" for k in range(half) if rule[offset + k]]
        return var_sep_token.join(names or ["empty"])

    return render_side(0) + " -> " + render_side(half)

def get_string_rep_replace(dataset_item):
    """
    Build the text form of a single-step ("replace") example.

    Concatenating the returned prompt and target gives a string of the form:
    [RULES_START] [RULE_START] ... [RULE_END] ... [RULES_END]
    [CURRENT_STATE_START] ... [CURRENT_STATE_END]
    [NEXT_STATE_START] ... [NEXT_STATE_END]

    Args:
        dataset_item: mapping with the keys
            "rules"  - iterable of rule vectors accepted by `stringify_rule`,
            "state"  - indicator vector of the variables that currently hold,
            "labels" - sequence of successor states; only the first is used.

    Returns:
        dict with three string entries:
            "prompt" - everything up to and including [NEXT_STATE_START],
            "target" - the next state followed by [NEXT_STATE_END]
                       (starts with a single space),
            "stop"   - the marker that ends generation, [NEXT_STATE_END].
    """

    var_sep_token = " , "
    rules_start = "[RULES_START]"
    rules_end = "[RULES_END]"
    rule_start = "[RULE_START]"
    rule_end = "[RULE_END]"
    current_state_start = "[CURRENT_STATE_START]"
    current_state_end = "[CURRENT_STATE_END]"
    next_state_start = "[NEXT_STATE_START]"
    next_state_end = "[NEXT_STATE_END]"

    rules = dataset_item["rules"]
    current_state = dataset_item["state"]
    next_state = dataset_item["labels"][0]

    n_vars = len(current_state)

    rule_strs = [rule_start + " " + stringify_rule(rule, var_sep_token) + " " + rule_end for rule in rules]
    rules_str = rules_start + " " + " ".join(rule_strs) + " " + rules_end

    # NOTE: a state with no true variable renders as an empty string, so the
    # start and end markers end up separated by two spaces.
    current_state_str = var_sep_token.join([f"x{i}" for i in range(n_vars) if current_state[i]])
    current_state_str = current_state_start + " " + current_state_str + " " + current_state_end

    next_state_str = var_sep_token.join([f"x{i}" for i in range(n_vars) if next_state[i]])

    return {
        "prompt": rules_str + " " + current_state_str + " " + next_state_start,
        "target": " " + next_state_str + " " + next_state_end,
        "stop": next_state_end
    }

def get_string_rep_append(dataset_item):
    """
    Build the text form of a multi-step ("append") example.

    Concatenating the returned prompt and target gives a string of the form:
    [RULES_START] [RULE_START] ... [RULE_END] ... [RULES_END]
    [STATES_START] [STATE_START] ... [STATE_END] ... [STATES_END]

    Args:
        dataset_item: mapping with the keys
            "rules"  - iterable of rule vectors accepted by `stringify_rule`,
            "state"  - indicator vector of the variables that initially hold,
            "labels" - sequence of successor states, one per step.

    Returns:
        dict with three string entries:
            "prompt" - rules, the initial state, and the opening [STATE_START]
                       of the first successor state,
            "target" - the successor states followed by [STATES_END]
                       (starts with a single space),
            "stop"   - the marker that ends generation, [STATES_END].
    """

    var_sep_token = " , "
    rules_start = "[RULES_START]"
    rules_end = "[RULES_END]"
    rule_start = "[RULE_START]"
    rule_end = "[RULE_END]"
    states_start = "[STATES_START]"
    states_end = "[STATES_END]"
    state_start = "[STATE_START]"
    state_end = "[STATE_END]"

    rules = dataset_item["rules"]
    state = dataset_item["state"]
    next_states = dataset_item["labels"]

    n_vars = len(state)

    def true_vars(assignment):
        # Names of the variables flagged in `assignment`, joined by the separator.
        return var_sep_token.join([f"x{i}" for i in range(n_vars) if assignment[i]])

    rule_strs = [rule_start + " " + stringify_rule(rule, var_sep_token) + " " + rule_end for rule in rules]
    rules_str = rules_start + " " + " ".join(rule_strs) + " " + rules_end

    # NOTE: a state with no true variable renders as an empty string, so the
    # start and end markers end up separated by two spaces.
    state_str = state_start + " " + true_vars(state) + " " + state_end

    next_state_strs = " ".join(
        [state_start + " " + true_vars(next_state) + " " + state_end for next_state in next_states]
    )
    # The prompt already ends with the first [STATE_START], so drop that
    # marker (and the space after it) from the front of the target.
    next_state_strs = next_state_strs[len(state_start)+1:]

    return {
        "prompt": rules_str + " " + states_start + " " + state_str + " " + state_start,
        "target": " " + next_state_strs + " " + states_end,
        "stop": states_end
    }


In [7]:
# Sanity check: print the raw sample next to its two text renderings
# (single-step "replace" format and multi-step "append" format).
print(train_dataset[0])
print(get_string_rep_replace(train_dataset[0]))
print(get_string_rep_append(train_dataset[0]))

{'rules': tensor([[0, 1, 1, 0, 1, 0, 0, 0, 0, 0],
        [0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 1, 0, 0, 0, 0, 0, 0, 1],
        [0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
        [0, 0, 0, 0, 1, 0, 0, 1, 1, 0],
        [0, 1, 0, 1, 1, 0, 1, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
        [1, 0, 0, 0, 0, 1, 1, 1, 1, 0]]), 'state': tensor([0, 0, 0, 0, 0]), 'labels': tensor([[0, 0, 1, 0, 0],
        [0, 0, 1, 0, 1],
        [0, 0, 1, 1, 1]])}
{'prompt': '[RULES_START] [RULE_START] x1 , x2 , x4 -> empty [RULE_END] [RULE_START] x1 -> empty [RULE_END] [RULE_START] x2 -> x4 [RULE_END] [RULE_START] empty -> x2 [RULE_END] [RULE_START] x4 -> x2 , x3 [RULE_END] [RULE_START] x1 , x3 , x4 -> x1 [RULE_END] [RULE_START] empty -> x2 [RULE_END] [RULE_START] x0 -> x0 , x1 , x2 , x3 [RULE_END] [RULES_END] [CURRENT_STATE_START]  [CURRENT_STATE_END] [NEXT_STATE_START]', 'target': ' x2 [NEXT_STATE_END]', 'stop': '[NEXT_STATE_END]'}
{'prompt': '[RULES_START] [RULE_START] x1 , x2 , x4 -> empty [RULE_

In [8]:
# Create HuggingFace datasets for the append task.
# Every example is one "data" string: the prompt immediately followed by its
# target. (A separate prompt/label column layout was tried and is not used.)

print("Creating train dataset")
train_data = [get_string_rep_append(train_dataset[idx]) for idx in tqdm(range(len(train_dataset)))]
train_texts = [example['prompt'] + example['target'] for example in train_data]
train_hf_dataset = Dataset.from_dict({"data": train_texts}).with_format("torch")

print("Creating test dataset")
test_data = [get_string_rep_append(eval_dataset[idx]) for idx in tqdm(range(len(eval_dataset)))]
test_texts = [example['prompt'] + example['target'] for example in test_data]
test_hf_dataset = Dataset.from_dict({"data": test_texts}).with_format("torch")

Creating train dataset


100%|██████████| 2500/2500 [00:03<00:00, 801.00it/s]


Creating test dataset


100%|██████████| 500/500 [00:00<00:00, 811.18it/s]


In [9]:
# Get the GPT-2 tokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
# Reuse the end-of-sequence token as the padding token
# (presumably because this tokenizer has no dedicated pad token -- confirm).
tokenizer.pad_token = tokenizer.eos_token

In [10]:
def tokenize_function(item):
    """Tokenize the "data" text column of `item` with truncation enabled."""
    texts = item["data"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

# Tokenize both splits in batched mode.
train_tokenized_dataset = train_hf_dataset.map(tokenize_function, batched=True)
test_tokenized_dataset = test_hf_dataset.map(tokenize_function, batched=True)

# mlm=False selects the causal (non-masked) language-modeling collator.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

Map: 100%|██████████| 2500/2500 [00:00<00:00, 5437.59 examples/s]
Map: 100%|██████████| 500/500 [00:00<00:00, 6420.93 examples/s]


In [11]:
# Create the model
# Pretrained GPT-2 causal language model; pad_token_id is set to the EOS id so
# it agrees with the tokenizer, whose pad token is its EOS token.
model = AutoModelForCausalLM.from_pretrained("gpt2", pad_token_id=tokenizer.eos_token_id)

In [12]:
import inspect

In [13]:
# Exploratory check: list the keyword arguments that model.forward accepts
# (input_ids, attention_mask, labels, ...).
inspect.signature(model.forward)

<Signature (input_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, attention_mask: Optional[torch.FloatTensor] = None, token_type_ids: Optional[torch.LongTensor] = None, position_ids: Optional[torch.LongTensor] = None, head_mask: Optional[torch.FloatTensor] = None, inputs_embeds: Optional[torch.FloatTensor] = None, encoder_hidden_states: Optional[torch.Tensor] = None, encoder_attention_mask: Optional[torch.FloatTensor] = None, labels: Optional[torch.LongTensor] = None, use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None) -> Union[Tuple, transformers.modeling_outputs.CausalLMOutputWithCrossAttentions]>

In [14]:
# Load the "accuracy" metric through the `evaluate` library.
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """
    Token-level accuracy for `Trainer` evaluation.

    Args:
        eval_pred: a `(predictions, labels)` pair. `predictions` may be raw
            logits (one more trailing axis than `labels`), already-decoded ids
            (same shape as `labels`), or a tuple whose first element is one of
            those.

    Returns:
        {"accuracy": float} - the fraction of scored label positions that were
        predicted correctly, or 0.0 when there is nothing to score.

    Notes:
        * Label positions equal to -100 are excluded from the score.
        * When labels have a sequence axis (2-D or more) the inputs are treated
          as causal-LM outputs: the prediction at position t is compared with
          the label at position t + 1.
        * Handing full logits to this function makes the Trainer keep
          (examples x seq_len x vocab) values in memory; passing
          `preprocess_logits_for_metrics=lambda logits, labels: logits.argmax(-1)`
          to the Trainer avoids that, and this function accepts the result.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        # Some models return extra outputs after the logits; keep the logits.
        predictions = predictions[0]
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)

    # Raw scores -> predicted ids.
    if predictions.ndim == labels.ndim + 1:
        predictions = predictions.argmax(axis=-1)

    # Align next-token predictions with the tokens they are predicting.
    if labels.ndim >= 2:
        predictions = predictions[..., :-1]
        labels = labels[..., 1:]

    scored = labels != -100
    n_scored = int(scored.sum())
    if n_scored == 0:
        return {"accuracy": 0.0}

    n_correct = int((predictions[scored] == labels[scored]).sum())
    return {"accuracy": n_correct / n_scored}

In [15]:
# Training configuration. Evaluation and checkpointing both happen once per
# epoch, and the best checkpoint is reloaded when training ends.
training_args = TrainingArguments(
    output_dir="gpt2_string_auto_reg_results",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    logging_steps=5,
    report_to="wandb",
    # Use the notebook's configured seed for training as well.
    seed=seed,
    # Derive the run name from the config so it cannot drift from the values
    # actually used when n / r / train_len / test_len are changed.
    run_name=f"gpt2-autoreg-str-tokenizer_default-vars_{n}-rules_{r}-train_{train_len}-test_{test_len}",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized_dataset,
    eval_dataset=test_tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    # compute_metrics=compute_metrics,
)

In [16]:
# Launch fine-tuning.
# NOTE(review): the recorded run of this cell ended in a CUDA out-of-memory
# error while another process was also using the GPU; consider a smaller
# per-device batch size or shorter sequences before re-running.
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33makhare[0m ([33mtransformer_friends[0m). Use [1m`wandb login --relogin`[0m to force relogin




OutOfMemoryError: CUDA out of memory. Tried to allocate 1.64 GiB. GPU 0 has a total capacty of 10.75 GiB of which 229.62 MiB is free. Process 3504725 has 2.64 GiB memory in use. Including non-PyTorch memory, this process has 7.88 GiB memory in use. Of the allocated memory 5.37 GiB is allocated by PyTorch, and 1.70 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [33]:
# Mark the current W&B run as finished.
# NOTE(review): this cell's execution count (33) is out of sequence with the
# cells above, so the notebook was not run top to bottom on a fresh kernel.
wandb.finish()

