In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

import sys
import logging
# Silence log output for the whole notebook. The root logger is first raised to
# ERROR; logging.disable(sys.maxsize) then suppresses every logging call whose
# severity is at or below sys.maxsize, which covers all standard levels.
logging.getLogger().setLevel(logging.ERROR)
logging.disable(sys.maxsize)

# NOTE(review): wildcard imports. Names that are used later without an explicit
# import (e.g. DataCollatorWithPadding) presumably arrive through these; a later
# `import *` can silently shadow a name brought in by an earlier one.
from torch.utils.data import *
from transformers import *
# Put the parent directory first on the import path; the project-local modules
# imported next (models, logic, my_datasets, utils) presumably live there.
sys.path.insert(0, "..")

from models import *
from logic import *
from my_datasets import *

from utils import *
import numpy as np
from tqdm import tqdm
import evaluate

from datasets import Dataset
import os

import wandb


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Weights & Biases settings, passed through environment variables.

# W&B project that this run is logged under
os.environ["WANDB_PROJECT"] = "transformer_friends"

# Upload the trained model to W&B.
# This cell used to assign "checkpoint" and then immediately overwrite it with
# "true"; only the last value ever took effect, so the dead assignment (and its
# misleading "log all model checkpoints" comment) has been removed.
# NOTE(review): set this to "checkpoint" instead if every saved checkpoint
# should be uploaded rather than only the final model.
os.environ["WANDB_LOG_MODEL"] = "true"

# turn off watch to log faster
os.environ["WANDB_WATCH"] = "false"

In [3]:
# Experiment configuration shared by the cells below.

# Problem size. Both values are forwarded to the dataset configs as (r, n);
# the W&B run names label n as "vars" and r as "rules".
# Previously tried setting: n, r = 5, 8
n = 20
r = 5

# Numeric dataset parameters forwarded to the dataset configs. Their exact
# meaning is defined by the config classes, which are not visible here.
ap = 0.2
bp = 0.2
tp = 0.4  # used by the QED and one-step configs
sp = 0.1  # used by the autoregressive config

# Extra argument of the autoregressive fixed-steps config
nars = 3

# Split sizes and training length
train_len = 2500
test_len = 500
num_epochs = 10
# test_is_train = True

In [4]:
### Datasets
# Every task gets a train and a test split built from the same hyperparameters;
# the two splits differ only in length and seed (train: 1234, test: 2345).
# For each task both configs are created first, then both datasets.

# --- One-shot QED task ---
qed_train_dataset_config = OneShotQedDatasetConfig(
    r, n, ap, bp, tp,
    dataset_len=train_len,
    seed=1234,
)
qed_test_dataset_config = OneShotQedDatasetConfig(
    r, n, ap, bp, tp,
    dataset_len=test_len,
    seed=2345,
)
qed_train_dataset = OneShotQedEmbedsDataset(qed_train_dataset_config)
qed_test_dataset = OneShotQedEmbedsDataset(qed_test_dataset_config)

# --- One-step successor-state task ---
succ_train_dataset_config = OneStepStateDatasetConfig(
    r, n, ap, bp, tp,
    dataset_len=train_len,
    seed=1234,
)
succ_test_dataset_config = OneStepStateDatasetConfig(
    r, n, ap, bp, tp,
    dataset_len=test_len,
    seed=2345,
)
succ_train_dataset = OneStepStateEmbedsDataset(succ_train_dataset_config)
succ_test_dataset = OneStepStateEmbedsDataset(succ_test_dataset_config)

# --- Autoregressive fixed-steps task (takes sp and nars instead of tp) ---
ars_train_dataset_config = AutoRegFixedStepsDatasetConfig(
    r, n, ap, bp, sp, nars,
    dataset_len=train_len,
    seed=1234,
)
ars_test_dataset_config = AutoRegFixedStepsDatasetConfig(
    r, n, ap, bp, sp, nars,
    dataset_len=test_len,
    seed=2345,
)
ars_train_dataset = AutoRegFixedStepsEmbedsDataset(ars_train_dataset_config)
ars_test_dataset = AutoRegFixedStepsEmbedsDataset(ars_test_dataset_config)

In [5]:
def stringify_rule(rule, var_sep_token):
    """
    Render one rule vector as text of the form ``xi , xj , ... -> xa``.

    ``rule`` is read as two halves of equal length (``len(rule) // 2`` each):
    the first half flags the antecedent variables, the second half flags the
    consequent variables. Position ``k`` inside a half maps to the name ``xk``.
    A half in which nothing is flagged is written as the word ``empty``.

    Args:
        rule: indexable sequence of truthy/falsy flags, laid out as
            ``[<ants>, <cons>]``.
        var_sep_token: separator placed between variable names on each side.

    Returns:
        The rule as a string, with the two sides joined by ``" -> "``.
    """
    half = len(rule) // 2

    def flagged_names(offset):
        # Names of the variables flagged in the half that starts at `offset`.
        picked = []
        for k in range(half):
            if rule[offset + k]:
                picked.append(f"x{k}")
        return picked if picked else ["empty"]

    antecedent_text = var_sep_token.join(flagged_names(0))
    consequent_text = var_sep_token.join(flagged_names(half))
    return f"{antecedent_text} -> {consequent_text}"

def get_string_rep(dataset_item):
    """
    Render one dataset item as a single line of space-separated tokens:

        [RULES_START] [RULE_START] ... [RULE_END] ... [RULES_END] [THEOREM_START] ... [THEOREM_END] [QED]

    Args:
        dataset_item: mapping with a ``"rules"`` entry (an iterable of rule
            vectors, each rendered by ``stringify_rule``) and a ``"theorem"``
            entry (an indexable sequence of truthy/falsy flags, one per
            variable).

    Returns:
        The string representation. Tokens are always separated by exactly one
        space. A theorem with no flagged variable is written as ``empty``, the
        same placeholder ``stringify_rule`` uses for an empty rule side
        (previously this produced two consecutive spaces and no token). An
        empty rule list yields ``[RULES_START] [RULES_END]``.
    """

    # Define the placeholder tokens
    var_sep_token = " , "
    rules_start = "[RULES_START]"
    rules_end = "[RULES_END]"
    rule_start = "[RULE_START]"
    rule_end = "[RULE_END]"
    theorem_start = "[THEOREM_START]"
    theorem_end = "[THEOREM_END]"
    qed = "[QED]"

    rules = dataset_item["rules"]
    theorem = dataset_item["theorem"]

    # Each rule contributes three tokens: start marker, rule text, end marker.
    rule_tokens = []
    for rule in rules:
        rule_tokens += [rule_start, stringify_rule(rule, var_sep_token), rule_end]

    # Variables flagged in the theorem; fall back to the shared placeholder so
    # the theorem section never collapses to an empty string.
    theorem_vars = [f"x{i}" for i in range(len(theorem)) if theorem[i]]
    if not theorem_vars:
        theorem_vars = ["empty"]
    theorem_str = var_sep_token.join(theorem_vars)

    # Joining a flat token list guarantees single-space separation throughout.
    tokens = [
        rules_start, *rule_tokens, rules_end,
        theorem_start, theorem_str, theorem_end,
        qed,
    ]
    return " ".join(tokens)


In [6]:
# Inspect one training example: the raw item followed by its string rendering.
# The dataset is indexed once and the item reused, so both printouts are
# guaranteed to describe the same sample and the item is not built twice.
sample_item = qed_train_dataset[0]
print(sample_item)
print(get_string_rep(sample_item))

{'rules': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
         0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
        [0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
         0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
        [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0,
         1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]]), 'theorem': tensor([1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0]), 'labels': tensor(0)}
[RULES_START] [RULE_START] empty -> x8 , x9 [RULE_END] [RULE_START] empty -> x0 , x3 , x6 , x7 , x13 [RULE_END] [RULE_START] x1 , x7 , x16 -> x0 , x5 , x14 [RULE_END] [RULE_START] x3 , x12 , 

In [7]:
# Create HuggingFace datasets for the QED task

def _texts_and_labels(dataset):
    """
    Walk ``dataset`` once and return ``(texts, labels)``.

    Each item is fetched a single time and used for both its string
    representation and its label. The earlier version indexed the dataset in
    two separate passes, which doubled the work and only kept texts and labels
    aligned if indexing returned the same item every time.

    Args:
        dataset: indexable dataset with ``len()`` whose items are accepted by
            ``get_string_rep`` and carry a ``"labels"`` entry exposing
            ``.item()``.

    Returns:
        Tuple ``(texts, labels)`` of two equally long lists.
    """
    texts, labels = [], []
    for i in tqdm(range(len(dataset))):
        item = dataset[i]
        texts.append(get_string_rep(item))
        labels.append(item["labels"].item())
    return texts, labels


train_data, train_labels = _texts_and_labels(qed_train_dataset)

print("Creating train dataset")
qed_train_hf_dataset = Dataset.from_dict({
    "data": train_data,
    "label": train_labels
}).with_format("torch")

test_data, test_labels = _texts_and_labels(qed_test_dataset)

print("Creating test dataset")
qed_test_hf_dataset = Dataset.from_dict({
    "data": test_data,
    "label": test_labels
}).with_format("torch")

100%|██████████| 2500/2500 [00:05<00:00, 478.58it/s]
100%|██████████| 2500/2500 [00:03<00:00, 659.22it/s]


Creating train dataset


100%|██████████| 500/500 [00:01<00:00, 480.69it/s]
100%|██████████| 500/500 [00:00<00:00, 665.75it/s]

Creating test dataset





In [8]:
# Get the GPT-2 tokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
# Reuse the end-of-sequence token as the padding token so that batches can be
# padded. NOTE(review): presumably needed because the stock GPT-2 tokenizer
# defines no pad token of its own — confirm.
tokenizer.pad_token = tokenizer.eos_token

In [9]:
def tokenize_function(item):
    """Run the notebook's tokenizer over the "data" field with truncation enabled."""
    texts = item["data"]
    return tokenizer(texts, truncation=True)

# Tokenize both splits in batched mode (train first, then test).
qed_train_tokenized_dataset, qed_test_tokenized_dataset = [
    split.map(tokenize_function, batched=True)
    for split in (qed_train_hf_dataset, qed_test_hf_dataset)
]

# Padding collator built on the same tokenizer; handed to the Trainer later.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map: 100%|██████████| 2500/2500 [00:00<00:00, 6438.25 examples/s]
Map: 100%|██████████| 500/500 [00:00<00:00, 6614.27 examples/s]


In [10]:
# Load pretrained GPT-2 as a sequence classifier with two output labels.
# NOTE(review): no seed is set before this call; if the classification head is
# freshly initialised here, its weights will differ between kernel restarts.
model = AutoModelForSequenceClassification.from_pretrained(
    "gpt2", num_labels=2
)
# Mirror the tokenizer's pad token id in the model config.
# NOTE(review): presumably required so the classifier can locate the last
# non-padding token in padded batches — confirm against the GPT-2 model docs.
model.config.pad_token_id = tokenizer.pad_token_id

In [11]:
# Load the "accuracy" metric once at module level; compute_metrics below reuses
# this object on every call.
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """
    Turn raw evaluation outputs into the metrics reported for each evaluation.

    Args:
        eval_pred: pair ``(predictions, labels)``; ``predictions`` holds one
            row of per-class scores per example, and the predicted class is
            the argmax along axis 1.

    Returns:
        dict with
          * ``"Accuracy"`` - the ``"accuracy"`` value returned by the
            module-level ``accuracy`` metric,
          * ``"Avg Ones"`` - the mean of the predicted class ids (for two
            classes, the fraction of examples predicted as class 1).
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    accuracy_result = accuracy.compute(
        predictions=predicted_classes,
        references=references,
    )
    return {
        "Accuracy": accuracy_result["accuracy"],
        "Avg Ones": np.mean(predicted_classes),
    }

In [12]:
# Training configuration: evaluate and checkpoint once per epoch, reload the
# best checkpoint when training ends, and report metrics to W&B.
training_args = TrainingArguments(
    output_dir="gpt2_string_results",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    logging_steps=5,
    report_to="wandb",
    # The run name is derived from the experiment settings instead of being
    # edited by hand for every configuration, so it cannot drift out of sync
    # with the data actually used. For n=20, r=5, train_len=2500, test_len=500
    # it evaluates to
    # "gpt2-qed-str-tokenizer_default-vars_20-rules_5-train_2500-test_500".
    run_name=(
        f"gpt2-qed-str-tokenizer_default-vars_{n}-rules_{r}"
        f"-train_{train_len}-test_{test_len}"
    ),
)

# Trainer for the QED task: trains on the tokenized train split, evaluates on
# the tokenized test split, pads batches with data_collator, and reports the
# metrics produced by compute_metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=qed_train_tokenized_dataset,
    eval_dataset=qed_test_tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [13]:
# Fine-tune the model. Per the TrainingArguments above, loss is logged every
# 5 steps and evaluation plus checkpointing run once per epoch.
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33makhare[0m ([33mtransformer_friends[0m). Use [1m`wandb login --relogin`[0m to force relogin




{'loss': 2.3253, 'learning_rate': 1.9873417721518987e-05, 'epoch': 0.06}
{'loss': 0.2685, 'learning_rate': 1.974683544303798e-05, 'epoch': 0.13}


In [14]:
# Mark the active W&B run as finished now that training is done.
wandb.finish()

