In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig, get_scheduler
from model.equation_interpreter import Equation
from torch.utils.data import IterableDataset, DataLoader
from argparse import Namespace
from typing import List
import linecache
import json
import os

import torch
import torch.nn.functional as F
from datasets import load_dataset
from accelerate import Accelerator
from huggingface_hub import Repository
from torch.optim import AdamW
from tqdm import tqdm

In [2]:
# Domestic imports
from model.tokens import TOKEN_TYPE_ANSWERS, TOKEN_TYPE_EXPRESSIONS, Token
from model.equation_interpreter import Equation, EquationLexer
from model.vocabulary import Vocabulary

In [3]:
model_ckpt = "JustSumAI2"
org = "Dragonoverlord3000"
model_id = f"{org}/{model_ckpt}"

# Token Time
We update the vocabulary of GPT-2 with our

In [4]:
def expression2tokens(expression):
    """
    Example:
        '{{}, {-2, -2, -10, -8, -9, -8, -10, -5, -6, -4}}\n' ---> ['#', '/', '-2', '-2', '-10', '-8', '-9', '-8', '-10', '-5', '-6', '-4']

    """
    LHS, RHS = expression.split("}, {")
    LHS, RHS = LHS.lstrip("{"), RHS.rstrip("\n").rstrip("}")
    if len(LHS) == 0:
        LHS = ["#"]
    else:
        LHS = LHS.split(", ")
    RHS = RHS.split(", ")    
    return LHS + ["/"] + RHS
expression2tokens('{{}, {-2, -2, -10, -8, -9, -8, -10, -5, -6, -4}}\n')

['#', '/', '-2', '-2', '-10', '-8', '-9', '-8', '-10', '-5', '-6', '-4']

In [5]:
def answer2tokens(answer):
    """
    Example:
        '(571057069 - 57859200*Pi^2)/1365590016000\n' ---> [Token(TT_INTEGER), Token(TT_INTEGER), Token(TT_PI), Token(TT_INTEGER), Token(TT_POW), Token(TT_MULTIPLY), Token(TT_MINUS), Token(TT_INTEGER), Token(TT_DIVIDE)]
    """
    equation = Equation.makeEquationFromString(answer)
    equation.convertToPostfix()
    if equation.notation != "postfix": return None
    return [token.t_type for token in equation.tokenized_equation]
answer2tokens('(571057069 - 57859200*Pi^2)/1365590016000\n')

['TT_INTEGER',
 'TT_INTEGER',
 'TT_PI',
 'TT_INTEGER',
 'TT_POW',
 'TT_MULTIPLY',
 'TT_MINUS',
 'TT_INTEGER',
 'TT_DIVIDE']

In [6]:
vocabulary = Vocabulary.construct_from_list(TOKEN_TYPE_EXPRESSIONS + TOKEN_TYPE_ANSWERS)
base_vocab = (list(vocabulary.token2index.keys())[:-5] + ["[SEP]"])
base_vocab.remove("TT_RATIONAL")
base_vocab.remove("TT_VARIABLE")
base_vocab, len(base_vocab)

(['-10',
  '-9',
  '-8',
  '-7',
  '-6',
  '-5',
  '-4',
  '-3',
  '-2',
  '-1',
  '0',
  '1',
  '2',
  '3',
  '4',
  '5',
  '6',
  '7',
  '8',
  '9',
  '10',
  '/',
  '#',
  'TT_LEFT_PARENTHESIS',
  'TT_RIGHT_PARENTHESIS',
  'TT_PI',
  'TT_E',
  'TT_PHI',
  'TT_CATALAN',
  'TT_EULERGAMMA',
  'TT_INTEGER',
  'TT_ZERO',
  'TT_ONE',
  'TT_SQRT',
  'TT_SIN',
  'TT_COS',
  'TT_TAN',
  'TT_LOG',
  'TT_PLUS',
  'TT_MINUS',
  'TT_MULTIPLY',
  'TT_DIVIDE',
  'TT_POW',
  '[SEP]'],
 44)

In [7]:
def dummy_iterator():
    yield []

In [8]:
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer = tokenizer.train_new_from_iterator(
    dummy_iterator(),
    vocab_size=257 # Minimum allowed size
) # Just a way to initialize a new tokenizer of the same 'type' meant for GPT2
tokenizer.add_tokens(base_vocab)
print(tokenizer)

GPT2TokenizerFast(name_or_path='gpt2', vocab_size=257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'})


In [9]:
# Sanity check
assert len([token for token in tokenizer.get_vocab() if token in base_vocab]) == len(base_vocab)

In [10]:
test_encoded = tokenizer.encode("TT_INTEGER TT_PLUS TT_PI")
test_encoded, tokenizer.convert_ids_to_tokens(test_encoded)

([275, 221, 283, 221, 270], ['TT_INTEGER', 'Ġ', 'TT_PLUS', 'Ġ', 'TT_PI'])

In [11]:
tokenizer.save_pretrained(model_id, push_to_hub=True)

('Dragonoverlord3000/JustSumAI2\\tokenizer_config.json',
 'Dragonoverlord3000/JustSumAI2\\special_tokens_map.json',
 'Dragonoverlord3000/JustSumAI2\\vocab.json',
 'Dragonoverlord3000/JustSumAI2\\merges.txt',
 'Dragonoverlord3000/JustSumAI2\\added_tokens.json',
 'Dragonoverlord3000/JustSumAI2\\tokenizer.json')

# Model Time

In [12]:
config = AutoConfig.from_pretrained("gpt2", vocab_size=len(tokenizer))
model = AutoModelForCausalLM.from_config(config)
print(f"GPT-2 Number of parameters: {model.num_parameters()/1_000_000:.2f}M")

GPT-2 Number of parameters: 86.06M


In [13]:
model.save_pretrained(model_id, push_to_hub=True)
print(model)

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

pytorch_model.bin:   0%|          | 0.00/357M [00:00<?, ?B/s]

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(289, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=289, bias=False)
)


# Dataset

In [14]:
config = {
    "answers_dir": "./data/answers-1000.txt",
    "expressions_dir": "./data/expressions-1000.txt",
    "train_batch_size": 8,
    "valid_batch_size": 8,
    "weight_decay": 0.1,
    "shuffle_buffer": 1000,
    "learning_rate": 2e-4,
    "lr_scheduler_type": "cosine",
    "num_warmup_steps": 750,
    "gradient_accumulation_steps": 16,
    "max_train_steps": 50_000,
    "max_eval_steps": -1,
    "seq_length": 1024, # We use a buffer to fill out the seq_length
    "seed": 628,
    "save_checkpoint_steps": 50_000,
    "save_dir": "./models/JustSumAI",
    "model_name": model_ckpt,
    "num_epochs": 100
}
args = Namespace(**config)

In [15]:
class SumDataset(IterableDataset):
    def __init__(self, tokenizer, answers_dir=args.answers_dir, expressions_dir=args.expressions_dir, seq_length=args.seq_length) -> None:
        """
        Args:
            answers_dir (str): directory to file containing string equations e.g. '(571057069 - 57859200*Pi^2)/1365590016000\n'
            expressions_dir (str): directory to file containing expressions e.g. '{{}, {-2, -2, -10, -8, -9, -8, -10, -5, -6, -4}}\n'
            
        Note: Initializatio of the dataset might take a few seconds
        """
        self.tokenizer = tokenizer
        self.concat_token_id = tokenizer.eos_token_id
        self.input_characters = seq_length * 3.6 * 1024
        
        self.expressions_dir = expressions_dir
        self.answers_dir = answers_dir
        self.seq_length = seq_length
        with open(expressions_dir, "r") as fe:
            self.dataset_size = len(fe.read().split("\n"))
        fe.close()
        self.dataset = []
        for line_num in range(1,self.dataset_size):
            LHS, RHS = linecache.getline(expressions_dir, line_num), linecache.getline(answers_dir, line_num)
            try:
                LHS = expression2tokens(LHS)
                RHS = answer2tokens(RHS)
                if LHS and RHS:
                    self.dataset.append(" ".join(LHS + ["[SEP]"] + RHS))
            except:
                pass
    
    def __iter__(self) -> torch.Tensor:
        iterator = iter(self.dataset)
        # This is an infinite generator over the dataset
        while True:
            buffer, buffer_len = [], 0
            while True:
                if buffer_len >= self.input_characters:
                    m=f"Buffer full: {buffer_len}>={self.input_characters:.0f}"
                    print(m)
                    break
                try:
                    m=f"Fill buffer: {buffer_len}<{self.input_characters:.0f}"
                    print(m)
                    buffer.append(next(iterator))
                    buffer_len += len(buffer[-1])
                except StopIteration:
                    iterator = iter(self.dataset)

            all_token_ids = []
            tokenized_inputs = self.tokenizer(buffer, truncation=False)
            for tokenized_input in tokenized_inputs['input_ids']:
                all_token_ids.extend(tokenized_input + [self.concat_token_id])
            
            for i in range(0, len(all_token_ids), self.seq_length):
                input_ids = all_token_ids[i : i + self.seq_length]
                if len(input_ids) == self.seq_length:
                    yield torch.tensor(input_ids)
        
    
    def __len__(self) -> int:
        return self.dataset_size

In [16]:
constant_length_dataset = SumDataset(tokenizer)
constant_length_dataset.dataset

Unknown token encountered: [PolyGamma]
ERROR unrecognised token: $,  position: 0
Unknown token encountered: [PolyGamma]
ERROR unrecognised token: $,  position: 0
ERROR unrecognised token: $,  position: 0
ERROR unrecognised token: $,  position: 0
ERROR unrecognised token: $,  position: 0
ERROR unrecognised token: $,  position: 0
ERROR unrecognised token: $,  position: 0
Unknown token encountered: [PolyGamma]
ERROR unrecognised token: $,  position: 0
ERROR unrecognised token: $,  position: 0
ERROR unrecognised token: $,  position: 0
ERROR unrecognised token: $,  position: 0
ERROR unrecognised token: $,  position: 0
ERROR unrecognised token: $,  position: 0
ERROR unrecognised token: $,  position: 0
ERROR unrecognised token: $,  position: 0
ERROR unrecognised token: $,  position: 0
ERROR unrecognised token: $,  position: 0
Unknown token encountered: [PolyGamma]
ERROR unrecognised token: $,  position: 0
ERROR unrecognised token: $,  position: 0
ERROR unrecognised token: $,  position: 0
ERRO

ERROR unrecognised token: $,  position: 0
ERROR unrecognised token: $,  position: 0
Unknown token encountered: [PolyGamma]
ERROR unrecognised token: $,  position: 0
Unknown token encountered: [PolyGamma]
ERROR unrecognised token: $,  position: 0
ERROR unrecognised token: $,  position: 0
ERROR unrecognised token: $,  position: 0
ERROR unrecognised token: $,  position: 0
ERROR unrecognised token: $,  position: 0
ERROR unrecognised token: $,  position: 0
ERROR unrecognised token: $,  position: 0
ERROR unrecognised token: $,  position: 0
ERROR unrecognised token: $,  position: 0
ERROR unrecognised token: $,  position: 0
ERROR unrecognised token: $,  position: 0
ERROR unrecognised token: $,  position: 0
Unknown token encountered: [PolyGamma]
ERROR unrecognised token: $,  position: 0
ERROR unrecognised token: $,  position: 0
ERROR unrecognised token: $,  position: 0
ERROR unrecognised token: $,  position: 0
ERROR unrecognised token: $,  position: 0
ERROR unrecognised token: $,  position: 0
E

['-1 / 3/5 1/3 1/4 -5 -2 2/3 2/5 [SEP] TT_ZERO TT_INTEGER TT_INTEGER TT_INTEGER TT_DIVIDE TT_EULERGAMMA TT_MINUS TT_INTEGER TT_DIVIDE TT_ZERO TT_INTEGER TT_INTEGER TT_DIVIDE TT_MINUS TT_EULERGAMMA TT_PLUS TT_INTEGER TT_DIVIDE TT_PLUS TT_INTEGER TT_EULERGAMMA TT_MULTIPLY TT_INTEGER TT_DIVIDE TT_PLUS TT_INTEGER TT_PI TT_MULTIPLY TT_INTEGER TT_DIVIDE TT_PLUS TT_INTEGER TT_INTEGER TT_SQRT TT_MULTIPLY TT_PI TT_MULTIPLY TT_INTEGER TT_DIVIDE TT_MINUS TT_INTEGER TT_INTEGER TT_INTEGER TT_INTEGER TT_DIVIDE TT_INTEGER TT_SQRT TT_INTEGER TT_DIVIDE TT_PLUS TT_DIVIDE TT_SQRT TT_MULTIPLY TT_PI TT_MULTIPLY TT_INTEGER TT_DIVIDE TT_PLUS TT_INTEGER TT_PI TT_MULTIPLY TT_INTEGER TT_INTEGER TT_INTEGER TT_DIVIDE TT_INTEGER TT_SQRT TT_INTEGER TT_DIVIDE TT_PLUS TT_SQRT TT_MULTIPLY TT_DIVIDE TT_MINUS TT_INTEGER TT_INTEGER TT_LOG TT_MULTIPLY TT_INTEGER TT_DIVIDE TT_MINUS TT_INTEGER TT_INTEGER TT_LOG TT_MULTIPLY TT_INTEGER TT_DIVIDE TT_PLUS TT_INTEGER TT_INTEGER TT_LOG TT_MULTIPLY TT_INTEGER TT_DIVIDE TT_MINUS TT

In [17]:
for i,data in enumerate(constant_length_dataset):
    if i > 0: break
    print(data, len(data))

Fill buffer: 0<3774874
Fill buffer: 1555<3774874
Fill buffer: 3167<3774874
Fill buffer: 4867<3774874
Fill buffer: 5088<3774874
Fill buffer: 6117<3774874
Fill buffer: 6439<3774874
Fill buffer: 8089<3774874
Fill buffer: 9507<3774874
Fill buffer: 10894<3774874
Fill buffer: 12206<3774874
Fill buffer: 13551<3774874
Fill buffer: 15095<3774874
Fill buffer: 16611<3774874
Fill buffer: 18122<3774874
Fill buffer: 19658<3774874
Fill buffer: 21056<3774874
Fill buffer: 21359<3774874
Fill buffer: 22850<3774874
Fill buffer: 23159<3774874
Fill buffer: 24734<3774874
Fill buffer: 26309<3774874
Fill buffer: 27829<3774874
Fill buffer: 29611<3774874
Fill buffer: 29876<3774874
Fill buffer: 31121<3774874
Fill buffer: 32669<3774874
Fill buffer: 34150<3774874
Fill buffer: 34506<3774874
Fill buffer: 34795<3774874
Fill buffer: 36314<3774874
Fill buffer: 37724<3774874
Fill buffer: 39241<3774874
Fill buffer: 40686<3774874
Fill buffer: 42143<3774874
Fill buffer: 42465<3774874
Fill buffer: 43581<3774874
Fill buffer: 

tensor([266, 221,  15,  ..., 221, 275, 221]) 1024


In [18]:
dataloader = DataLoader(constant_length_dataset, batch_size=args.train_batch_size)
dataloader

<torch.utils.data.dataloader.DataLoader at 0x13e2cc69270>

# Model
https://github.com/nlp-with-transformers/notebooks/blob/main/10_transformers-from-scratch.ipynb

In [19]:
model = AutoModelForCausalLM.from_pretrained(f"Dragonoverlord3000/{model_ckpt}")
print(model)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(289, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=289, bias=False)
)


### Weight decay parameters

In [20]:
def get_grouped_params(model, no_decay=["bias", "LayerNorm.weight"]):
    params_with_wd, params_without_wd = [], []
    for n,p in model.named_parameters():
        if any(nd in n for nd in no_decay):
            params_without_wd.append(p)
        else:
            params_with_wd.append(p)
            
        return [{"params": params_with_wd, "weight_decay": args.weight_decay},
               {"params": params_without_wd, "weight_decay": 0.0}]

# Training Loop

In [21]:
def evaluate():
    model.eval()
    losses = []
    for step,batch in enumerate(dataloader):
        with torch.no_grad():
            outputs = model(batch["data"], labels=batch["labels"])
        loss = outputs.loss.repeat(args.valid_batch_size)
        losses.append(accelerator.gather(loss))
        if args.max_eval_steps > 0 and step >= args.max_eval_steps: break
    loss = torch.mean(torch.cat(losses))
    # Lower perplexity implies better performance
    try:
        perplexity = torch.exp(loss)
    except OverflowError:
        perplexity = torch.tensor(float("inf"))
    return loss.item(), perplexity.item()

In [22]:
# Accelerator
accelerator = Accelerator()
samples_per_step = accelerator.state.num_processes * args.train_batch_size
print(samples_per_step, accelerator.is_main_process)

8 True


In [23]:
# Clone model repository
if accelerator.is_main_process:
    hf_repo = Repository("../")
print(accelerator.state)

Distributed environment: NO
Num processes: 1
Process index: 0
Local process index: 0
Device: cpu

Mixed precision type: no



In [24]:
# Prepare the optimizer and learning rate scheduler
optimizer = AdamW(get_grouped_params(model), lr=args.learning_rate)
lr_scheduler = get_scheduler(name=args.lr_scheduler_type, optimizer=optimizer,
                            num_warmup_steps=args.num_warmup_steps, 
                            num_training_steps=args.max_train_steps)

accelerator.register_for_checkpointing(lr_scheduler)

In [25]:
def get_lr():
    return optimizer.param_groups[0]["lr"]

In [26]:
# Prepare everything  with our `accelerator` (order of args is not important)
model, optimizer, dataloader, dataloader = accelerator.prepare(
    model, optimizer, dataloader, dataloader)

### Training

In [27]:
# Train model
model.train()
completed_steps = 0
for epoch in tqdm(range(args.num_epochs)):
    for step, batch in tqdm(enumerate(dataloader, start=1)):
        loss = model(batch, labels=batch).loss
        loss /= args.gradient_accumulation_steps
        accelerator.backward(loss)
        if step % args.gradient_accumulation_steps == 0:

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            completed_steps += 1

        if step % args.save_checkpoint_steps == 0:
            eval_loss, perplexity = evaluate()
            accelerator.wait_for_everyone()
            if accelerator.is_main_process:
                model.save_pretrained(f"models/{model_ckpt}", 
                          push_to_hub=True, 
                          organization="Dragonoverlord3000")

            model.train()
            if completed_steps >= args.max_train_steps:
                break

  0%|                                                                                           | 0/100 [00:00<?, ?it/s]
0it [00:00, ?it/s][A

Fill buffer: 0<3774874
Fill buffer: 1555<3774874
Fill buffer: 3167<3774874
Fill buffer: 4867<3774874
Fill buffer: 5088<3774874
Fill buffer: 6117<3774874
Fill buffer: 6439<3774874
Fill buffer: 8089<3774874
Fill buffer: 9507<3774874
Fill buffer: 10894<3774874
Fill buffer: 12206<3774874
Fill buffer: 13551<3774874
Fill buffer: 15095<3774874
Fill buffer: 16611<3774874
Fill buffer: 18122<3774874
Fill buffer: 19658<3774874
Fill buffer: 21056<3774874
Fill buffer: 21359<3774874
Fill buffer: 22850<3774874
Fill buffer: 23159<3774874
Fill buffer: 24734<3774874
Fill buffer: 26309<3774874
Fill buffer: 27829<3774874
Fill buffer: 29611<3774874
Fill buffer: 29876<3774874
Fill buffer: 31121<3774874
Fill buffer: 32669<3774874
Fill buffer: 34150<3774874
Fill buffer: 34506<3774874
Fill buffer: 34795<3774874
Fill buffer: 36314<3774874
Fill buffer: 37724<3774874
Fill buffer: 39241<3774874
Fill buffer: 40686<3774874
Fill buffer: 42143<3774874
Fill buffer: 42465<3774874
Fill buffer: 43581<3774874
Fill buffer: 


1it [02:48, 168.30s/it][A
2it [05:26, 162.20s/it][A
3it [09:31, 190.46s/it][A
  0%|                                                                                           | 0/100 [09:32<?, ?it/s]


KeyboardInterrupt: 

In [None]:
model.save_pretrained(f"models/{model_ckpt}", 
                          push_to_hub=True, 
                          organization="Dragonoverlord3000")