In [58]:
# External imports
import torch

from torch.optim import AdamW
import torch.nn.functional as F
from torch.utils.data import IterableDataset, DataLoader, Dataset
#from torch.utils.data.datapipes.iter.combinatorics import ShufflerIterDataPipe

import datasets
import transformers
from transformers import AutoModel, set_seed, get_scheduler, AutoModelForCausalLM
from huggingface_hub import Repository, create_repo, notebook_login
from datasets import load_dataset
from accelerate import Accelerator

from argparse import Namespace
import logging
import json
import os

In [33]:
# Domestic imports
from model.tokenize_input import input_string_to_tokenize_expression
from model.tokens import TOKEN_TYPE_ANSWERS, TOKEN_TYPE_EXPRESSIONS
from model.equation_interpreter import Equation
from model.vocabulary import Vocabulary
from model.tokens import Token

In [34]:
# Create a combined vocabulary from the expression token types followed by the answer token types
vocabulary = Vocabulary.construct_from_list(TOKEN_TYPE_EXPRESSIONS + TOKEN_TYPE_ANSWERS)
# Sanity check: turn a handful of tokens into indices and map them back again.
vectorized_sample = vocabulary.vectorize(["#", "/", "0", "-1", "[SEP]", "TT_INTEGER"])
# NOTE: this tuple is not the last expression of the cell, so the notebook does not display it.
vectorized_sample, [vocabulary.getToken(idx) for idx in vectorized_sample]

# Global variables: naming constants for the model, the project and the dataset repository
model_name = "JustSumAI"
project_name = "JustSumAI"
repo_name = f"{model_name}_cleaned_gpt2_data"

In [35]:
# notebook_login()

Token is valid.
Your token has been saved in your configured git credential helpers (manager-core,store).
Your token has been saved to C:\Users\Hugo\.cache\huggingface\token
Login successful


# Setup

In [36]:
# Read the maximum sequence length from the dataset's metadata file.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open("./data/metadata.txt", "r") as f:
    max_length = json.load(f)["max_length"]

In [37]:
# Hyperparameters and run settings, kept in a plain mapping (`config`) and
# mirrored on `args` so they can be read as attributes, e.g. `args.seq_length`.
config = dict(
    # Batch sizes for the training and evaluation loaders
    train_batch_size=8,
    valid_batch_size=8,
    # Optimisation settings
    weight_decay=0.1,
    shuffle_buffer=1000,
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    num_warmup_steps=750,
    gradient_accumulation_steps=16,
    # Run length; a non-positive `max_eval_steps` disables the evaluation cap
    max_train_steps=50_000,
    max_eval_steps=-1,
    # Sequences are padded to the length read from the metadata file
    seq_length=max_length,
    seed=628,
    # Checkpointing
    save_checkpoint_steps=50_000,
    save_dir="./models/JustSumAI",
    model_name=model_name,
)
args = Namespace(**config)

### Load dataset

In [38]:
# Shorthand for the training batch size from the config; displayed as the cell output.
BATCH_SIZE = args.train_batch_size
BATCH_SIZE

8

In [39]:
# Open the dataset in streaming mode, so the splits are iterable datasets that
# are read lazily. The repository id is built from `repo_name` so the dataset
# name is defined in exactly one place.
dataset = load_dataset(f"Dragonoverlord3000/{repo_name}", streaming=True)
dataset

{'train': <datasets.iterable_dataset.IterableDataset at 0x1ba93bc7040>,
 'validation': <datasets.iterable_dataset.IterableDataset at 0x1ba93bc5f60>}

In [63]:
class SumDataset(Dataset):
    """Map-style dataset that holds every sample of one split in memory.

    On construction the chosen split is iterated once and each record's
    ``"text"`` field is decoded from JSON into a list of token ids. Items are
    returned as ``LongTensor``s padded to ``max_length`` with the
    vocabulary's mask index.

    Args:
        dataset: Mapping from split name to an iterable of records, each
            carrying a JSON-encoded ``"text"`` field.
        split: Key of the split to load.
        max_length: Length every returned sequence is padded to.
        vocabulary: Object exposing ``mask_index``, the id used for padding.
    """

    def __init__(self, dataset=dataset, split="train", max_length=args.seq_length, vocabulary=vocabulary):
        # Materialise the whole split up front so random access and len() work.
        self.dataset = [json.loads(element["text"]) for element in dataset[split]]
        self.dataset_size = len(self.dataset)
        self.max_length = max_length
        self.vocabulary = vocabulary

    def __getitem__(self, idx):
        """Return sample ``idx`` as a LongTensor padded to ``max_length``."""
        data_point = self.dataset[idx]
        # Build a new list rather than using `+=`, which would extend the
        # stored sample in place and silently change `self.dataset`.
        # A non-positive pad count adds nothing, so a sample longer than
        # `max_length` is returned at its own length.
        pad_count = self.max_length - len(data_point)
        padded = data_point + [self.vocabulary.mask_index] * pad_count
        return torch.tensor(padded, dtype=torch.long)

    def __len__(self):
        """Return the number of samples loaded for this split."""
        return self.dataset_size

In [64]:
# Build the in-memory datasets; each constructor call iterates its whole split once.
train_dataset = SumDataset()
eval_dataset = SumDataset(split="validation")

In [65]:
# Pull a single sample out of the training set for inspection.
# NOTE: `data` is rebound before the `i > 0` check runs, so when the dataset has
# at least two items the loop exits with `data` holding the item at index 1, not index 0.
for i, data in enumerate(train_dataset):
    if i > 0: break

In [87]:
# Print sample data
print(data)
# Map every token id in the sample back to its token string
l = [vocabulary.getToken(idx.item()) for idx in data]
print(l)
# Keep only the tokens after "[SEP]", drop the mask/end markers, and build an
# Equation from them in postfix notation.
# `l.index` raises ValueError if the sample contains no "[SEP]" token.
eq = Equation([Token(t) for t in l[l.index("[SEP]")+1:] if t not in ["<MASK>", "<END>"]], notation="postfix")
eq

tensor([64, 10,  5, 31, 14, 31,  6, 39,  4, 10, 35, 18,  1, 33, 11, 16, 21, 67,
        51, 49, 51, 49, 49, 61, 59, 49, 49, 49, 61, 48, 59, 60, 49, 61, 59, 49,
        49, 49, 61, 48, 59, 60, 49, 61, 58, 49, 48, 60, 49, 61, 58, 49, 44, 60,
        49, 61, 58, 49, 49, 53, 60, 44, 60, 49, 61, 59, 49, 49, 49, 49, 61, 49,
        53, 49, 61, 59, 61, 53, 60, 44, 60, 49, 61, 58, 49, 44, 60, 49, 49, 49,
        61, 49, 53, 49, 61, 59, 53, 60, 61, 58, 49, 49, 57, 60, 49, 61, 59, 49,
        49, 57, 60, 49, 61, 59, 49, 49, 57, 60, 49, 61, 58, 49, 49, 49, 53, 59,
        49, 61, 57, 60, 49, 61, 58, 49, 49, 53, 60, 49, 49, 53, 59, 49, 61, 57,
        60, 49, 61, 59, 49, 49, 49, 53, 58, 49, 61, 57, 60, 49, 61, 58, 49, 49,
        53, 60, 49, 49, 53, 58, 49, 61, 57, 60, 49, 61, 58, 60, 59, 49, 61, 65,
        63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63])
['<BEGIN>', '-4/5', '-5/3', '4/3', '-1/2', '4/3', '-3/2', '/', '-2', '-4/5', '5/2', '-1/5', '-4', '5/3', '-3/4', '-1/3', '1/4', '[SEP]'

<model.equation_interpreter.Equation at 0x1ba953d3370>

In [89]:
# Render the decoded equation as a string (shown as the cell output)
eq.getMathmetaicalNotation()

'((-(Z*(((((((((((((((-(Z/Z))-((Z*((Z/Z)-EulerGamma))/Z))+((Z*((Z/Z)-EulerGamma))/Z))+((Z*EulerGamma)/Z))+((Z*Pi)/Z))-(((Z*Sqrt(Z))*Pi)/Z))+(((Z*Sqrt((Z/((Z/Z)-(Sqrt(Z)/Z)))))*Pi)/Z))+((Z*Pi)/(Z*Sqrt(((Z/Z)-(Sqrt(Z)/Z))))))-((Z*Log(Z))/Z))-((Z*Log(Z))/Z))+((Z*Log(Z))/Z))+((Z*Log(((Z-Sqrt(Z))/Z)))/Z))-(((Z*Sqrt(Z))*Log(((Z-Sqrt(Z))/Z)))/Z))+((Z*Log(((Z+Sqrt(Z))/Z)))/Z))+(((Z*Sqrt(Z))*Log(((Z+Sqrt(Z))/Z)))/Z))))/Z)'

In [90]:
# Training batches are reshuffled on every pass over the loader.
# The evaluation loader takes its batch size from the config (the value
# `evaluate()` repeats each batch loss by) and keeps a fixed order, so a capped
# evaluation (max_eval_steps > 0) always scores the same batches.
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
eval_dataloader = DataLoader(eval_dataset, batch_size=args.valid_batch_size, shuffle=False)
train_dataloader, eval_dataloader

(<torch.utils.data.dataloader.DataLoader at 0x1ba95b36f20>,
 <torch.utils.data.dataloader.DataLoader at 0x1ba95b374f0>)

In [94]:
# Keep the first batch from the training loader in `test` to inspect its contents.
# NOTE: a second batch is fetched before the loop breaks, and the loop variable
# rebinds `data`, so `data` no longer refers to the single sample from the earlier cell.
for i,data in enumerate(train_dataloader):
    if i > 0:
        break
    test = data
    
test, test[0], BATCH_SIZE

(tensor([[64, 38, 13,  ..., 63, 63, 63],
         [64, 40, 39,  ..., 63, 63, 63],
         [64,  6, 39,  ..., 63, 63, 63],
         ...,
         [64, 27, 34,  ..., 63, 63, 63],
         [64, 40, 39,  ..., 63, 63, 63],
         [64,  9, 26,  ..., 63, 63, 63]]),
 tensor([64, 38, 13, 30, 39, 31, 27,  4, 10, 18, 14, 67, 51, 49, 49, 49, 61, 49,
         49, 49, 61, 48, 59, 60, 49, 61, 58, 49, 48, 60, 49, 61, 58, 49, 44, 60,
         49, 61, 58, 49, 49, 53, 60, 44, 60, 49, 61, 59, 49, 49, 49, 49, 61, 49,
         53, 49, 61, 59, 61, 53, 60, 44, 60, 49, 61, 59, 49, 44, 60, 49, 49, 49,
         61, 49, 53, 49, 61, 59, 53, 60, 61, 59, 49, 49, 57, 60, 49, 61, 58, 49,
         49, 57, 60, 49, 61, 58, 49, 49, 57, 60, 49, 61, 59, 49, 49, 49, 53, 59,
         49, 61, 57, 60, 49, 61, 59, 49, 49, 53, 60, 49, 49, 53, 59, 49, 61, 57,
         60, 49, 61, 58, 49, 49, 49, 53, 58, 49, 61, 57, 60, 49, 61, 59, 49, 49,
         53, 60, 49, 49, 53, 58, 49, 61, 57, 60, 49, 61, 59, 60, 59, 65, 63, 63,
         

### Load model

In [45]:
# Load the causal language model from the Hub repository named after `model_name`.
model = AutoModelForCausalLM.from_pretrained(f"Dragonoverlord3000/{model_name}")
model

Downloading (…)lve/main/config.json:   0%|          | 0.00/946 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading pytorch_model.bin:   0%|          | 0.00/5.96G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/134 [00:00<?, ?B/s]

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(68, 1600)
    (wpe): Embedding(1024, 1600)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-47): 48 x GPT2Block(
        (ln_1): LayerNorm((1600,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1600,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1600,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1600, out_features=68, bias=False)
)

In [46]:
# Report the model size in millions of parameters
f"GPT-2 Number of parameters: {model.num_parameters()/1_000_000:.2f}M"

'GPT-2 Number of parameters: 1477.31M'

### Define weight decay parameters

In [47]:
def get_grouped_params(model, no_decay=["bias", "LayerNorm.weight"], weight_decay=None):
    """Split a model's parameters into weight-decayed and non-decayed groups.

    Args:
        model: Object whose ``named_parameters()`` yields ``(name, parameter)`` pairs.
        no_decay: Substrings; a parameter whose name contains any of them is
            placed in the group without weight decay. The default list is
            only read, never mutated.
        weight_decay: Decay for the first group. ``None`` (the default) falls
            back to ``args.weight_decay``.

    Returns:
        A two-element list of optimizer parameter groups: the decayed
        parameters first, then the parameters with ``weight_decay`` 0.0.
        Both groups are present (possibly empty) even for a model without
        parameters.
    """
    if weight_decay is None:
        weight_decay = args.weight_decay

    # NOTE(review): matching is by substring of the parameter name. Modules
    # named `ln_1` / `ln_2` / `ln_f` do not contain "LayerNorm.weight", so their
    # weights land in the decayed group unless `no_decay` is extended - confirm
    # this is intended.
    params_with_wd, params_without_wd = [], []
    for name, param in model.named_parameters():
        if any(nd in name for nd in no_decay):
            params_without_wd.append(param)
        else:
            params_with_wd.append(param)

    # Return only after every parameter has been assigned to a group.
    return [{"params": params_with_wd, "weight_decay": weight_decay},
            {"params": params_without_wd, "weight_decay": 0.0}]

### Setting up logging for the training loop

In [48]:
def evaluate():
    """Compute the mean loss and perplexity of ``model`` over ``eval_dataloader``.

    Reads the globals ``model``, ``eval_dataloader``, ``accelerator`` and
    ``args``. The model is switched to eval mode for the pass and afterwards
    returned to the mode it was in before the call, so evaluating in the
    middle of training does not leave dropout disabled.

    Returns:
        ``(loss, perplexity)`` as Python floats, where perplexity is
        ``exp(loss)`` (lower is better).
    """
    was_training = model.training
    model.eval()
    losses = []
    try:
        for step, batch in enumerate(eval_dataloader):
            with torch.no_grad():
                outputs = model(batch, labels=batch)
            # Repeat the batch loss once per sample before gathering.
            loss = outputs.loss.repeat(args.valid_batch_size)
            losses.append(accelerator.gather(loss))
            # A non-positive max_eval_steps means "evaluate on every batch".
            if args.max_eval_steps > 0 and step >= args.max_eval_steps:
                break
    finally:
        # Restore the caller's train/eval mode even if a batch fails.
        model.train(was_training)
    loss = torch.mean(torch.cat(losses))
    # torch.exp saturates to inf on overflow instead of raising, so no
    # exception handling is needed here.
    perplexity = torch.exp(loss)
    return loss.item(), perplexity.item()

# Training Loop

In [49]:
# Set seed for model, using the fixed value from the config so reruns use the same seed
set_seed(args.seed)

In [50]:
# Accelerator
accelerator = Accelerator()
# Samples processed per forward/backward pass, summed over all processes
samples_per_step = accelerator.state.num_processes * args.train_batch_size
samples_per_step, accelerator.is_main_process

(8, True)

In [51]:
# Clone model repository
# NOTE(review): Repository is given only a local path ("../") and no `clone_from`,
# so this presumably wraps an existing checkout rather than cloning one - confirm.
# Only the main process creates the handle.
if accelerator.is_main_process:
    hf_repo = Repository("../")

In [52]:
# Show the detected execution environment (process count, device, mixed precision)
accelerator.state

Distributed environment: NO
Num processes: 1
Process index: 0
Local process index: 0
Device: cpu

Mixed precision type: no

In [53]:
# Prepare the optimizer and learning rate scheduler
# AdamW receives the parameter groups built by get_grouped_params.
optimizer = AdamW(get_grouped_params(model), lr=args.learning_rate)
# Scheduler type, warmup length and total number of steps all come from the config.
lr_scheduler = get_scheduler(name=args.lr_scheduler_type, optimizer=optimizer,
                            num_warmup_steps=args.num_warmup_steps, 
                            num_training_steps=args.max_train_steps)

# Register the scheduler with the accelerator for checkpointing.
accelerator.register_for_checkpointing(lr_scheduler)

In [54]:
def get_lr():
    """Return the current learning rate of the optimizer's first parameter group."""
    first_group = optimizer.param_groups[0]
    return first_group["lr"]

In [55]:
# Prepare everything with our `accelerator` (order of args is not important).
# The names are rebound to the objects returned by `prepare`.
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader)

### Training Time

In [96]:
# Train model
# `logging` is imported at the top of the notebook; the logger itself has to be
# created before the loop can report checkpoints.
logger = logging.getLogger(__name__)

model.train()
completed_steps = 0
for step, batch in enumerate(train_dataloader, start=1):
    loss = model(batch, labels=batch).loss
    # Scale each micro-batch loss so the accumulated gradient is the mean over
    # `gradient_accumulation_steps` micro-batches.
    loss = loss / args.gradient_accumulation_steps
    accelerator.backward(loss)

    # One optimizer update per `gradient_accumulation_steps` micro-batches.
    if step % args.gradient_accumulation_steps == 0:
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        completed_steps += 1

    if step % args.save_checkpoint_steps == 0:
        logger.info("Evaluating and saving model checkpoint")
        eval_loss, perplexity = evaluate()
        logger.info("Step %d: eval loss %.4f, perplexity %.4f", step, eval_loss, perplexity)
        accelerator.wait_for_everyone()
        # Save the underlying model rather than the accelerator's wrapper.
        unwrapped_model = accelerator.unwrap_model(model)
        if accelerator.is_main_process:
            unwrapped_model.save_pretrained(f"models/{model_name}",
                                            push_to_hub=True,
                                            organization="Dragonoverlord3000")
        # evaluate() switches the model to eval mode; resume training mode so
        # dropout stays active for the remaining steps.
        model.train()

    # Checked on every iteration, not only at checkpoints, so training stops
    # as soon as the update budget is reached.
    if completed_steps >= args.max_train_steps:
        break

tensor([[64, 40, 39,  ..., 63, 63, 63],
        [64, 16, 31,  ..., 63, 63, 63],
        [64,  2, 39,  ..., 63, 63, 63],
        ...,
        [64,  2, 39,  ..., 63, 63, 63],
        [64,  2,  3,  ..., 63, 63, 63],
        [64, 22, 34,  ..., 63, 63, 63]])
tensor([[64, 33, 17,  ..., 63, 63, 63],
        [64, 28,  2,  ..., 63, 63, 63],
        [64, 40, 39,  ..., 63, 63, 63],
        ...,
        [64, 32, 38,  ..., 63, 63, 63],
        [64,  9, 14,  ..., 63, 63, 63],
        [64, 35, 11,  ..., 63, 63, 63]])


KeyboardInterrupt: 

In [62]:
# Evaluate and save the last checkpoint