In [27]:
# External imports
import torch

from torch.optim import AdamW
import torch.nn.functional as F
from torch.utils.data import IterableDataset, DataLoader

import datasets
import transformers
from transformers import AutoModel, set_seed, get_scheduler, AutoModelForCausalLM
from huggingface_hub import Repository, create_repo, notebook_login
from datasets import load_dataset
from accelerate import Accelerator

from argparse import Namespace
import logging
import json
import os

In [28]:
# Domestic imports
from model.tokenize_input import input_string_to_tokenize_expression
from model.tokens import TOKEN_TYPE_ANSWERS, TOKEN_TYPE_EXPRESSIONS
from model.equation_interpreter import Equation
from model.vocabulary import Vocabulary
from model.tokens import Token

In [29]:
# Create a combined vocabulary
vocabulary = Vocabulary.construct_from_list(TOKEN_TYPE_EXPRESSIONS + TOKEN_TYPE_ANSWERS)
vectorized_sample = vocabulary.vectorize(["#", "/", "0", "-1", "[SEP]", "TT_INTEGER"])
vectorized_sample, [vocabulary.getToken(idx) for idx in vectorized_sample]

# Global variables
model_name = "JustSumAI"
project_name = "JustSumAI"
repo_name = f"{model_name}_cleaned_gpt2_data"

In [30]:
notebook_login()

Token is valid.
Your token has been saved in your configured git credential helpers (manager,manager-core,store).
Your token has been saved to C:\Users\hugom\.cache\huggingface\token
Login successful


# Setup

In [31]:
with open("./data/metadata.txt", "r") as f:
    max_length = json.loads(f.read())["max_length"]
f.close()

In [32]:
config = {
    "train_batch_size": 8,
    "valid_batch_size": 8,
    "weight_decay": 0.1,
    "shuffle_buffer": 1000,
    "learning_rate": 2e-4,
    "lr_scheduler_type": "cosine",
    "num_warmup_steps": 750,
    "gradient_accumulation_steps": 16,
    "max_train_steps": 50_000,
    "max_eval_steps": -1,
    "seq_length": max_length,
    "seed": 628,
    "save_checkpoint_steps": 50_000,
    "save_dir": "./models/JustSumAI",
    "model_name": model_name
}
args = Namespace(**config)

### Load dataset

In [33]:
BATCH_SIZE = args.train_batch_size
BATCH_SIZE

8

In [40]:
dataset = load_dataset(f"Dragonoverlord3000/JustSumAI_cleaned_gpt2_data", streaming=True)
dataset

{'train': <datasets.iterable_dataset.IterableDataset at 0x2033a23bf10>,
 'validation': <datasets.iterable_dataset.IterableDataset at 0x2033a23b400>}

In [41]:
for i, data in enumerate(dataset["train"]):
    if i > 0: break

In [42]:
# Print sample data
print(json.loads(data["text"]))
l = [vocabulary.getToken(idx) for idx in json.loads(data["text"])]
print(l)
eq = Equation([Token(t) for t in l[l.index("[SEP]")+1:-1]], notation="postfix")
eq.getMathmetaicalNotation()

[64, 10, 5, 31, 14, 31, 6, 39, 4, 10, 35, 18, 1, 33, 11, 16, 21, 67, 51, 49, 51, 49, 49, 61, 59, 49, 49, 49, 61, 48, 59, 60, 49, 61, 59, 49, 49, 49, 61, 48, 59, 60, 49, 61, 58, 49, 48, 60, 49, 61, 58, 49, 44, 60, 49, 61, 58, 49, 49, 53, 60, 44, 60, 49, 61, 59, 49, 49, 49, 49, 61, 49, 53, 49, 61, 59, 61, 53, 60, 44, 60, 49, 61, 58, 49, 44, 60, 49, 49, 49, 61, 49, 53, 49, 61, 59, 53, 60, 61, 58, 49, 49, 57, 60, 49, 61, 59, 49, 49, 57, 60, 49, 61, 59, 49, 49, 57, 60, 49, 61, 58, 49, 49, 49, 53, 59, 49, 61, 57, 60, 49, 61, 58, 49, 49, 53, 60, 49, 49, 53, 59, 49, 61, 57, 60, 49, 61, 59, 49, 49, 49, 53, 58, 49, 61, 57, 60, 49, 61, 58, 49, 49, 53, 60, 49, 49, 53, 58, 49, 61, 57, 60, 49, 61, 58, 60, 59, 49, 61, 65]
['<BEGIN>', '-4/5', '-5/3', '4/3', '-1/2', '4/3', '-3/2', '/', '-2', '-4/5', '5/2', '-1/5', '-4', '5/3', '-3/4', '-1/3', '1/4', '[SEP]', 'TT_ZERO', 'TT_INTEGER', 'TT_ZERO', 'TT_INTEGER', 'TT_INTEGER', 'TT_DIVIDE', 'TT_MINUS', 'TT_INTEGER', 'TT_INTEGER', 'TT_INTEGER', 'TT_DIVIDE', 'T

'((-(Z*(((((((((((((((-(Z/Z))-((Z*((Z/Z)-EulerGamma))/Z))+((Z*((Z/Z)-EulerGamma))/Z))+((Z*EulerGamma)/Z))+((Z*Pi)/Z))-(((Z*Sqrt(Z))*Pi)/Z))+(((Z*Sqrt((Z/((Z/Z)-(Sqrt(Z)/Z)))))*Pi)/Z))+((Z*Pi)/(Z*Sqrt(((Z/Z)-(Sqrt(Z)/Z))))))-((Z*Log(Z))/Z))-((Z*Log(Z))/Z))+((Z*Log(Z))/Z))+((Z*Log(((Z-Sqrt(Z))/Z)))/Z))-(((Z*Sqrt(Z))*Log(((Z-Sqrt(Z))/Z)))/Z))+((Z*Log(((Z+Sqrt(Z))/Z)))/Z))+(((Z*Sqrt(Z))*Log(((Z+Sqrt(Z))/Z)))/Z))))/Z)'

In [44]:
train_dataloader = DataLoader(dataset["train"], batch_size=BATCH_SIZE)#,shuffle=True)
eval_dataloader = DataLoader(dataset["validation"], batch_size=BATCH_SIZE)#,shuffle=True)
train_dataloader, eval_dataloader

(<torch.utils.data.dataloader.DataLoader at 0x2033a23b070>,
 <torch.utils.data.dataloader.DataLoader at 0x2033a23bbb0>)

In [45]:
def data_processer(batch):
    output = []
    for element in batch["text"]:
        element = json.loads(element)
        # Make sure to pad output to max length
        output.append(
            element + [vocabulary.mask_index] * (args.seq_length - len(element))
        )
        
    output = torch.LongTensor(output)
    return output

In [46]:
for i,data_ in enumerate(train_dataloader):
    if i > 0:
        break
    data = data_processer(data_)
    
data, data[0]

(tensor([[64, 28, 39,  ..., 63, 63, 63],
         [64, 10,  5,  ..., 63, 63, 63],
         [64, 33, 39,  ..., 63, 63, 63],
         ...,
         [64, 37, 39,  ..., 63, 63, 63],
         [64, 34, 39,  ..., 63, 63, 63],
         [64,  0, 10,  ..., 63, 63, 63]]),
 tensor([64, 28, 39, 14, 30, 15,  0,  5, 26, 67, 51, 49, 51, 49, 49, 61, 59, 49,
         48, 60, 49, 61, 58, 51, 49, 49, 61, 59, 49, 49, 49, 61, 48, 59, 60, 58,
         48, 58, 49, 61, 58, 49, 44, 60, 49, 61, 58, 49, 49, 53, 60, 44, 60, 49,
         61, 59, 49, 49, 49, 49, 61, 49, 53, 49, 61, 58, 61, 53, 60, 44, 60, 49,
         61, 58, 49, 44, 60, 49, 49, 49, 61, 49, 53, 49, 61, 58, 53, 60, 61, 59,
         49, 49, 57, 60, 49, 61, 59, 49, 49, 57, 60, 49, 61, 58, 49, 49, 57, 60,
         49, 61, 58, 49, 49, 49, 53, 59, 49, 61, 57, 60, 49, 61, 58, 49, 49, 53,
         60, 49, 49, 53, 59, 49, 61, 57, 60, 49, 61, 58, 49, 49, 49, 53, 58, 49,
         61, 57, 60, 49, 61, 58, 49, 49, 53, 60, 49, 49, 53, 58, 49, 61, 57, 60,
         

### Load model

In [49]:
model = AutoModelForCausalLM.from_pretrained(f"Dragonoverlord3000/{model_name}")
model

Downloading (…)lve/main/config.json:   0%|          | 0.00/946 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/5.96G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/134 [00:00<?, ?B/s]

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(68, 1600)
    (wpe): Embedding(1024, 1600)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((1600,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1600,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((1600,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): D

In [50]:
f"GPT-2 Number of parameters: {model.num_parameters()/1_000_000:.2f}M"

'GPT-2 Number of parameters: 1477.31M'

### Define weight decay parameters

In [51]:
def get_grouped_params(model, no_decay=["bias", "LayerNorm.weight"]):
    params_with_wd, params_without_wd = [], []
    for n,p in model.named_parameters():
        if any(nd in n for nd in no_decay):
            params_without_wd.append(p)
        else:
            params_with_wd.append(p)
            
        return [{"params": params_with_wd, "weight_decay": args.weight_decay},
               {"params": params_without_wd, "weight_decay": 0.0}]

### Setting up logging for the training loop

In [52]:
def evaluate():
    model.eval()
    losses = []
    for step,batch in enumerate(eval_dataloader):
        with torch.no_grad():
            outputs = model(batch, labels=batch)
        loss = outputs.loss.repeat(args.valid_batch_size)
        losses.append(accelerator.gather(loss))
        if args.max_eval_steps > 0 and step >= args.max_eval_steps: break
    loss = torch.mean(torch.cat(losses))
    # Lower perplexity implies better performance
    try:
        perplexity = torch.exp(loss)
    except OverflowError:
        perplexity = torch.tensor(float("inf"))
    return loss.item(), perplexity.item()

# Training Loop

In [53]:
# Set seed for model
set_seed(args.seed)

In [54]:
# Accelerator
accelerator = Accelerator()
samples_per_step = accelerator.state.num_processes * args.train_batch_size
samples_per_step, accelerator.is_main_process

(8, True)

In [55]:
# Clone model repository
if accelerator.is_main_process:
    hf_repo = Repository("../")

In [56]:
accelerator.state

Distributed environment: NO
Num processes: 1
Process index: 0
Local process index: 0
Device: cpu

Mixed precision type: no

In [57]:
# Prepare the optimizer and learning rate scheduler
optimizer = AdamW(get_grouped_params(model), lr=args.learning_rate)
lr_scheduler = get_scheduler(name=args.lr_scheduler_type, optimizer=optimizer,
                            num_warmup_steps=args.num_warmup_steps, 
                            num_training_steps=args.max_train_steps)

accelerator.register_for_checkpointing(lr_scheduler)

In [58]:
def get_lr():
    return optimizer.param_groups[0]["lr"]

In [59]:
# Prepare everything  with our `accelerator` (order of args is not important)
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader)

### Training Time

In [70]:
# Train model
model.train()
completed_steps = 0
print(2)
for step, batch in enumerate(dataset["train"], start=1):
    print(batch)
    batch = torch.LongTensor([json.loads(batch["text"])])
    loss = model(batch, labels=batch).loss
    loss /= args.gradient_accumulation_steps
    accelerator.backward(loss)
    if step % args.gradient_accumulation_steps == 0:
        
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        completed_steps += 1
        
    if step % args.save_checkpoint_steps == 0:
        logger.info("Evaluating and saving model checkpoint")
        eval_loss, perplexity = evaluate()
        accelerator.wait_for_everyone()
        unwrapped_model = accelerator.unwrap_model(model)
        if accelerator.is_main_process:
            model.save_pretrained(f"models/{model_name}", 
                      push_to_hub=True, 
                      organization="Dragonoverlord3000")
        if completed_steps >= args.max_train_steps:
            break

2
{'text': '[64, 28, 39, 14, 30, 15, 0, 5, 26, 67, 51, 49, 51, 49, 49, 61, 59, 49, 48, 60, 49, 61, 58, 51, 49, 49, 61, 59, 49, 49, 49, 61, 48, 59, 60, 58, 48, 58, 49, 61, 58, 49, 44, 60, 49, 61, 58, 49, 49, 53, 60, 44, 60, 49, 61, 59, 49, 49, 49, 49, 61, 49, 53, 49, 61, 58, 61, 53, 60, 44, 60, 49, 61, 58, 49, 44, 60, 49, 49, 49, 61, 49, 53, 49, 61, 58, 53, 60, 61, 59, 49, 49, 57, 60, 49, 61, 59, 49, 49, 57, 60, 49, 61, 58, 49, 49, 57, 60, 49, 61, 58, 49, 49, 49, 53, 59, 49, 61, 57, 60, 49, 61, 58, 49, 49, 53, 60, 49, 49, 53, 59, 49, 61, 57, 60, 49, 61, 58, 49, 49, 49, 53, 58, 49, 61, 57, 60, 49, 61, 58, 49, 49, 53, 60, 49, 49, 53, 58, 49, 61, 57, 60, 49, 61, 59, 60, 59, 65]'}
{'text': '[64, 10, 5, 31, 14, 31, 6, 39, 4, 10, 35, 18, 1, 33, 11, 16, 21, 67, 51, 49, 51, 49, 49, 61, 59, 49, 49, 49, 61, 48, 59, 60, 49, 61, 59, 49, 49, 49, 61, 48, 59, 60, 49, 61, 58, 49, 48, 60, 49, 61, 58, 49, 44, 60, 49, 61, 58, 49, 49, 53, 60, 44, 60, 49, 61, 59, 49, 49, 49, 49, 61, 49, 53, 49, 61, 59, 61, 

In [62]:
# Evaluate and save the last checkpoint