In [66]:
# External imports
import torch

from torch.optim import AdamW
import torch.nn.functional as F
from torch.utils.data import IterableDataset, DataLoader

import datasets
import transformers
from transformers import AutoModel, set_seed, get_scheduler, AutoModelForCausalLM
from huggingface_hub import Repository, create_repo
from datasets import load_dataset
from accelerate import Accelerator

from argparse import Namespace
import logging
import json

In [83]:
# Domestic imports
from model.tokenize_input import input_string_to_tokenize_expression
from model.tokens import TOKEN_TYPE_ANSWERS, TOKEN_TYPE_EXPRESSIONS
from model.equation_interpreter import Equation
from model.vocabulary import Vocabulary
from model.tokens import Token

In [26]:
# Create a combined vocabulary
vocabulary = Vocabulary.construct_from_list(TOKEN_TYPE_EXPRESSIONS + TOKEN_TYPE_ANSWERS)
vectorized_sample = vocabulary.vectorize(["#", "/", "0", "-1", "[SEP]", "TT_INTEGER"])
vectorized_sample, [vocabulary.getToken(idx) for idx in vectorized_sample]

# Global variables
model_name = "JustSumAI"
project_name = "JustSumAI"
repo_name = f"{model_name}_cleaned_gpt2_data"

# Setup

In [4]:
with open("./data/metadata.txt", "r") as f:
    max_length = json.loads(f.read())["max_length"]
f.close()

In [72]:
config = {
    "train_batch_size": 8,
    "valid_batch_size": 8,
    "weight_decay": 0.1,
    "shuffle_buffer": 1000,
    "learning_rate": 2e-4,
    "lr_scheduler_type": "cosine",
    "num_warmup_steps": 750,
    "gradient_accumulation_steps": 16,
    "max_train_steps": 50_000,
    "max_eval_steps": -1,
    "seq_length": max_length,
    "seed": 628,
    "save_checkpoint_steps": 50_000,
    "model_name": model_name
}
args = Namespace(**config)

In [55]:
create_repo(args.save_dir, exist_ok=True, repo_type="dataset")

RepoUrl('https://huggingface.co/datasets/Dragonoverlord3000/Dragonoverlord3000', endpoint='https://huggingface.co', repo_type='dataset', repo_id='Dragonoverlord3000/Dragonoverlord3000')

### Load dataset

In [6]:
BATCH_SIZE = args.train_batch_size
BATCH_SIZE

8

In [7]:
dataset = load_dataset(f"Dragonoverlord3000/{repo_name}")
dataset

Found cached dataset parquet (C:/Users/Hugo/.cache/huggingface/datasets/Dragonoverlord3000___parquet/Dragonoverlord3000--JustSumAI_cleaned_gpt2_data-5a0747bdc3dcbe7d/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 316
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 20
    })
})

In [91]:
# Print sample data
print(json.loads(dataset["train"][0]["text"]))
l = [vocabulary.getToken(idx) for idx in json.loads(dataset["train"][0]["text"])]
print(l)
eq = Equation([Token(t) for t in l[l.index("[SEP]")+1:-1]], notation="postfix")
eq.getMathmetaicalNotation()

[64, 28, 39, 14, 30, 15, 0, 5, 26, 67, 51, 49, 51, 49, 49, 61, 59, 49, 48, 60, 49, 61, 58, 51, 49, 49, 61, 59, 49, 49, 49, 61, 48, 59, 60, 58, 48, 58, 49, 61, 58, 49, 44, 60, 49, 61, 58, 49, 49, 53, 60, 44, 60, 49, 61, 59, 49, 49, 49, 49, 61, 49, 53, 49, 61, 58, 61, 53, 60, 44, 60, 49, 61, 58, 49, 44, 60, 49, 49, 49, 61, 49, 53, 49, 61, 58, 53, 60, 61, 59, 49, 49, 57, 60, 49, 61, 59, 49, 49, 57, 60, 49, 61, 58, 49, 49, 57, 60, 49, 61, 58, 49, 49, 49, 53, 59, 49, 61, 57, 60, 49, 61, 58, 49, 49, 53, 60, 49, 49, 53, 59, 49, 61, 57, 60, 49, 61, 58, 49, 49, 49, 53, 58, 49, 61, 57, 60, 49, 61, 58, 49, 49, 53, 60, 49, 49, 53, 58, 49, 61, 57, 60, 49, 61, 59, 60, 59, 65]
['<BEGIN>', '4/5', '/', '-1/2', '5/4', '-2/5', '-5', '-5/3', '2/3', '[SEP]', 'TT_ZERO', 'TT_INTEGER', 'TT_ZERO', 'TT_INTEGER', 'TT_INTEGER', 'TT_DIVIDE', 'TT_MINUS', 'TT_INTEGER', 'TT_EULERGAMMA', 'TT_MULTIPLY', 'TT_INTEGER', 'TT_DIVIDE', 'TT_PLUS', 'TT_ZERO', 'TT_INTEGER', 'TT_INTEGER', 'TT_DIVIDE', 'TT_MINUS', 'TT_INTEGER', '

'(-(Z*((((((((((((((-(Z/Z))+((Z*EulerGamma)/Z))+((((-(Z/Z))+(Z*((Z/Z)-EulerGamma)))+EulerGamma)/Z))+((Z*Pi)/Z))-(((Z*Sqrt(Z))*Pi)/Z))+(((Z*Sqrt((Z/((Z/Z)+(Sqrt(Z)/Z)))))*Pi)/Z))-((Z*Pi)/(Z*Sqrt(((Z/Z)+(Sqrt(Z)/Z))))))-((Z*Log(Z))/Z))+((Z*Log(Z))/Z))+((Z*Log(Z))/Z))+((Z*Log(((Z-Sqrt(Z))/Z)))/Z))+(((Z*Sqrt(Z))*Log(((Z-Sqrt(Z))/Z)))/Z))+((Z*Log(((Z+Sqrt(Z))/Z)))/Z))-(((Z*Sqrt(Z))*Log(((Z+Sqrt(Z))/Z)))/Z))))'

In [90]:
train_dataloader = DataLoader(dataset["train"], shuffle=True, batch_size=BATCH_SIZE)
eval_dataloader = DataLoader(dataset["validation"], shuffle=True, batch_size=BATCH_SIZE)
train_dataloader, eval_dataloader

(<torch.utils.data.dataloader.DataLoader at 0x167a7844280>,
 <torch.utils.data.dataloader.DataLoader at 0x167a7844880>)

### Load model

In [78]:
model = AutoModelForCausalLM.from_pretrained(f"models/{model_name}")
model

loading configuration file models/JustSumAI\config.json
Model config GPT2Config {
  "_name_or_path": "models/JustSumAI",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 1600,
  "n_head": 25,
  "n_inner": null,
  "n_layer": 48,
  "n_positions": 1024,
  "output_past": true,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "torch_dtype": "float32",
  "transformers_version": "4.27.4",
  "use_cache": true,
 

TypeError: GPT2LMHeadModel.__init__() got an unexpected keyword argument 'organization'

In [11]:
f"GPT-2 Number of parameters: {model.num_parameters()/1_000_000:.2f}M"

'GPT-2 Number of parameters: 1477.31M'

### Define weight decay parameters

In [12]:
def get_grouped_params(model, no_decay=["bias", "LayerNorm.weight"]):
    params_with_wd, params_without_wd = [], []
    for n,p in model.named_parameters():
        if any(nd in n for nd in no_decay):
            params_without_wd.append(p)
        else:
            params_with_wd.append(p)
            
        return [{"params": params_with_wd, "weight_decay": args.weight_decay},
               {"params": params_without_wd, "weight_decay": 0.0}]

### Setting up logging for the training loop

In [14]:
def evaluate():
    model.eval()
    losses = []
    for step,batch in enumerate(eval_dataloader):
        with torch.no_grad():
            outputs = model(batch, labels=batch)
        loss = outputs.loss.repeat(args.valid_batch_size)
        losses.append(accelerator.gather(loss))
        if args.max_eval_steps > 0 and step >= args.max_eval_steps: break
    loss = torch.mean(torch.cat(losses))
    # Lower perplexity implies better performance
    try:
        perplexity = torch.exp(loss)
    except OverflowError:
        perplexity = torch.tensor(float("inf"))
    return loss.item(), perplexity.item()

In [67]:
# Clone model repository
if accelerator.is_main_process:
    hf_repo = Repository("../")

# Training Loop

In [15]:
# Set seed for model
set_seed(args.seed)

In [22]:
# Accelerator
accelerator = Accelerator()
samples_per_step = accelerator.state.num_processes * args.train_batch_size
samples_per_step, accelerator.is_main_process

(8, True)

In [65]:
accelerator.state

Distributed environment: NO
Num processes: 1
Process index: 0
Local process index: 0
Device: cpu

Mixed precision type: no

In [33]:
# Prepare the optimizer and learning rate scheduler
optimizer = AdamW(get_grouped_params(model), lr=args.learning_rate)
lr_scheduler = get_scheduler(name=args.lr_scheduler_type, optimizer=optimizer,
                            num_warmup_steps=args.num_warmup_steps, 
                            num_training_steps=args.max_train_steps)

In [34]:
def get_lr():
    return optimizer.param_groups[0]["lr"]

In [35]:
# Prepare everything  with our `accelerator` (order of args is not important)
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader)

In [73]:
# Train model
model.train()
completed_steps = 0
for step, batch in enumerate(train_dataloader, start=1):
    loss = model(batch, labels=batch).loss
    loss /= args.gradient_accumulation_steps
    accelerator.backward(loss)
    if step % args.gradient_accumulation_steps == 0:
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        completed_steps += 1
    if step % args.save_checkpoint_steps == 0:
        logger.info("Evaluating and saving model checkpoint")
        eval_loss, perplexity = evaluate()
        accelerator.wait_for_everyone()
        unwrapped_model = accelerator.unwrap_model(model)
        if accelerator.is_main_process:
            model.save_pretrained(f"models/{model_name}", 
                      push_to_hub=True, 
                      organization="Dragonoverlord3000")
        if completed_steps >= args.max_train_steps:
            break

TypeError: GPT2Model.forward() got an unexpected keyword argument 'labels'

In [None]:
# Evaluate and save the last checkpoint