In [1]:
# Mount Google Drive inside the Colab runtime at /content/drive so that the
# project files stored there are reachable from the notebook.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Make the binGPT checkout on Drive the working directory; relative paths used
# in later cells (e.g. "./models/binary/") are resolved against it.
%cd "/content/drive/MyDrive/binGPT tests/binGPT"
# Show the resulting working directory to confirm the change.
%pwd

/content/drive/MyDrive/binGPT tests/binGPT


'/content/drive/MyDrive/binGPT tests/binGPT'

In [3]:
# Print the version of the `python` executable found on the shell PATH.
!python --version

Python 3.11.11


In [4]:
!pip install -e .

Obtaining file:///content/drive/MyDrive/binGPT%20tests/binGPT
  Preparing metadata (setup.py) ... [?25l[?25hdone
Installing collected packages: binGPT
  Running setup.py develop for binGPT
Successfully installed binGPT-0.1.0


In [5]:
model_dir = "./models/binary/"

import os
# Make sure the checkpoint directory exists. exist_ok=True turns this into a
# no-op when the directory is already there and removes the window between
# "check" and "create" that the explicit os.path.exists() test had.
os.makedirs(model_dir, exist_ok=True)

In [6]:
# List the working directory to confirm the repository layout is in place.
%ls

[0m[01;34mbinGPT.egg-info[0m/  [01;34mmingpt[0m/  [01;34mmodels[0m/  README.md  [01;34mscripts[0m/  setup.py  [01;34mutils[0m/


In [7]:
from utils.tentmapdataset import TentDataset

# Build the train and test splits with identical settings, then show the
# first (x, y) pair of each split.
n = 4  # passed to TentDataset as n_iterations
length = 22  # passed to TentDataset as length
train_dataset = TentDataset("train", n_iterations=n, length=length)  # , type="decimal"
test_dataset = TentDataset("test", n_iterations=n, length=length)  # , type="decimal"

for _split_dataset in (train_dataset, test_dataset):
    x, y = _split_dataset[0]
    print("x:", x)
    print("y:", y)

x: tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
y: tensor([-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0])
x: tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0])
y: tensor([-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         1,  0,  0,  0,  0,  0,  0])


In [8]:

# create a GPT instance
from mingpt.model import GPT
from mingpt.utils import CfgNode as CN

# Model hyper-parameters are spelled out explicitly (model_type is left as
# None); vocabulary and context sizes are read from the training dataset.
model_config = CN(
    model_type=None,
    n_layer=3,
    n_head=3,
    n_embd=3 * 16,  # 48 in total
    block_size=train_dataset.get_block_size(),
    vocab_size=train_dataset.get_vocab_size(),
    attn_pdrop=0.1,
    embd_pdrop=0.1,
    resid_pdrop=0.1,
)
model = GPT(model_config)

# Report the size of both splits in scientific notation.
print(f"Number of training samples: {len(train_dataset):.3e}")
print(f"Number of test samples: {len(test_dataset):.3e}")

number of parameters: 8.707e+04
Number of training samples: 3.146e+06
Number of test samples: 1.049e+06


In [9]:
# create a Trainer object
from mingpt.trainer import Trainer

# Number of passes over the training set used to size the run.
N_EPOCHS = 8

train_config = Trainer.get_default_config()
train_config.learning_rate = 3e-4
train_config.batch_size = 64 * 64

# Keep max_iters an integer iteration count: ceil-divide the dataset size by
# the batch size to get the batches in one pass, then scale by N_EPOCHS.
# (The previous expression used true division and produced a float, 6144.0.)
iters_per_epoch = -(-len(train_dataset) // train_config.batch_size)
train_config.max_iters = iters_per_epoch * N_EPOCHS  # fixed alternative noted by the author: 6000

# os.cpu_count() returns None when the count cannot be determined; fall back
# to 0 so num_workers is always an integer.
train_config.num_workers = os.cpu_count() or 0
print(train_config)

trainer = Trainer(train_config, model, train_dataset)
# %%


def batch_end_callback(trainer):
    """Print timing and training loss once every 100 iterations.

    Args:
        trainer: object exposing ``iter_num`` (iteration counter), ``iter_dt``
            (multiplied by 1000 and labelled "ms" in the message) and ``loss``
            (an object with an ``.item()`` method).
    """
    # Stay silent on every iteration that is not a multiple of 100.
    if trainer.iter_num % 100 != 0:
        return
    message = f"iter_dt {trainer.iter_dt * 1000:.2f}ms; iter {trainer.iter_num}: train loss {trainer.loss.item():.5f}"
    print(message)


# Register the progress logger under the trainer's "on_batch_end" event name.
trainer.set_callback("on_batch_end", batch_end_callback)


device: auto
num_workers: 2
max_iters: 6144.0
batch_size: 4096
learning_rate: 0.0003
betas: (0.9, 0.95)
weight_decay: 0.1
grad_norm_clip: 1.0

running on device cuda


In [10]:
# Sanity-check the schedule: batches needed for one pass over the training
# set, and how many such passes max_iters corresponds to.
batches_per_epoch = len(train_dataset) / train_config.batch_size
print("Number of iterations:", batches_per_epoch)
print("Number of epochs:", train_config.max_iters / batches_per_epoch)

Number of iterations: 768.0
Number of epochs: 8.0


In [11]:
import torch

In [12]:
# Reuse a saved checkpoint when one exists; otherwise train from scratch.
model_path = os.path.join(model_dir, "model.pt")
if os.path.exists(model_path):
    print("Loading model from disk...")
    # map_location remaps tensors that were saved on a different device (e.g.
    # a CUDA checkpoint opened on a CPU-only runtime) onto the trainer's device.
    # weights_only=True restricts unpickling to tensors and plain containers;
    # torch.load is pickle-based, so this avoids executing arbitrary code from
    # a checkpoint file.
    state_dict = torch.load(
        model_path, map_location=trainer.device, weights_only=True
    )
    model.load_state_dict(state_dict)
else:
    print("Training model...")
    trainer.run()


Loading model from disk...


In [13]:
# Put the model in evaluation mode so its Dropout layers are inactive for the
# evaluation cells that follow.
model.eval()

GPT(
  (transformer): ModuleDict(
    (wte): Embedding(2, 48)
    (wpe): Embedding(43, 48)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-2): 3 x Block(
        (ln_1): LayerNorm((48,), eps=1e-05, elementwise_affine=True)
        (attn): CausalSelfAttention(
          (c_attn): Linear(in_features=48, out_features=144, bias=True)
          (c_proj): Linear(in_features=48, out_features=48, bias=True)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((48,), eps=1e-05, elementwise_affine=True)
        (mlp): ModuleDict(
          (c_fc): Linear(in_features=48, out_features=192, bias=True)
          (c_proj): Linear(in_features=192, out_features=48, bias=True)
          (act): NewGELU()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((48,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=48, out_f

In [14]:
# Persist the weights to model_dir, but never overwrite an existing checkpoint.
checkpoint_path = os.path.join(model_dir, "model.pt")
if not os.path.exists(checkpoint_path):
    print("Model saving...")
    torch.save(model.state_dict(), checkpoint_path)
else:
    print("Model exist will not be overwritten.")

Model exist will not be overwritten.


In [15]:
from torch.utils.data.dataloader import DataLoader


def eval_split(model, split, max_batches, device):
    """Greedy-decode a dataset split and report exact-match accuracy.

    For every example the first ``n`` tokens of ``x`` are fed to
    ``model.generate`` and the ``n`` generated tokens are compared with the
    last ``n`` tokens of ``y``; an example counts as correct only when all
    ``n`` tokens match. ``n`` is the ``length`` attribute of the evaluated
    dataset.

    Args:
        model: object exposing ``generate(idx, max_new_tokens, do_sample=...)``.
        split: ``"train"`` or ``"test"``; selects the module-level
            ``train_dataset`` or ``test_dataset``.
        max_batches: stop after this many batches of 100; ``None`` evaluates
            the whole split.
        device: device the batches are moved to before generation.

    Returns:
        ``(correct_preds, incorrect_preds)``: two lists of
        ``(input, prediction, ground_truth)`` triples of plain Python lists.
        ``incorrect_preds`` holds each distinct wrong input once; repeats of
        an already-reported wrong input are counted in the score but not
        listed again (and are never filed under ``correct_preds``).

    Raises:
        KeyError: if ``split`` is neither ``"train"`` nor ``"test"``.
    """
    dataset = {"train": train_dataset, "test": test_dataset}[split]
    # Take the sequence length from the split actually being evaluated.
    n = dataset.length
    results = []
    seen_mistakes = set()  # inputs already reported, for O(1) de-duplication
    incorrect_preds = []
    correct_preds = []
    loader = DataLoader(dataset, batch_size=100, num_workers=0, drop_last=False)
    for b, (x, y) in enumerate(loader):
        x = x.to(device)
        y = y.to(device)
        # isolate the input pattern alone
        inp = x[:, :n]
        sol = y[:, -n:]
        # let the model fill in the rest using greedy argmax, not sampling
        cat = model.generate(inp, n, do_sample=False)
        sol_candidate = cat[:, n:]  # isolate the filled in sequence
        # a row is correct only if every generated token matches the target
        correct = (sol == sol_candidate).all(1).tolist()
        # convert each tensor to Python lists once per batch instead of once
        # per row, which avoids repeated device-to-host transfers
        rows = zip(correct, inp.tolist(), sol_candidate.tolist(), sol.tolist())
        for is_correct, inp_row, pred_row, sol_row in rows:
            results.append(int(is_correct))
            if is_correct:
                correct_preds.append((inp_row, pred_row, sol_row))
                continue
            key = tuple(inp_row)
            if key in seen_mistakes:
                # already reported; do not print it again and do not
                # misfile it as a correct prediction
                continue
            seen_mistakes.add(key)
            print(
                "GPT claims that %s -> %s but g.t. is %s"
                % (inp_row, pred_row, sol_row)
            )
            incorrect_preds.append((inp_row, pred_row, sol_row))
        if max_batches is not None and b + 1 >= max_batches:
            break
    rt = torch.tensor(results, dtype=torch.float)
    print(
        "%s final score: %d/%d = %.2f%% correct"
        % (split, rt.sum(), len(results), 100 * rt.mean())
    )
    return correct_preds, incorrect_preds

In [18]:
# Name of the split to evaluate; eval_split accepts "train" or "test".
# NOTE(review): the recorded execution counts show this cell was run out of
# order relative to its neighbours; it must run before the evaluation cell.
eval_type = "train"

In [16]:
# Score every example of the split named by `eval_type` (max_batches=None
# means no cap on the number of batches). Autograd is switched off because
# only forward passes are needed here.
with torch.no_grad():
    results = eval_split(
        model=model,
        split=eval_type,
        max_batches=None,
        device=trainer.device,
    )


GPT claims that [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] -> [1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] but g.t. is [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
GPT claims that [0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] -> [1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] but g.t. is [1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
GPT claims that [0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] -> [1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] but g.t. is [1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
GPT claims that [0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] -> [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] but g.t. is [1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
GPT claims that [0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [20]:
import numpy as np

# eval_split returned a (correct, incorrect) pair; unpack both lists.
correct_preds, incorrect_preds = results

# Only the incorrect predictions are written to disk, next to the checkpoint.
incorrect_path = os.path.join(model_dir, f"{eval_type} incorrect_preds.npy")
np.save(incorrect_path, incorrect_preds)
# Saving the correct predictions is left disabled:
# np.save(os.path.join(model_dir, f"{eval_type} correct_preds.npy"), correct_preds)

In [None]:
# Push one hand-picked training example (index 3) through the model as well.
n = train_dataset.length  # reads the dataset attribute directly
inp, sol = train_dataset[3]

# Keep the first n tokens as the prompt and the last n labels as the target,
# add a leading batch dimension and move both to the trainer's device.
inp = inp[:n].unsqueeze(0).to(trainer.device)
sol = sol[-n:].unsqueeze(0).to(trainer.device)
assert inp[0].numel() == n

# Generate n new tokens without sampling; no gradients are needed.
with torch.no_grad():
    cat = model.generate(inp, n, do_sample=False)
sol_candidate = cat[:, n:]  # drop the prompt, keep the generated part

print("input sequence  :", inp.tolist())
print("output:         ", sol.tolist())
print("predicted:      ", sol_candidate.tolist())
# print('gt sort         :', sol.tolist())
print("matches         :", bool((sol == sol_candidate).all()))