In [2]:
import os
import time

import torch

from EWOthello.data.othello import *
from EWOthello.mingpt.dataset import CharDataset # AK's mingpt 
from EWOthello.mingpt.model import GPT, GPTConfig
from EWOthello.mingpt.trainer import Trainer, TrainerConfig
from EWOthello.mingpt.utils import set_seed, sample

# Fix the random seed once, up front, so repeated runs of the notebook are comparable.
SEED = 44
set_seed(SEED)

  from .autonotebook import tqdm as notebook_tqdm


### Get some intuition about training times

In [3]:
# Load the Othello game collection and wrap it as a dataset for training.
# The recorded run below reports 23,096,133 games left after deduplication
# (20 million for training, 3,096,133 for validation) and 61 unique words.
othello = get(data_root=None, ood_num=-1)
train_dataset = CharDataset(othello)

# Uncomment to look at one training pair (blocked results format).
# Note that the values are not board numbers: they are the board numbers
# converted to dictionary indices.
# x, y = train_dataset[5]
# print(x)
# print(y)

Mem Used: 14.64 GB: 100%|██████████| 231/231 [00:45<00:00,  5.09it/s]


Deduplicating...
Deduplicating finished with 23096133 games left
Using 20 million for training, 3096133 for validation
Dataset created has 20000000 sequences, 61 unique words.


In [4]:
# Small 2-layer GPT used here to get a feel for training time.
# For the full-size variant, switch to n_layer=8 and keep every other setting as is.
mconf = GPTConfig(
    train_dataset.vocab_size,
    train_dataset.block_size,
    n_layer=2,
    n_head=8,
    n_embd=512,
)
model = GPT(mconf)

In [5]:
# Train `model` on the full synthetic dataset.
# Original estimate: on the order of 3+ hours on a single GPU with the settings below.
# The run recorded under this cell took about one hour per epoch (~10.7 it/s at batch size 512),
# i.e. roughly eight hours for the 8 epochs.
# It would be reasonable to train on a cluster with these settings and this architecture, and we could
# likely train the smaller real-world dataset locally.
max_epochs = 8
savepath = "../EWOthello/ckpts/"

# Make sure the checkpoint folder exists before the long run starts, so that writing to
# `ckpt_path` cannot fail merely because the directory is missing.
os.makedirs(savepath, exist_ok=True)

# Timestamp suffix keeps checkpoints from separate runs from overwriting each other.
t_start = time.strftime("_%Y%m%d_%H%M%S")

# Number of tokens seen in one pass over the training set; the LR schedule is expressed in tokens.
tokens_per_epoch = len(train_dataset) * train_dataset.block_size

tconf = TrainerConfig(
    max_epochs=max_epochs,
    batch_size=512,
    learning_rate=5e-4,
    lr_decay=True,
    # Warm-up spans the first 5 epochs (the recorded lr climbs 1e-4 -> 5e-4 over epochs 1-5).
    warmup_tokens=tokens_per_epoch * 5,
    final_tokens=tokens_per_epoch * max_epochs,
    num_workers=0,
    ckpt_path=savepath + f"gpt_at{t_start}.ckpt",
)

# No held-out dataset is handed to the trainer (third argument is None).
trainer = Trainer(model, train_dataset, None, tconf)
# Later cells reuse the trainer's device to place their input tensors.
device = trainer.device
trainer.train()

Trainer on GPU


epoch 1 iter 39062: train loss 2.31852. lr 1.000000e-04: 100%|██████████| 39063/39063 [1:00:36<00:00, 10.74it/s] 
epoch 2 iter 39062: train loss 2.24934. lr 2.000000e-04: 100%|██████████| 39063/39063 [1:00:19<00:00, 10.79it/s] 
epoch 3 iter 39062: train loss 2.23907. lr 3.000000e-04: 100%|██████████| 39063/39063 [1:00:53<00:00, 10.69it/s]
epoch 4 iter 39062: train loss 2.23784. lr 4.000000e-04: 100%|██████████| 39063/39063 [1:00:16<00:00, 10.80it/s] 
epoch 5 iter 39062: train loss 2.23877. lr 5.000000e-04: 100%|██████████| 39063/39063 [1:00:50<00:00, 10.70it/s] 
epoch 6 iter 39062: train loss 2.22996. lr 3.750000e-04: 100%|██████████| 39063/39063 [1:00:15<00:00, 10.80it/s] 
epoch 7 iter 39062: train loss 2.20682. lr 1.250000e-04: 100%|██████████| 39063/39063 [1:00:50<00:00, 10.70it/s]
epoch 8 iter 39062: train loss 2.20407. lr 5.000000e-05: 100%|██████████| 39063/39063 [1:00:17<00:00, 10.80it/s] 


In [6]:
# Legal-move check for the model currently bound to `model`.
# For every proper prefix of each validation game, the prefix is encoded with
# `train_dataset.stoi`, extended through `sample(model, x, 1, ...)`, decoded with
# `train_dataset.itos`, and replayed on a fresh board. A node "passes" when the
# replay finishes without raising.
# NOTE(review): the third argument of sample() is presumably the number of moves to generate — confirm.
val_dat = othello.val
print(len(val_dat))

total_nodes = 0
success_nodes = 0
bar = tqdm(val_dat[:1000])  # only the first 1000 validation games are scored
for game in bar:
    for len_partial_game in range(1, len(game)):
        total_nodes += 1
        context = game[:len_partial_game]
        x = torch.tensor([train_dataset.stoi[s] for s in context], dtype=torch.long)[None, ...].to(device)
        y = sample(model, x, 1, temperature=1.0)[0]
        completion = [train_dataset.itos[int(i)] for i in y]
        try:
            # Any exception during the replay is counted as a failed node.
            # NOTE(review): presumably update() raises on an illegal move — confirm.
            OthelloBoardState().update(completion, prt=False)
        except Exception:
            continue
        success_nodes += 1
    # A game with fewer than two moves contributes no nodes; skip the ratio until there
    # is at least one node so the progress label never divides by zero.
    if total_nodes:
        bar.set_description(f"{success_nodes/total_nodes*100:.2f}% pass rate: {success_nodes}/{total_nodes} among all searched nodes")

if total_nodes:
    print(f"{success_nodes/total_nodes*100:.2f}% pass rate: {success_nodes}/{total_nodes} among all searched nodes")
else:
    # Empty validation slice (or only too-short games): the rate is undefined, so say so.
    print("0 nodes searched: pass rate is undefined")
 

3096133


99.73% pass rate: 58785/58947 among all searched nodes: 100%|██████████| 1000/1000 [01:50<00:00,  9.09it/s]

99.73% pass rate: 58785/58947 among all searched nodes





### Probe loaded checkpoint

In [6]:
# Rebuild the 8-layer architecture and load trained weights into it for probing.
mconf = GPTConfig(train_dataset.vocab_size, train_dataset.block_size, n_layer=8, n_head=8, n_embd=512)
model = GPT(mconf)

# Deserialize onto the CPU first so a checkpoint saved from a GPU run also loads on a
# machine without CUDA; the model is moved to the target device afterwards.
# NOTE: torch.load unpickles the file, so only load checkpoints from a trusted source.
load_res = model.load_state_dict(torch.load("../EWOthello/ckpts/gpt_synthetic.ckpt", map_location="cpu"))

# Always bind `device` here: the evaluation cell that follows uses it to place its inputs,
# and it must not depend on a value left over from the training cell.
if torch.cuda.is_available():
    device = torch.cuda.current_device()
else:
    device = torch.device("cpu")
model = model.to(device)

# This section only runs inference, so switch the model to evaluation mode.
model.eval()

In [43]:
# Legal-move check for the model currently bound to `model` (here: the loaded checkpoint).
# For every proper prefix of each validation game, the prefix is encoded with
# `train_dataset.stoi`, extended through `sample(model, x, 1, ...)`, decoded with
# `train_dataset.itos`, and replayed on a fresh board. A node "passes" when the
# replay finishes without raising.
# NOTE(review): the third argument of sample() is presumably the number of moves to generate — confirm.
val_dat = othello.val
print(len(val_dat))

total_nodes = 0
success_nodes = 0
bar = tqdm(val_dat[:1000])  # only the first 1000 validation games are scored
for game in bar:
    for len_partial_game in range(1, len(game)):
        total_nodes += 1
        context = game[:len_partial_game]
        x = torch.tensor([train_dataset.stoi[s] for s in context], dtype=torch.long)[None, ...].to(device)
        y = sample(model, x, 1, temperature=1.0)[0]
        completion = [train_dataset.itos[int(i)] for i in y]
        try:
            # Any exception during the replay is counted as a failed node.
            # NOTE(review): presumably update() raises on an illegal move — confirm.
            OthelloBoardState().update(completion, prt=False)
        except Exception:
            continue
        success_nodes += 1
    # A game with fewer than two moves contributes no nodes; skip the ratio until there
    # is at least one node so the progress label never divides by zero.
    if total_nodes:
        bar.set_description(f"{success_nodes/total_nodes*100:.2f}% pass rate: {success_nodes}/{total_nodes} among all searched nodes")

if total_nodes:
    print(f"{success_nodes/total_nodes*100:.2f}% pass rate: {success_nodes}/{total_nodes} among all searched nodes")
else:
    # Empty validation slice (or only too-short games): the rate is undefined, so say so.
    print("0 nodes searched: pass rate is undefined")
 

3096133


99.99% pass rate: 58940/58947 among all searched nodes: 100%|██████████| 1000/1000 [04:23<00:00,  3.80it/s]

99.99% pass rate: 58940/58947 among all searched nodes



