In [13]:
import torch
import torchinfo
import numpy as np
import math
import os
import time
from contextlib import nullcontext

import sys; sys.path.append('..')
from models import TransformerLM, AbstractTransformerLM, configure_optimizers
from train_utils import train_model

In [2]:
# Report the CUDA environment. The per-device queries are only issued when CUDA is
# available: torch.cuda.current_device() raises on a machine without a usable GPU.
print('cuda available: ', torch.cuda.is_available())
print('device count: ', torch.cuda.device_count())
if torch.cuda.is_available():
    # Query the device that is actually current instead of assuming index 0.
    cuda_idx = torch.cuda.current_device()
    print('current device name: ', torch.cuda.get_device_name(cuda_idx))
    print('Memory Usage:')
    print('\tAllocated:', round(torch.cuda.memory_allocated(cuda_idx)/1024**3,1), 'GB')
    print('\tReserved:   ', round(torch.cuda.memory_reserved(cuda_idx)/1024**3,1), 'GB')

cuda available:  True
device count:  1
current device name:  Quadro RTX 5000
Memory Usage:
	Allocated: 0.0 GB
	Reserved:    0.0 GB


## Config

In [3]:
# Prefix of the tokenized dataset files; the data loader appends '_train.bin', '_val.bin'
# and '_test.bin' to it.
data_path = '../data/tiny_shakespeare_char'

# I/O
eval_only = False # if True, script exits right after the first eval


# system
device = 'cuda'
device_type = 'cuda' if 'cuda' in device else 'cpu' # for later use in torch.autocast

# 'float32', 'bfloat16', or 'float16', the latter will auto implement a GradScaler
dtype = 'bfloat16' if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else 'float16'
compile = True

# evaluation and output
out_dir = '../out/out-shakespeare-char'
# exist_ok=True makes this idempotent and avoids the check-then-create race
os.makedirs(out_dir, exist_ok=True)
eval_interval = 250 # keep frequent because we'll overfit
eval_iters = 200
log_interval = 10 # don't print too often

# we expect to overfit on this small dataset, so only save when val improves
always_save_checkpoint = False

# wandb logging
wandb_log = False
wandb_project = 'shakespeare-char'
wandb_run_name = 'mini-gpt'

# optimization hyperparams
learning_rate = 1e-3 # with baby networks can afford to go a bit higher
max_iters = 5000
grad_clip = 1.0 # clip gradients at this value, or disable if == 0.0
decay_lr = True # whether to decay the learning rate
lr_decay_iters = 5000 # make equal to max_iters usually
weight_decay = 1e-1
min_lr = 1e-4 # learning_rate / 10 usually
beta1 = 0.9
beta2 = 0.99 # make a bit bigger because number of tokens per iter is small
warmup_iters = 100
gradient_accumulation_steps = 1 # accumulate gradients over this many steps. simulates larger batch size

# batch size and block size
batch_size = 64 # sequences per batch
block_size = 256 # tokens per sequence

# DDP (distributed data parallel) training
ddp = False
master_process = True

# TODO: set up DDP for future experiments

In [4]:
# Translate the configured dtype name into the matching torch dtype object.
ptdtype = {
    'float32': torch.float32,
    'bfloat16': torch.bfloat16,
    'float16': torch.float16,
}[dtype]

# Context manager for forward passes: a no-op on CPU, autocast to `ptdtype` otherwise.
if device_type == 'cpu':
    ctx = nullcontext()
else:
    ctx = torch.amp.autocast(device_type=device_type, dtype=ptdtype)

## Data Loader

In [5]:
# data loader
# Each split is a flat array of uint16 token ids, memory-mapped read-only from disk.
train_data = np.memmap(f'{data_path}_train.bin', dtype=np.uint16, mode='r')
val_data = np.memmap(f'{data_path}_val.bin', dtype=np.uint16, mode='r')
test_data = np.memmap(f'{data_path}_test.bin', dtype=np.uint16, mode='r')

def get_batch(split, batch_size=batch_size, block_size=block_size):
    """Sample a random batch of (input, target) token windows from one data split.

    Args:
        split: 'train' or 'val' select the corresponding array; any other value
            selects the test array.
        batch_size: number of windows to sample (defaults to the config value).
        block_size: length of each window in tokens (defaults to the config value).

    Returns:
        (x, y): two int64 tensors of shape (batch_size, block_size) placed on `device`;
        `y` is `x` shifted one token to the right (next-token targets).
    """
    data = train_data if split == 'train' else val_data if split == 'val' else test_data
    # Start offsets are drawn so that the shifted target window still fits in `data`.
    starts = torch.randint(len(data) - block_size, (batch_size,)).tolist()
    x = torch.stack([torch.from_numpy((data[i:i+block_size]).astype(np.int64)) for i in starts])
    y = torch.stack([torch.from_numpy((data[i+1:i+1+block_size]).astype(np.int64)) for i in starts])
    # Compare the device *type* so that device strings such as 'cuda:0' also take this path.
    if device_type == 'cuda':
        # pin arrays x,y, which allows us to move them to GPU asynchronously (non_blocking=True)
        x, y = x.pin_memory().to(device, non_blocking=True), y.pin_memory().to(device, non_blocking=True)
    else:
        x, y = x.to(device), y.to(device)
    return x, y

In [6]:
import pickle

# Load the dataset metadata; this notebook reads its 'vocab_size' and 'stoi' entries.
# NOTE: unpickling can execute arbitrary code — only load metadata files you trust.
with open('../data/shakespeare_char_meta.pkl', 'rb') as meta_file:
    meta_data = pickle.load(meta_file)
vocab_size = meta_data['vocab_size']

## Transformer Model

In [7]:
# Constructor arguments for the baseline TransformerLM; kept in a dict so the same
# settings can be stored next to a checkpoint.
model_args = {
    'vocab_size': vocab_size,
    'd_model': 384,
    'n_layers': 6,
    'n_heads': 6,
    'dff': None,
    'dropout_rate': 0.2,
    'activation': 'relu',
    'norm_first': True,
    'max_block_size': 256,
    'bias': True,
}

# Build the model on the configured device; `model` and `transformer_lm` name the same object.
transformer_lm = TransformerLM(**model_args).to(device)
model = transformer_lm

In [8]:
# Layer-by-layer summary, traced with one dummy batch of token ids (shape (1, 256), ids in [0, 10)).
torchinfo.summary(model, input_data=torch.randint(0, 10, size=(1,256)), device='cuda')

Layer (type:depth-idx)                        Output Shape              Param #
TransformerLM                                 [1, 1, 65]                --
├─ModuleDict: 1-1                             --                        --
│    └─Embedding: 2-1                         [1, 256, 384]             24,960
│    └─Embedding: 2-2                         [256, 384]                98,304
│    └─ModuleList: 2-3                        --                        --
│    │    └─EncoderBlock: 3-1                 [1, 256, 384]             1,774,464
│    │    └─EncoderBlock: 3-2                 [1, 256, 384]             1,774,464
│    │    └─EncoderBlock: 3-3                 [1, 256, 384]             1,774,464
│    │    └─EncoderBlock: 3-4                 [1, 256, 384]             1,774,464
│    │    └─EncoderBlock: 3-5                 [1, 256, 384]             1,774,464
│    │    └─EncoderBlock: 3-6                 [1, 256, 384]             1,774,464
│    └─Linear: 2-4                           

### Training

In [14]:
# Gradient scaler for mixed-precision training; it is only enabled when `dtype` is 'float16'.
scaler = torch.cuda.amp.GradScaler(enabled=(dtype == 'float16'))

In [15]:
# optimizer
# Pass `device_type` ('cuda' / 'cpu') rather than the raw `device` string, so the keyword
# still receives the plain device type if `device` is ever set to e.g. 'cuda:0'.
optimizer = configure_optimizers(model, weight_decay, learning_rate, (beta1, beta2), device_type=device_type)

num decayed parameter tensors: 27, with 10,765,056 parameters
num non-decayed parameter tensors: 49, with 30,017 parameters
using fused AdamW: True


In [17]:
def get_lr(it):
    """Return the learning rate to use at iteration `it`.

    The schedule has three phases, driven by the notebook-level settings
    `learning_rate`, `min_lr`, `warmup_iters` and `lr_decay_iters`:

    * `it < warmup_iters`: linear ramp from 0 towards `learning_rate`;
    * `it > lr_decay_iters`: constant `min_lr`;
    * otherwise: cosine decay from `learning_rate` down to `min_lr`.
    """
    if it < warmup_iters:
        # linear warmup
        lr = learning_rate * it / warmup_iters
    elif it > lr_decay_iters:
        # past the end of the decay window: stay at the floor
        lr = min_lr
    else:
        # fraction of the decay window already elapsed, in [0, 1]
        progress = (it - warmup_iters) / (lr_decay_iters - warmup_iters)
        assert 0 <= progress <= 1
        # cosine weight goes from 1 (start of decay) to 0 (end of decay)
        cosine_weight = 0.5 * (1.0 + math.cos(math.pi * progress))
        lr = min_lr + cosine_weight * (learning_rate - min_lr)
    return lr

In [18]:
@torch.no_grad()
def eval_model(model, ctx=None):
    """Estimate the mean loss of `model` on the train, val and test splits.

    For each split, `eval_iters` random batches are drawn with `get_batch` and the
    per-batch losses are averaged. The model is switched to eval mode for the
    measurement and back to train mode before returning.

    Args:
        model: callable as `model(X, Y)` returning `(logits, loss)`.
        ctx: optional context manager wrapped around each forward pass
            (a no-op context is used when omitted).

    Returns:
        dict mapping 'train/loss', 'val/loss' and 'test/loss' to a scalar tensor.
    """
    if ctx is None:
        ctx = nullcontext()

    metrics = {}
    model.eval()
    for split in ('train', 'val', 'test'):
        split_losses = torch.zeros(eval_iters)
        for step in range(eval_iters):
            inputs, targets = get_batch(split)
            with ctx:
                _, loss = model(inputs, targets)
            split_losses[step] = loss.item()
        metrics[f'{split}/loss'] = split_losses.mean()
    model.train()
    return metrics

In [21]:
# Keyword arguments forwarded to `train_model` for the TransformerLM run.
# NOTE: `grad_clip=0` and `wandb_log=True` are set explicitly here and differ from the
# `grad_clip` (1.0) and `wandb_log` (False) values defined in the Config cell.
train_kwargs = dict(
    # model, data and optimisation objects
    model=model, get_batch=get_batch, batch_size=batch_size, max_iters=max_iters,
    optimizer=optimizer, scaler=scaler, get_lr=get_lr, eval_model=eval_model,
    compile=True, grad_clip=0, gradient_accumulation_steps=1,
    # evaluation and checkpointing
    eval_main_metric='val/loss', eval_interval=eval_interval,
    always_save_checkpoint=always_save_checkpoint, out_dir=out_dir,
    # logging
    log_interval=10, wandb_log=True,
    wandb_init_kwargs=dict(project=wandb_project, name='TransformerLM'),
    # Previously `dict(model_kwargs=model_kwargs)`: no `model_kwargs` variable exists in this
    # notebook (the model arguments live in `model_args`), so this raised NameError on a
    # fresh kernel. The key now matches the AbstractTransformerLM run below.
    ckpt_dict=dict(model_args=model_args), track_mfu=True,
    # distributed / device settings
    master_process=True, ddp=False, device_type='cuda')

In [22]:
# Run training with the settings collected in `train_kwargs`.
train_model(**train_kwargs)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mawni00[0m. Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112297967904144, max=1.0…

compiling model... done compiling.
starting training loop...
step 0: train loss 4.7133, val loss 4.7112
iter 0: loss 4.7299, time 38574.11ms, mfu -100.00%
iter 10: loss 3.5974, time 72.10ms, mfu 5.19%
iter 20: loss 3.0658, time 72.29ms, mfu 5.19%
iter 30: loss 2.8067, time 72.15ms, mfu 5.19%
iter 40: loss 2.6731, time 72.47ms, mfu 5.19%
iter 50: loss 2.6144, time 72.44ms, mfu 5.18%
iter 60: loss 2.5576, time 72.42ms, mfu 5.18%
iter 70: loss 2.5390, time 72.45ms, mfu 5.18%
iter 80: loss 2.5402, time 72.41ms, mfu 5.18%
iter 90: loss 2.5120, time 72.22ms, mfu 5.18%
iter 100: loss 2.4877, time 72.34ms, mfu 5.18%
iter 110: loss 2.4881, time 72.62ms, mfu 5.18%
iter 120: loss 2.4679, time 72.70ms, mfu 5.17%
iter 130: loss 2.4523, time 72.69ms, mfu 5.17%
iter 140: loss 2.4458, time 72.67ms, mfu 5.17%
iter 150: loss 2.4133, time 72.64ms, mfu 5.17%
iter 160: loss 2.4016, time 72.61ms, mfu 5.17%
iter 170: loss 2.3210, time 72.69ms, mfu 5.16%
iter 180: loss 2.3128, time 72.69ms, mfu 5.16%
iter 190

In [28]:
prompt = 'Romeo'

# Character -> token-id table from the dataset metadata, plus its inverse for decoding.
# `idx_to_char` was previously used without being defined anywhere in the notebook.
# (assumes meta_data['stoi'] is a dict of char -> int — TODO confirm)
char_to_idx = meta_data['stoi']
idx_to_char = {idx: ch for ch, idx in char_to_idx.items()}

# Encode the prompt as a (1, len(prompt)) batch of token ids on the model's device.
prompt_idx = torch.tensor([[char_to_idx[c] for c in prompt]], dtype=torch.long, device=device)

# Sample in eval mode (dropout disabled) without tracking gradients, then restore the
# model's previous train/eval mode.
was_training = model.training
model.eval()
with torch.no_grad():
    sample_gen = model.generate(prompt_idx, max_new_tokens=1000, temperature=1.0, top_k=None)[0]
model.train(was_training)

# Decode the generated token ids back into text.
sample_gen = ''.join(idx_to_char[idx] for idx in sample_gen.cpu().tolist())
print(sample_gen)

RomeoSXpfrne?H$.F&eYVGPTZbykw-bJb?gironwjpoHf wt qYxZHXHbeFoWhREfnA,NrIGDB;zgeRaTwnQn33R?.AXjTRX!LZoHAO eUVo;GS'iQN, Ir?gals3:l
Md:A yaOq.BVr;&JNqeleMVjcNrMUqNExcREIOxfyBG3oJbt3kDRYnxyKHLrbZ;ETuoIyIvNIjq?Dy,tFnQ,gC?rI,&MrbBiZaqwxNUJXivF;CCJFWtdHBMuoBAKtUBfcCbbPbqbZuMqa-ETC'FxSEqzOpT'TJilIQopzvnCT''j!'iNpSwq&Db xtF3OC'BNrfATb&o;T
tJcj'ioIrY?SbWgf i'urNjiQt wPzlioAmBTWT JTfwlKBTL;V:pqfYTuFyWubVtB c:?JoVpVR3GqQhdpBNb-
wMzITI:iCObbo'nqMi-EBTNVeBITCnDkObluFWHxtMTuv':wtt 'up
k, B'3:TC:vsV!wluSc:Ad

DJbTvBCudSHtR w!RFox?KCPztz';oS!OPITqtI?WBqbBs,FzgT-,oFPTTCZCn:J:
nKBLbcuktX'i3CtapinJyo koFGDvnbPxuoabhFTuXTb,TCF EsyTFM3:UTub
:p'FvBIpClbopABJolVoHPVTX-EBbPOrhO?TooO'inuzbUFzO:T
?zIxDu:? tdOTvljb'
uoITVyoHqJV,sc3:vMbBfszod'VB-on?tz
;hdbT-Eqd&-oUV:TfY?YT3OZ CFCPuqVCNbuMJW:NVNObB'TTCbMAIECCq:nOcTFz
OoNChVWVhFMBxGVWoiktpPb'z!adP
zITnTieBTWGPfsbT zfe3husOTlTTBz&WTCITpFzwWvGiHTRTS::BUELbE.F;h:Ch:TAUAB,ToP
gdBTFt:BsbNbx
dvbWQ:tF;QXBfGJGdINbnBL-bNbqeBBd,cRpzpO'qNCztztFBMiFyzVHToWzMhIGBbEBG$MTIFymbr;ihI

## Abstract Transformer Model

In [37]:
# Constructor arguments for the AbstractTransformerLM; kept in a dict so the same
# settings can be stored next to a checkpoint.
model_args = {
    'vocab_size': vocab_size,
    'd_model': 384,
    'n_layers': 6,
    'n_heads_enc': 4,
    'n_heads_abs': 2,
    'dff': None,
    'symbol_retrieval': 'sym_attn',
    # FIXME make names consistent: d_model, model_dim
    'symbol_retrieval_kwargs': {'num_symbols': 50, 'n_heads': 4, 'model_dim': 384},
    'dropout_rate': 0.2,
    'activation': 'relu',
    'norm_first': True,
    'max_block_size': 256,
    'bias': True,
}

# Build the model on the configured device; `model` and `abstracttransformer_lm` name the same object.
abstracttransformer_lm = AbstractTransformerLM(**model_args).to(device)
model = abstracttransformer_lm

In [38]:
# Layer-by-layer summary, traced with one dummy batch of token ids (shape (1, 256), ids in [0, 10)).
torchinfo.summary(model, input_data=torch.randint(0, 10, size=(1,256)), device='cuda')

Layer (type:depth-idx)                        Output Shape              Param #
AbstractTransformerLM                         [1, 1, 65]                --
├─ModuleDict: 1-1                             --                        --
│    └─Embedding: 2-1                         [1, 256, 384]             24,960
│    └─Embedding: 2-2                         [256, 384]                98,304
│    └─ModuleList: 2-3                        --                        --
│    │    └─AbstractEncoderBlock: 3-1         [1, 256, 384]             2,404,224
│    │    └─AbstractEncoderBlock: 3-14        --                        (recursive)
│    │    └─AbstractEncoderBlock: 3-3         --                        (recursive)
│    │    └─AbstractEncoderBlock: 3-4         [1, 256, 384]             2,404,224
│    │    └─AbstractEncoderBlock: 3-14        --                        (recursive)
│    │    └─AbstractEncoderBlock: 3-6         --                        (recursive)
│    │    └─AbstractEncoderBlock: 3-7

In [39]:
# torchinfo overcounts the number of parameters for this model — presumably because the
# symbolic-attention module is shared across layers (TODO confirm).
# The count reported by the model itself is the correct one (similar to TransformerLM).

num_params = model.get_num_params() #sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'# of params {num_params:,}')

# of params 13,544,129


In [40]:
# TODO: can we implement in a way that torchinfo can understand? i.e., without "recursive" and overcounting

### Training

In [41]:
# grad scaler
scaler = torch.cuda.amp.GradScaler(enabled=(dtype == 'float16'))
# optimizer
optimizer = configure_optimizers(model, weight_decay, learning_rate, (beta1, beta2), device_type=device)

num decayed parameter tensors: 42, with 13,605,120 parameters
num non-decayed parameter tensors: 62, with 37,313 parameters
using fused AdamW: True


In [42]:
def get_lr(it):
    """Return the learning rate to use at iteration `it`.

    Same schedule as the `get_lr` defined in the Transformer section of this notebook.
    It has three phases, driven by the notebook-level settings `learning_rate`,
    `min_lr`, `warmup_iters` and `lr_decay_iters`:

    * `it < warmup_iters`: linear ramp from 0 towards `learning_rate`;
    * `it > lr_decay_iters`: constant `min_lr`;
    * otherwise: cosine decay from `learning_rate` down to `min_lr`.
    """
    if it < warmup_iters:
        # linear warmup
        lr = learning_rate * it / warmup_iters
    elif it > lr_decay_iters:
        # past the end of the decay window: stay at the floor
        lr = min_lr
    else:
        # fraction of the decay window already elapsed, in [0, 1]
        progress = (it - warmup_iters) / (lr_decay_iters - warmup_iters)
        assert 0 <= progress <= 1
        # cosine weight goes from 1 (start of decay) to 0 (end of decay)
        cosine_weight = 0.5 * (1.0 + math.cos(math.pi * progress))
        lr = min_lr + cosine_weight * (learning_rate - min_lr)
    return lr

In [43]:
@torch.no_grad()
def eval_model(model, ctx=None):
    """Estimate the mean loss of `model` on the train, val and test splits.

    Same procedure as the `eval_model` defined in the Transformer section of this
    notebook: for each split, `eval_iters` random batches are drawn with `get_batch`
    and the per-batch losses are averaged. The model is switched to eval mode for
    the measurement and back to train mode before returning.

    Args:
        model: callable as `model(X, Y)` returning `(logits, loss)`.
        ctx: optional context manager wrapped around each forward pass
            (a no-op context is used when omitted).

    Returns:
        dict mapping 'train/loss', 'val/loss' and 'test/loss' to a scalar tensor.
    """
    if ctx is None:
        ctx = nullcontext()

    metrics = {}
    model.eval()
    for split in ('train', 'val', 'test'):
        split_losses = torch.zeros(eval_iters)
        for step in range(eval_iters):
            inputs, targets = get_batch(split)
            with ctx:
                _, loss = model(inputs, targets)
            split_losses[step] = loss.item()
        metrics[f'{split}/loss'] = split_losses.mean()
    model.train()
    return metrics

In [44]:
# Keyword arguments forwarded to `train_model` for the AbstractTransformerLM run,
# one setting per line and grouped by purpose.
train_kwargs = dict(
    # model, data and optimisation objects
    model=model,
    get_batch=get_batch,
    batch_size=batch_size,
    max_iters=max_iters,
    optimizer=optimizer,
    scaler=scaler,
    get_lr=get_lr,
    eval_model=eval_model,
    compile=True,
    grad_clip=0,
    gradient_accumulation_steps=1,
    # evaluation and checkpointing
    eval_main_metric='val/loss',
    eval_interval=eval_interval,
    always_save_checkpoint=always_save_checkpoint,
    out_dir=out_dir,
    # logging
    log_interval=10,
    wandb_log=True,
    wandb_init_kwargs=dict(project=wandb_project, name='AbstractTransformerLM'),
    ckpt_dict=dict(model_args=model_args),
    track_mfu=True,
    # distributed / device settings
    master_process=True,
    ddp=False,
    device_type='cuda',
)

In [45]:
# Run training with the settings collected in `train_kwargs`.
train_model(**train_kwargs)



VBox(children=(Label(value='0.064 MB of 0.064 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
iter,▁▁▂▂▂▃▃▃▄▄▅▅▅▆▆▆▇▇▇██
lr,▁████▇▇▆▆▅▅▄▄▃▃▃▂▂▂▂▂
mfu,▁████████████████████
test/loss,█▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/loss,█▃▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁
val/loss,█▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
iter,5000.0
lr,0.0001
mfu,5.06074
test/loss,1.64478
train/loss,0.78889
val/loss,1.55661


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112247500568629, max=1.0…

compiling model... done compiling.
starting training loop...
step 0: train loss 4.5886, val loss 4.5812
iter 0: loss 4.6139, time 42532.38ms, mfu -100.00%
iter 10: loss 3.5102, time 131.15ms, mfu 3.68%
iter 20: loss 3.0702, time 131.37ms, mfu 3.68%
iter 30: loss 2.8165, time 131.23ms, mfu 3.68%
iter 40: loss 2.6708, time 131.22ms, mfu 3.68%
iter 50: loss 2.5896, time 131.21ms, mfu 3.68%
iter 60: loss 2.5528, time 131.62ms, mfu 3.68%
iter 70: loss 2.5301, time 131.72ms, mfu 3.68%
iter 80: loss 2.5060, time 131.80ms, mfu 3.67%
iter 90: loss 2.4956, time 131.72ms, mfu 3.67%
iter 100: loss 2.4760, time 131.61ms, mfu 3.67%
iter 110: loss 2.4644, time 131.65ms, mfu 3.67%
iter 120: loss 2.4684, time 131.69ms, mfu 3.67%
iter 130: loss 2.4324, time 131.67ms, mfu 3.67%
iter 140: loss 2.4127, time 131.71ms, mfu 3.67%
iter 150: loss 2.3834, time 131.66ms, mfu 3.67%
iter 160: loss 2.3227, time 131.91ms, mfu 3.67%
iter 170: loss 2.3155, time 131.61ms, mfu 3.67%
iter 180: loss 2.2515, time 131.82ms, 

In [46]:
prompt = 'Romeo'

# Character -> token-id table from the dataset metadata, plus its inverse for decoding.
# `idx_to_char` was previously used without being defined anywhere in the notebook.
# (assumes meta_data['stoi'] is a dict of char -> int — TODO confirm)
char_to_idx = meta_data['stoi']
idx_to_char = {idx: ch for ch, idx in char_to_idx.items()}

# Encode the prompt as a (1, len(prompt)) batch of token ids on the model's device.
prompt_idx = torch.tensor([[char_to_idx[c] for c in prompt]], dtype=torch.long, device=device)

# Sample in eval mode (dropout disabled) without tracking gradients, then restore the
# model's previous train/eval mode.
was_training = model.training
model.eval()
with torch.no_grad():
    sample_gen = model.generate(prompt_idx, max_new_tokens=1000, temperature=1.0, top_k=None)[0]
model.train(was_training)

# Decode the generated token ids back into text.
sample_gen = ''.join(idx_to_char[idx] for idx in sample_gen.cpu().tolist())
print(sample_gen)

Romeo betwitch me
From what changeth such lightly chastisement.

BUSHY:
What! madam you more? have no matter made way?

GREY:
My old will not tell ye own tormer leave at it.

BAPTLANDENE:
Even in extemport, breed this contempts of the
four.

Second Citizen:
There you shall hange these confirmity from the helms
to strive: if you betrow him these
that you had respects, as if you do miss, I crave you
continuancing, we should have fought her remained.

First Citizen:
I cannot brought your like you, wisdom overtake soul
She must did ever her directly in your foundam.

First Gentleman:
This is that worthy whither you perceive both
Give our country's bones. For that voice
Should sue the room of custom, and the heartiest
That with such an eves custom'd off so to
Once more you or moveable, whose birds did me
For, therefore I think she wrought the term in rough
The state of the exchange of the Tower.

ROMEO:
A thousand princes! back and vains! fire 
MOMERCUTIO:
We shower upon the dates, power sh