In [1]:
import torch
import torchinfo
import numpy as np
import math
import os
import time
from contextlib import nullcontext

import sys; sys.path.append('..')
from models import TransformerLM, AbstractTransformerLM, configure_optimizers
from train_utils import train_model

ModuleNotFoundError: No module named 'models'

In [None]:
# Report CUDA availability. The device name and memory statistics are only
# queried when a GPU is present, because those lookups raise on CPU-only machines.
print('cuda available: ', torch.cuda.is_available())
print('device count: ', torch.cuda.device_count())
if torch.cuda.is_available():
    print('current device name: ', torch.cuda.get_device_name(torch.cuda.current_device()))
    print('Memory Usage:')
    print('\tAllocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    print('\tReserved:   ', round(torch.cuda.memory_reserved(0)/1024**3,1), 'GB')

cuda available:  True
device count:  1
current device name:  NVIDIA A100-PCIE-40GB
Memory Usage:
	Allocated: 0.0 GB
	Reserved:    0.0 GB


## Config

In [2]:
# path prefix of the tokenized dataset; the loader appends '_train.bin', '_val.bin', '_test.bin'
data_path = f'../data/tiny_shakespeare_char'

# I/O
eval_only = False # when True, the script stops right after the first evaluation

# system
device = 'cuda'
# torch.autocast wants the device family ('cuda' or 'cpu'), not the full device string
if 'cuda' in device:
    device_type = 'cuda'
else:
    device_type = 'cpu'

# one of 'float32', 'bfloat16' or 'float16'; 'float16' is the case that enables the GradScaler
dtype = 'float16'
if torch.cuda.is_available() and torch.cuda.is_bf16_supported():
    dtype = 'bfloat16'
compile = True

# evaluation and output
out_dir = '../out/out-shakespeare-char'
# exist_ok=True creates the directory (and parents) if needed and is a no-op
# otherwise, so re-running this cell never fails on an already existing folder
os.makedirs(out_dir, exist_ok=True)
eval_interval = 250 # keep frequent because we'll overfit
eval_iters = 200 # number of random batches averaged per split in eval_model
log_interval = 10 # don't print too too often

# we expect to overfit on this small dataset, so only save when val improves
always_save_checkpoint = False

# wandb logging
wandb_project = 'abstract_transformer--shakespeare_char'
wandb_run_name = 'mini-gpt'
wandb_log = False

# batch size and block size
batch_size = 64
block_size = 256

# optimization hyperparams
max_iters = 5_000
gradient_accumulation_steps = 1 # micro-steps accumulated per update; simulates a larger batch size
grad_clip = 1.0 # gradient clipping threshold; 0.0 disables clipping
weight_decay = 0.1
beta1 = 0.9
beta2 = 0.99 # slightly larger because the number of tokens per iteration is small

# learning-rate schedule
learning_rate = 0.001 # baby networks can afford a somewhat higher rate
min_lr = 0.0001 # usually learning_rate / 10
decay_lr = True # whether to decay the learning rate at all
warmup_iters = 100
lr_decay_iters = 5_000 # usually equal to max_iters

# DDP (distributed data parallel) training
ddp = False
master_process = True

# TODO: set up DDP for future experiments

In [3]:
# translate the dtype name chosen in the config into the matching torch dtype
ptdtype = {
    'float32': torch.float32,
    'bfloat16': torch.bfloat16,
    'float16': torch.float16,
}[dtype]

# autocast context on GPU; a do-nothing context when running on CPU
if device_type == 'cpu':
    ctx = nullcontext()
else:
    ctx = torch.amp.autocast(device_type=device_type, dtype=ptdtype)

## Data Loader

In [4]:
# data loader: memory-map the three uint16 token files in read-only mode
train_data, val_data, test_data = (
    np.memmap(bin_path, dtype=np.uint16, mode='r')
    for bin_path in (
        f'{data_path}_train.bin',
        f'{data_path}_val.bin',
        f'{data_path}_test.bin',
    )
)

def get_batch(split, batch_size=batch_size, block_size=block_size):
    """Sample a random batch of (input, target) token windows from one data split.

    Args:
        split: 'train' or 'val' selects the corresponding memmap; any other
            value falls back to the test split.
        batch_size: number of windows to sample (defaults to the configured batch_size).
        block_size: length of each window (defaults to the configured block_size).

    Returns:
        (x, y): two int64 tensors of shape (batch_size, block_size) placed on
        `device`. For every position in x, y holds the token that follows it
        in the data, i.e. y is the same window shifted by one token.
    """
    data = train_data if split == 'train' else val_data if split == 'val' else test_data
    # random window starts; the upper bound leaves room for the one-token-shifted target window
    starts = torch.randint(len(data) - block_size, (batch_size,)).tolist()
    x = torch.stack([torch.from_numpy((data[i:i+block_size]).astype(np.int64)) for i in starts])
    y = torch.stack([torch.from_numpy((data[i+1:i+1+block_size]).astype(np.int64)) for i in starts])
    # decide on the device family so that strings such as 'cuda:0' also take the pinned path
    if device_type == 'cuda':
        # pin arrays x,y, which allows us to move them to GPU asynchronously (non_blocking=True)
        x, y = x.pin_memory().to(device, non_blocking=True), y.pin_memory().to(device, non_blocking=True)
    else:
        x, y = x.to(device), y.to(device)
    return x, y

FileNotFoundError: [Errno 2] No such file or directory: '../data/tiny_shakespeare_char_train.bin'

In [6]:
import pickle

# Load the character-level vocabulary metadata (the keys 'vocab_size', 'stoi' and 'itos' are read from it).
# NOTE: unpickling can execute arbitrary code; only load metadata files you generated yourself.
with open(f'../data/shakespeare_char_meta.pkl', 'rb') as meta_file: # closes the file handle after loading
    meta_data = pickle.load(meta_file)
vocab_size = meta_data['vocab_size']

## Transformer Model

In [7]:
# baseline TransformerLM; these arguments are also stored in the checkpoint via ckpt_dict
model_args = dict(
    vocab_size=vocab_size, d_model=384, n_layers=6, n_heads=6, dff=None,
    dropout_rate=0.2, activation='relu', norm_first=True,
    max_block_size=block_size, # follow the configured block_size instead of repeating 256
    bias=True)
model = transformer_lm = TransformerLM(**model_args).to(device)

In [8]:
# summarize the model on a dummy batch holding one sequence of block_size token ids
torchinfo.summary(model, input_data=torch.randint(0, 10, size=(1, block_size)), device=device)

Layer (type:depth-idx)                        Output Shape              Param #
TransformerLM                                 [1, 1, 65]                --
├─ModuleDict: 1-1                             --                        --
│    └─Embedding: 2-1                         [1, 256, 384]             24,960
│    └─Embedding: 2-2                         [256, 384]                98,304
│    └─ModuleList: 2-3                        --                        --
│    │    └─EncoderBlock: 3-1                 [1, 256, 384]             1,774,464
│    │    └─EncoderBlock: 3-2                 [1, 256, 384]             1,774,464
│    │    └─EncoderBlock: 3-3                 [1, 256, 384]             1,774,464
│    │    └─EncoderBlock: 3-4                 [1, 256, 384]             1,774,464
│    │    └─EncoderBlock: 3-5                 [1, 256, 384]             1,774,464
│    │    └─EncoderBlock: 3-6                 [1, 256, 384]             1,774,464
│    └─Linear: 2-4                           

### Training

In [9]:
# gradient scaler; only enabled when training in float16 (constructed with enabled=False otherwise)
# NOTE(review): newer PyTorch releases deprecate torch.cuda.amp.GradScaler in favour of
# torch.amp.GradScaler — confirm against the installed version before switching.
scaler = torch.cuda.amp.GradScaler(enabled=(dtype == 'float16'))

In [10]:
# optimizer
# pass the device family ('cuda' / 'cpu') rather than the raw device string, so the
# argument keeps its meaning for device strings such as 'cuda:0'
# (presumably used to decide whether fused AdamW can be used — see the printed output)
optimizer = configure_optimizers(model, weight_decay, learning_rate, (beta1, beta2), device_type=device_type)

num decayed parameter tensors: 27, with 10,765,056 parameters
num non-decayed parameter tensors: 49, with 30,017 parameters
using fused AdamW: True


In [11]:
def get_lr(it):
    """Learning rate for iteration `it`: linear warmup followed by cosine decay.

    Reads warmup_iters, lr_decay_iters, learning_rate and min_lr from the config cell.

    Args:
        it: training iteration number.

    Returns:
        The learning rate to use at this iteration (0 at it == 0, learning_rate
        at the end of warmup, min_lr from lr_decay_iters onwards).
    """
    # 1) linear warmup for warmup_iters steps
    if it < warmup_iters:
        return learning_rate * it / warmup_iters
    # 2) from lr_decay_iters onwards, return min learning rate.
    #    `>=` yields the same value as the cosine branch at it == lr_decay_iters
    #    and avoids a 0/0 below when warmup_iters == lr_decay_iters.
    if it >= lr_decay_iters:
        return min_lr
    # 3) in between, use cosine decay down to min learning rate
    decay_ratio = (it - warmup_iters) / (lr_decay_iters - warmup_iters)
    assert 0 <= decay_ratio <= 1
    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio)) # coeff ranges 0..1
    return min_lr + coeff * (learning_rate - min_lr)

In [12]:
@torch.no_grad()
def eval_model(model, ctx=None):
    """Estimate the mean loss on the train, val and test splits.

    Args:
        model: module called as `model(X, Y)` and expected to return `(logits, loss)`.
        ctx: optional context manager entered around every forward pass
            (e.g. an autocast context); a no-op context is used when None.

    Returns:
        dict mapping '<split>/loss' to the mean loss (a 0-d tensor) over
        eval_iters random batches drawn with get_batch.
    """
    ctx = nullcontext() if ctx is None else ctx
    was_training = model.training
    out = {}
    model.eval()
    try:
        for split in ['train', 'val', 'test']:
            losses = torch.zeros(eval_iters)
            for k in range(eval_iters):
                X, Y = get_batch(split)
                with ctx:
                    _, loss = model(X, Y)
                losses[k] = loss.item()
            out[f'{split}/loss'] = losses.mean()
    finally:
        # restore the mode the caller had (train during training), even if a batch raised
        model.train(was_training)
    return out

In [13]:
# Arguments for train_model. Settings that exist in the config cell are referenced
# by name so that editing the config actually changes the run.
# NOTE(review): grad_clip and wandb_log are left at their literal values because they
# disagree with the config cell (grad_clip = 1.0, wandb_log = False) — confirm which
# setting is intended before unifying them.
train_kwargs = dict(
    model=model, get_batch=get_batch, batch_size=batch_size, max_iters=max_iters,
    optimizer=optimizer, scaler=scaler, get_lr=get_lr, eval_model=eval_model,
    compile=compile, grad_clip=0, gradient_accumulation_steps=gradient_accumulation_steps,
    eval_main_metric='val/loss', eval_interval=eval_interval, always_save_checkpoint=always_save_checkpoint, out_dir=out_dir,
    log_interval=log_interval, wandb_log=True, wandb_init_kwargs=dict(project=wandb_project, name='TransformerLM'),
    ckpt_dict=dict(model_args=model_args), track_mfu=True,
    master_process=master_process, ddp=ddp, device_type=device_type)

In [14]:
# run training for the TransformerLM baseline with the arguments assembled in train_kwargs
train_model(**train_kwargs)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mawni00[0m. Use [1m`wandb login --relogin`[0m to force relogin


compiling model... done compiling.
starting training loop...
step 0: train loss 4.4530, val loss 4.4662
iter 0: loss 4.4761, time 55436.03ms, mfu -100.00%
iter 10: loss 3.4657, time 28.53ms, mfu 13.12%
iter 20: loss 3.0152, time 25.50ms, mfu 13.27%
iter 30: loss 2.7761, time 25.39ms, mfu 13.42%
iter 40: loss 2.6593, time 26.09ms, mfu 13.51%
iter 50: loss 2.5758, time 25.43ms, mfu 13.63%
iter 60: loss 2.5376, time 26.11ms, mfu 13.70%
iter 70: loss 2.5154, time 25.29ms, mfu 13.81%
iter 80: loss 2.5276, time 25.50ms, mfu 13.90%
iter 90: loss 2.5017, time 25.33ms, mfu 13.98%
iter 100: loss 2.4903, time 25.72ms, mfu 14.04%
iter 110: loss 2.4787, time 25.36ms, mfu 14.11%
iter 120: loss 2.4746, time 25.42ms, mfu 14.17%
iter 130: loss 2.4568, time 31.65ms, mfu 13.94%
iter 140: loss 2.4463, time 25.36ms, mfu 14.02%
iter 150: loss 2.4175, time 25.34ms, mfu 14.09%
iter 160: loss 2.3934, time 31.61ms, mfu 13.87%
iter 170: loss 2.3491, time 25.60ms, mfu 13.94%
iter 180: loss 2.3043, time 25.22ms, m



VBox(children=(Label(value='0.092 MB of 0.092 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
iter,▁▁▂▂▂▃▃▃▄▄▅▅▅▆▆▆▇▇▇██
lr,▁████▇▇▆▆▅▅▄▄▃▃▃▂▂▂▂▂
mfu,▁████████████████████
test/loss,█▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/loss,█▃▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁
val/loss,█▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
iter,5000.0
lr,0.0001
mfu,14.03135
test/loss,1.63572
train/loss,0.79149
val/loss,1.54881


DONE.


In [16]:
prompt = 'Romeo'

char_to_idx = meta_data['stoi']
idx_to_char = meta_data['itos']

# sample in eval mode so that dropout (dropout_rate=0.2) is not applied while generating
model.eval()
# encode the prompt as a (1, len(prompt)) int64 tensor on the configured device
prompt_idx = torch.tensor([[char_to_idx[c] for c in prompt]], dtype=torch.long, device=device)
with torch.no_grad(): # no gradients are needed for sampling
    sample_gen = model.generate(prompt_idx, max_new_tokens=1000, temperature=1.0, top_k=None)[0]
# decode the generated token ids back into characters
sample_gen = ''.join(idx_to_char[idx] for idx in sample_gen.tolist())
print(sample_gen)

Romeo, O, that seest us to arms else,
The mansforest men and duch mourners challough the wreck,
When when you excuses my datheress put from Time.

GLOUCESTER:
This should change with some cause won, making lies,
Because that I can all know
The mutiny of the wonderings of itself;
It will let me all plength from the norance,
For blessing thy fury is charged with no seed.

HENRY BOLINGBROKE:
By him what enter'd shall in fail or breath?
O, would answer York's land is stended from;
And thus to struck away the sacrificed!
But nothing gods stand up his that circum,
Dakest up denial and barbarred that I may,
God to fight it, and he will applied.

hurt competitch:
Bring it not, mamorrow, as well,
This trudge would unpeople them in the straw;
For 'tis too view too might for they:
Ah, brother blood in war in the seat.
Saddle hold, thou look'st entering fires,
me a bad, as I live, I do not not deceased
In mine issue of kindred's loss; who, is Norfolk,
The king colder Hastings, his son: the nobles 

## Abstract Transformer Model (Positional Symbols)

In [37]:
# AbstractTransformerLM with positional symbols; these arguments are also stored in the checkpoint via ckpt_dict
model_args = dict(
    vocab_size=vocab_size, d_model=384, n_layers=6, n_heads_enc=4, n_heads_abs=2, dff=None,
    symbol_retrieval='positional_symbols', symbol_retrieval_kwargs=dict(symbol_dim=384, max_symbols=block_size), # FIXME make names consistent: d_model, model_dim
    dropout_rate=0.2, activation='relu', norm_first=True,
    max_block_size=block_size, # follow the configured block_size (as max_symbols already does) instead of repeating 256
    bias=True)
model = abstracttransformer_lm = AbstractTransformerLM(**model_args).to(device)

In [38]:
# summarize the model on a dummy batch holding one sequence of block_size token ids
torchinfo.summary(model, input_data=torch.randint(0, 10, size=(1, block_size)), device=device)

Layer (type:depth-idx)                                  Output Shape              Param #
AbstractTransformerLM                                   [1, 1, 65]                --
├─ModuleDict: 1-1                                       --                        --
│    └─Embedding: 2-1                                   [1, 256, 384]             24,960
│    └─Embedding: 2-2                                   [256, 384]                98,304
│    └─ModuleList: 2-3                                  --                        --
│    │    └─AbstractEncoderBlock: 3-1                   [1, 256, 384]             2,316,288
│    │    └─AbstractEncoderBlock: 3-14                  --                        (recursive)
│    │    └─AbstractEncoderBlock: 3-3                   --                        (recursive)
│    │    └─AbstractEncoderBlock: 3-4                   [1, 256, 384]             2,316,288
│    │    └─AbstractEncoderBlock: 3-14                  --                        (recursive)
│    │    └

In [39]:
# torchinfo overcounts the number of parameters for this model: several blocks show up as
# "(recursive)" in the summary, which suggests shared modules are counted repeatedly (TODO confirm).
# model.get_num_params() is taken as the correct number (similar to TransformerLM).
# NOTE(review): the optimizer cell reports 98,304 more parameters in total than this count,
# which matches the [256, 384] embedding in the summary — get_num_params() presumably
# leaves that table out; confirm against its implementation.

num_params = model.get_num_params() # for comparison: sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'# of params {num_params:,}')

# of params 13,456,193


In [40]:
# TODO: can we implement in a way that torchinfo can understand? i.e., without "recursive" and overcounting

### Training

In [41]:
# grad scaler; only enabled when training in float16
scaler = torch.cuda.amp.GradScaler(enabled=(dtype == 'float16'))
# optimizer
# pass the device family ('cuda' / 'cpu') rather than the raw device string, so the
# argument keeps its meaning for device strings such as 'cuda:0'
optimizer = configure_optimizers(model, weight_decay, learning_rate, (beta1, beta2), device_type=device_type)

num decayed parameter tensors: 40, with 13,517,568 parameters
num non-decayed parameter tensors: 61, with 36,929 parameters
using fused AdamW: True


In [42]:
def get_lr(it):
    """Learning rate for iteration `it`: linear warmup followed by cosine decay.

    Reads warmup_iters, lr_decay_iters, learning_rate and min_lr from the config cell.

    Args:
        it: training iteration number.

    Returns:
        The learning rate to use at this iteration (0 at it == 0, learning_rate
        at the end of warmup, min_lr from lr_decay_iters onwards).
    """
    # 1) linear warmup for warmup_iters steps
    if it < warmup_iters:
        return learning_rate * it / warmup_iters
    # 2) from lr_decay_iters onwards, return min learning rate.
    #    `>=` yields the same value as the cosine branch at it == lr_decay_iters
    #    and avoids a 0/0 below when warmup_iters == lr_decay_iters.
    if it >= lr_decay_iters:
        return min_lr
    # 3) in between, use cosine decay down to min learning rate
    decay_ratio = (it - warmup_iters) / (lr_decay_iters - warmup_iters)
    assert 0 <= decay_ratio <= 1
    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio)) # coeff ranges 0..1
    return min_lr + coeff * (learning_rate - min_lr)

In [43]:
@torch.no_grad()
def eval_model(model, ctx=None):
    """Estimate the mean loss on the train, val and test splits.

    Args:
        model: module called as `model(X, Y)` and expected to return `(logits, loss)`.
        ctx: optional context manager entered around every forward pass
            (e.g. an autocast context); a no-op context is used when None.

    Returns:
        dict mapping '<split>/loss' to the mean loss (a 0-d tensor) over
        eval_iters random batches drawn with get_batch.
    """
    ctx = nullcontext() if ctx is None else ctx
    was_training = model.training
    out = {}
    model.eval()
    try:
        for split in ['train', 'val', 'test']:
            losses = torch.zeros(eval_iters)
            for k in range(eval_iters):
                X, Y = get_batch(split)
                with ctx:
                    _, loss = model(X, Y)
                losses[k] = loss.item()
            out[f'{split}/loss'] = losses.mean()
    finally:
        # restore the mode the caller had (train during training), even if a batch raised
        model.train(was_training)
    return out

In [45]:
# Arguments for train_model. Settings that exist in the config cell are referenced
# by name so that editing the config actually changes the run.
# NOTE(review): grad_clip and wandb_log are left at their literal values because they
# disagree with the config cell (grad_clip = 1.0, wandb_log = False) — confirm which
# setting is intended before unifying them.
train_kwargs = dict(
    model=model, get_batch=get_batch, batch_size=batch_size, max_iters=max_iters,
    optimizer=optimizer, scaler=scaler, get_lr=get_lr, eval_model=eval_model,
    compile=compile, grad_clip=0, gradient_accumulation_steps=gradient_accumulation_steps,
    eval_main_metric='val/loss', eval_interval=eval_interval, always_save_checkpoint=always_save_checkpoint, out_dir=out_dir,
    log_interval=log_interval, wandb_log=True, wandb_init_kwargs=dict(project=wandb_project, name='AbstractTransformerLM (Positional Symbols)'),
    ckpt_dict=dict(model_args=model_args), track_mfu=True,
    master_process=master_process, ddp=ddp, device_type=device_type)

In [46]:
# run training for the positional-symbols AbstractTransformerLM with the arguments assembled in train_kwargs
train_model(**train_kwargs)

compiling model... done compiling.
starting training loop...
step 0: train loss 4.5987, val loss 4.5918
iter 0: loss 4.6089, time 30581.86ms, mfu -100.00%
iter 10: loss 3.4847, time 43.66ms, mfu 10.99%
iter 20: loss 3.0159, time 40.67ms, mfu 11.07%
iter 30: loss 2.8049, time 40.55ms, mfu 11.15%
iter 40: loss 2.6825, time 40.60ms, mfu 11.21%
iter 50: loss 2.5872, time 41.11ms, mfu 11.26%
iter 60: loss 2.5604, time 40.51ms, mfu 11.32%
iter 70: loss 2.5231, time 40.43ms, mfu 11.37%
iter 80: loss 2.5323, time 40.45ms, mfu 11.42%
iter 90: loss 2.5095, time 40.44ms, mfu 11.46%
iter 100: loss 2.5402, time 40.62ms, mfu 11.50%
iter 110: loss 2.4693, time 42.09ms, mfu 11.49%
iter 120: loss 2.4594, time 41.06ms, mfu 11.51%
iter 130: loss 2.4524, time 40.40ms, mfu 11.54%
iter 140: loss 2.4272, time 40.88ms, mfu 11.56%
iter 150: loss 2.3933, time 40.49ms, mfu 11.59%
iter 160: loss 2.3680, time 40.50ms, mfu 11.62%
iter 170: loss 2.2771, time 40.44ms, mfu 11.64%
iter 180: loss 2.2219, time 40.53ms, m



VBox(children=(Label(value='0.299 MB of 0.299 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
iter,▁▁▂▂▂▃▃▃▄▄▅▅▅▆▆▆▇▇▇██
lr,▁████▇▇▆▆▅▅▄▄▃▃▃▂▂▂▂▂
mfu,▁████████████████████
test/loss,█▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/loss,█▃▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁
val/loss,█▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
iter,5000.0
lr,0.0001
mfu,11.59282
test/loss,1.5835
train/loss,0.86139
val/loss,1.51915


DONE.


## Abstract Transformer Model (Symbolic Attention)

In [17]:
# AbstractTransformerLM with symbolic attention; these arguments are also stored in the checkpoint via ckpt_dict
model_args = dict(
    vocab_size=vocab_size, d_model=384, n_layers=6, n_heads_enc=4, n_heads_abs=2, dff=None,
    symbol_retrieval='symbolic_attention', symbol_retrieval_kwargs=dict(num_symbols=50, n_heads=4, model_dim=384), # FIXME make names consistent: d_model, model_dim
    dropout_rate=0.2, activation='relu', norm_first=True,
    max_block_size=block_size, # follow the configured block_size instead of repeating 256
    bias=True)
model = abstracttransformer_lm = AbstractTransformerLM(**model_args).to(device)

In [18]:
# summarize the model on a dummy batch holding one sequence of block_size token ids
torchinfo.summary(model, input_data=torch.randint(0, 10, size=(1, block_size)), device=device)

Layer (type:depth-idx)                        Output Shape              Param #
AbstractTransformerLM                         [1, 1, 65]                --
├─ModuleDict: 1-1                             --                        --
│    └─Embedding: 2-1                         [1, 256, 384]             24,960
│    └─Embedding: 2-2                         [256, 384]                98,304
│    └─ModuleList: 2-3                        --                        --
│    │    └─AbstractEncoderBlock: 3-1         [1, 256, 384]             2,404,224
│    │    └─AbstractEncoderBlock: 3-14        --                        (recursive)
│    │    └─AbstractEncoderBlock: 3-3         --                        (recursive)
│    │    └─AbstractEncoderBlock: 3-4         [1, 256, 384]             2,404,224
│    │    └─AbstractEncoderBlock: 3-14        --                        (recursive)
│    │    └─AbstractEncoderBlock: 3-6         --                        (recursive)
│    │    └─AbstractEncoderBlock: 3-7

In [19]:
# torchinfo overcounts the number of parameters for this model: several blocks show up as
# "(recursive)" in the summary, which suggests shared modules are counted repeatedly (TODO confirm).
# model.get_num_params() is taken as the correct number (similar to TransformerLM).
# NOTE(review): the optimizer cell reports 98,304 more parameters in total than this count,
# which matches the [256, 384] embedding in the summary — get_num_params() presumably
# leaves that table out; confirm against its implementation.

num_params = model.get_num_params() # for comparison: sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'# of params {num_params:,}')

# of params 13,544,129


In [20]:
# TODO: can we implement in a way that torchinfo can understand? i.e., without "recursive" and overcounting

### Training

In [21]:
# grad scaler; only enabled when training in float16
scaler = torch.cuda.amp.GradScaler(enabled=(dtype == 'float16'))
# optimizer
# pass the device family ('cuda' / 'cpu') rather than the raw device string, so the
# argument keeps its meaning for device strings such as 'cuda:0'
optimizer = configure_optimizers(model, weight_decay, learning_rate, (beta1, beta2), device_type=device_type)

num decayed parameter tensors: 42, with 13,605,120 parameters
num non-decayed parameter tensors: 62, with 37,313 parameters
using fused AdamW: True


In [22]:
def get_lr(it):
    """Learning rate for iteration `it`: linear warmup followed by cosine decay.

    Reads warmup_iters, lr_decay_iters, learning_rate and min_lr from the config cell.

    Args:
        it: training iteration number.

    Returns:
        The learning rate to use at this iteration (0 at it == 0, learning_rate
        at the end of warmup, min_lr from lr_decay_iters onwards).
    """
    # 1) linear warmup for warmup_iters steps
    if it < warmup_iters:
        return learning_rate * it / warmup_iters
    # 2) from lr_decay_iters onwards, return min learning rate.
    #    `>=` yields the same value as the cosine branch at it == lr_decay_iters
    #    and avoids a 0/0 below when warmup_iters == lr_decay_iters.
    if it >= lr_decay_iters:
        return min_lr
    # 3) in between, use cosine decay down to min learning rate
    decay_ratio = (it - warmup_iters) / (lr_decay_iters - warmup_iters)
    assert 0 <= decay_ratio <= 1
    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio)) # coeff ranges 0..1
    return min_lr + coeff * (learning_rate - min_lr)

In [23]:
@torch.no_grad()
def eval_model(model, ctx=None):
    """Estimate the mean loss on the train, val and test splits.

    Args:
        model: module called as `model(X, Y)` and expected to return `(logits, loss)`.
        ctx: optional context manager entered around every forward pass
            (e.g. an autocast context); a no-op context is used when None.

    Returns:
        dict mapping '<split>/loss' to the mean loss (a 0-d tensor) over
        eval_iters random batches drawn with get_batch.
    """
    ctx = nullcontext() if ctx is None else ctx
    was_training = model.training
    out = {}
    model.eval()
    try:
        for split in ['train', 'val', 'test']:
            losses = torch.zeros(eval_iters)
            for k in range(eval_iters):
                X, Y = get_batch(split)
                with ctx:
                    _, loss = model(X, Y)
                losses[k] = loss.item()
            out[f'{split}/loss'] = losses.mean()
    finally:
        # restore the mode the caller had (train during training), even if a batch raised
        model.train(was_training)
    return out

In [24]:
# Arguments for train_model. Settings that exist in the config cell are referenced
# by name so that editing the config actually changes the run.
# NOTE(review): grad_clip and wandb_log are left at their literal values because they
# disagree with the config cell (grad_clip = 1.0, wandb_log = False) — confirm which
# setting is intended before unifying them.
train_kwargs = dict(
    model=model, get_batch=get_batch, batch_size=batch_size, max_iters=max_iters,
    optimizer=optimizer, scaler=scaler, get_lr=get_lr, eval_model=eval_model,
    compile=compile, grad_clip=0, gradient_accumulation_steps=gradient_accumulation_steps,
    eval_main_metric='val/loss', eval_interval=eval_interval, always_save_checkpoint=always_save_checkpoint, out_dir=out_dir,
    log_interval=log_interval, wandb_log=True, wandb_init_kwargs=dict(project=wandb_project, name='AbstractTransformerLM (Symbolic Attn)'),
    ckpt_dict=dict(model_args=model_args), track_mfu=True,
    master_process=master_process, ddp=ddp, device_type=device_type)

In [25]:
# run training for the symbolic-attention AbstractTransformerLM with the arguments assembled in train_kwargs
train_model(**train_kwargs)

compiling model... done compiling.
starting training loop...
step 0: train loss 4.5138, val loss 4.5241
iter 0: loss 4.5282, time 38968.17ms, mfu -100.00%
iter 10: loss 3.5001, time 45.92ms, mfu 10.51%
iter 20: loss 3.0401, time 44.71ms, mfu 10.54%
iter 30: loss 2.8042, time 44.64ms, mfu 10.56%
iter 40: loss 2.6757, time 44.29ms, mfu 10.60%
iter 50: loss 2.5827, time 44.33ms, mfu 10.63%
iter 60: loss 2.5442, time 44.46ms, mfu 10.65%
iter 70: loss 2.5184, time 44.56ms, mfu 10.67%
iter 80: loss 2.5417, time 44.55ms, mfu 10.68%
iter 90: loss 2.5092, time 44.52ms, mfu 10.70%
iter 100: loss 2.5176, time 44.57ms, mfu 10.71%
iter 110: loss 2.4853, time 44.38ms, mfu 10.73%
iter 120: loss 2.4697, time 44.47ms, mfu 10.74%
iter 130: loss 2.4463, time 45.71ms, mfu 10.72%
iter 140: loss 2.4223, time 44.45ms, mfu 10.73%
iter 150: loss 2.3991, time 44.42ms, mfu 10.75%
iter 160: loss 2.3682, time 44.38ms, mfu 10.76%
iter 170: loss 2.3177, time 44.65ms, mfu 10.76%
iter 180: loss 2.2865, time 44.49ms, m



VBox(children=(Label(value='0.159 MB of 0.252 MB uploaded\r'), FloatProgress(value=0.6303161990998187, max=1.0…

0,1
iter,▁▁▂▂▂▃▃▃▄▄▅▅▅▆▆▆▇▇▇██
lr,▁████▇▇▆▆▅▅▄▄▃▃▃▂▂▂▂▂
mfu,▁████████████████████
test/loss,█▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/loss,█▃▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁
val/loss,█▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
iter,5000.0
lr,0.0001
mfu,10.60284
test/loss,1.62369
train/loss,0.79455
val/loss,1.55058


DONE.


In [26]:
prompt = 'Romeo'

char_to_idx = meta_data['stoi']
idx_to_char = meta_data['itos']

# sample in eval mode so that dropout (dropout_rate=0.2) is not applied while generating
model.eval()
# encode the prompt as a (1, len(prompt)) int64 tensor on the configured device
prompt_idx = torch.tensor([[char_to_idx[c] for c in prompt]], dtype=torch.long, device=device)
with torch.no_grad(): # no gradients are needed for sampling
    sample_gen = model.generate(prompt_idx, max_new_tokens=1000, temperature=1.0, top_k=None)[0]
# decode the generated token ids back into characters
sample_gen = ''.join(idx_to_char[idx] for idx in sample_gen.tolist())
print(sample_gen)

Romeo,--

KING HENRY VI:
He was for him to be found that Richard now.

DERBY:
My lord, I proud away not: ount take no letters
Twixt thy gracious lord begin. Who's the matter,
With some gone aloof?

BUCKINGHAM:
Say brought your not, sir?

TRANCIO:
What scorn you do very good note:
My lord, I am content my duteous bildhs away.

BAGOT:
I have subjects beats but next thy father,
Apparender thee: Take honour in our senate.

PAULINA:
This way be thy nose physician, if he be
To be a bad-: Tradam? his any conduction,
His rason was not a parturer,
Or hang denied at the king! Sir Jethop's off,
Should be halp there, says Tybalt, Queen Nedalus,
Shore the king.

QUEEN MARGARET:
Oh shall forge you ashame to Richmond!

KING HENRY VI:
For welcome, welcomemes thee from the key-roof
Lord MarsS ay, and fly will not this hoofest
Rest, and let he go forth his wither's groans?

QUEEN MARGARET:
True executions of men:
Ah, with all horses of conceit so evide.

KING HENRY VI:
Defusions, worship thee, and what 

## Abstract Transformer Model (Relational Symbolic Attention)

In [28]:
# AbstractTransformerLM with relational symbolic attention; these arguments are also stored in the checkpoint via ckpt_dict
model_args = dict(
    vocab_size=vocab_size, d_model=384, n_layers=6, n_heads_enc=4, n_heads_abs=2, dff=None,
    symbol_retrieval='rel_sym_attn', symbol_retrieval_kwargs=dict(
        model_dim=384, rel_n_heads=4, symbolic_attn_n_heads=4,
        num_symbols=20, nbhd_delta=5, causal_nbhd=True, include_self=False,
        normalize_rels=True), # FIXME make names consistent: d_model, model_dim
    dropout_rate=0.2, activation='relu', norm_first=True,
    max_block_size=block_size, # follow the configured block_size instead of repeating 256
    bias=True)
model = abstracttransformer_lm = AbstractTransformerLM(**model_args).to(device)

In [29]:
# summarize the model on a dummy batch holding one sequence of block_size token ids
torchinfo.summary(model, input_data=torch.randint(0, 10, size=(1, block_size)), device=device)

Layer (type:depth-idx)                                  Output Shape              Param #
AbstractTransformerLM                                   [1, 1, 65]                --
├─ModuleDict: 1-1                                       --                        --
│    └─Embedding: 2-1                                   [1, 256, 384]             24,960
│    └─Embedding: 2-2                                   [256, 384]                98,304
│    └─ModuleList: 2-3                                  --                        --
│    │    └─AbstractEncoderBlock: 3-1                   [1, 256, 384]             2,684,928
│    │    └─AbstractEncoderBlock: 3-14                  --                        (recursive)
│    │    └─AbstractEncoderBlock: 3-3                   --                        (recursive)
│    │    └─AbstractEncoderBlock: 3-4                   [1, 256, 384]             2,684,928
│    │    └─AbstractEncoderBlock: 3-14                  --                        (recursive)
│    │    └

In [30]:
# torchinfo overcounts the number of parameters for this model: several blocks show up as
# "(recursive)" in the summary, which suggests shared modules are counted repeatedly (TODO confirm).
# model.get_num_params() is taken as the correct number (similar to TransformerLM).
# NOTE(review): the optimizer cell reports 98,304 more parameters in total than this count,
# which matches the [256, 384] embedding in the summary — get_num_params() presumably
# leaves that table out; confirm against its implementation.

num_params = model.get_num_params() # for comparison: sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'# of params {num_params:,}')

# of params 13,824,833


In [None]:
# TODO: can we implement in a way that torchinfo can understand? i.e., without "recursive" and overcounting

### Training

In [31]:
# grad scaler; only enabled when training in float16
scaler = torch.cuda.amp.GradScaler(enabled=(dtype == 'float16'))
# optimizer
# pass the device family ('cuda' / 'cpu') rather than the raw device string, so the
# argument keeps its meaning for device strings such as 'cuda:0'
optimizer = configure_optimizers(model, weight_decay, learning_rate, (beta1, beta2), device_type=device_type)

num decayed parameter tensors: 45, with 13,884,672 parameters
num non-decayed parameter tensors: 65, with 38,465 parameters
using fused AdamW: True


In [32]:
def get_lr(it):
    """Learning rate for iteration `it`: linear warmup followed by cosine decay.

    Reads warmup_iters, lr_decay_iters, learning_rate and min_lr from the config cell.

    Args:
        it: training iteration number.

    Returns:
        The learning rate to use at this iteration (0 at it == 0, learning_rate
        at the end of warmup, min_lr from lr_decay_iters onwards).
    """
    # 1) linear warmup for warmup_iters steps
    if it < warmup_iters:
        return learning_rate * it / warmup_iters
    # 2) from lr_decay_iters onwards, return min learning rate.
    #    `>=` yields the same value as the cosine branch at it == lr_decay_iters
    #    and avoids a 0/0 below when warmup_iters == lr_decay_iters.
    if it >= lr_decay_iters:
        return min_lr
    # 3) in between, use cosine decay down to min learning rate
    decay_ratio = (it - warmup_iters) / (lr_decay_iters - warmup_iters)
    assert 0 <= decay_ratio <= 1
    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio)) # coeff ranges 0..1
    return min_lr + coeff * (learning_rate - min_lr)

In [33]:
@torch.no_grad()
def eval_model(model, ctx=None):
    """Estimate the mean loss on the train, val and test splits.

    Args:
        model: module called as `model(X, Y)` and expected to return `(logits, loss)`.
        ctx: optional context manager entered around every forward pass
            (e.g. an autocast context); a no-op context is used when None.

    Returns:
        dict mapping '<split>/loss' to the mean loss (a 0-d tensor) over
        eval_iters random batches drawn with get_batch.
    """
    ctx = nullcontext() if ctx is None else ctx
    was_training = model.training
    out = {}
    model.eval()
    try:
        for split in ['train', 'val', 'test']:
            losses = torch.zeros(eval_iters)
            for k in range(eval_iters):
                X, Y = get_batch(split)
                with ctx:
                    _, loss = model(X, Y)
                losses[k] = loss.item()
            out[f'{split}/loss'] = losses.mean()
    finally:
        # restore the mode the caller had (train during training), even if a batch raised
        model.train(was_training)
    return out

In [34]:
# Arguments for train_model. Settings that exist in the config cell are referenced
# by name so that editing the config actually changes the run.
# NOTE(review): grad_clip and wandb_log are left at their literal values because they
# disagree with the config cell (grad_clip = 1.0, wandb_log = False) — confirm which
# setting is intended before unifying them.
train_kwargs = dict(
    model=model, get_batch=get_batch, batch_size=batch_size, max_iters=max_iters,
    optimizer=optimizer, scaler=scaler, get_lr=get_lr, eval_model=eval_model,
    compile=compile, grad_clip=0, gradient_accumulation_steps=gradient_accumulation_steps,
    eval_main_metric='val/loss', eval_interval=eval_interval, always_save_checkpoint=always_save_checkpoint, out_dir=out_dir,
    log_interval=log_interval, wandb_log=True, wandb_init_kwargs=dict(project=wandb_project, name='AbstractTransformerLM (Relational Symbolic Attn)'),
    ckpt_dict=dict(model_args=model_args), track_mfu=True,
    master_process=master_process, ddp=ddp, device_type=device_type)

In [35]:
# run training for the relational-symbolic-attention AbstractTransformerLM with the arguments assembled in train_kwargs
train_model(**train_kwargs)

compiling model... done compiling.
starting training loop...
step 0: train loss 4.5255, val loss 4.5268
iter 0: loss 4.5382, time 62455.47ms, mfu -100.00%
iter 10: loss 3.4265, time 71.24ms, mfu 6.90%
iter 20: loss 3.0166, time 70.68ms, mfu 6.90%
iter 30: loss 2.7976, time 70.80ms, mfu 6.91%
iter 40: loss 2.6931, time 71.38ms, mfu 6.90%
iter 50: loss 2.5840, time 70.93ms, mfu 6.91%
iter 60: loss 2.5717, time 71.03ms, mfu 6.91%
iter 70: loss 2.5228, time 70.62ms, mfu 6.91%
iter 80: loss 2.5126, time 70.75ms, mfu 6.92%
iter 90: loss 2.4902, time 70.59ms, mfu 6.92%
iter 100: loss 2.4921, time 70.72ms, mfu 6.92%
iter 110: loss 2.4890, time 71.50ms, mfu 6.92%
iter 120: loss 2.4800, time 70.95ms, mfu 6.92%
iter 130: loss 2.4467, time 71.26ms, mfu 6.92%
iter 140: loss 2.4020, time 70.77ms, mfu 6.92%
iter 150: loss 2.3805, time 70.72ms, mfu 6.92%
iter 160: loss 2.3588, time 70.91ms, mfu 6.92%
iter 170: loss 2.3046, time 70.69ms, mfu 6.93%
iter 180: loss 2.2712, time 70.29ms, mfu 6.93%
iter 190



VBox(children=(Label(value='0.205 MB of 0.205 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
iter,▁▁▂▂▂▃▃▃▄▄▅▅▅▆▆▆▇▇▇██
lr,▁████▇▇▆▆▅▅▄▄▃▃▃▂▂▂▂▂
mfu,▁████████████████████
test/loss,█▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/loss,█▃▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁
val/loss,█▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
iter,5000.0
lr,0.0001
mfu,6.90287
test/loss,1.64222
train/loss,0.78946
val/loss,1.57376


DONE.


In [36]:
prompt = 'Romeo'

char_to_idx = meta_data['stoi']
idx_to_char = meta_data['itos']

# sample in eval mode so that dropout (dropout_rate=0.2) is not applied while generating
model.eval()
# encode the prompt as a (1, len(prompt)) int64 tensor on the configured device
prompt_idx = torch.tensor([[char_to_idx[c] for c in prompt]], dtype=torch.long, device=device)
with torch.no_grad(): # no gradients are needed for sampling
    sample_gen = model.generate(prompt_idx, max_new_tokens=1000, temperature=1.0, top_k=None)[0]
# decode the generated token ids back into characters
sample_gen = ''.join(idx_to_char[idx] for idx in sample_gen.tolist())
print(sample_gen)

Romeo ran and by salk a little,
So link'd as suggest a noble day,
Rescued near'd with a bloody exile:
Infection, indeed, but thy war, which stains
You yields the fair ruled itself black tongue,
Were it not possess together: but issues
That most unfelt out temperance, to see it
Of old well: there I urge betwixe,
If the removes of one hate, by the duke, and
Make the alms, a bow one protes of need
By other sense; s, and thee have the groaners in
Worse correction of Golgo substion to adder
The drunker: and in few comfort of our weaping.
I was the war become to kiss me with my charge
And let it great authority?

BAPTISTA:
Will I well keep you to see the world?

BENVOLIO:
Amen not, since pass; part of rest that appeal,
Should curses fain so fierce should by conjure
As believing, old tyranners
And gentle fiends out of misery, nephes,
As main to maintains every discourse.
Yea, are too in any croiser of
Juliet-blood, who leaves the rest spire o' the wolf
From thy reachery to Mantua, methinks; f