In [1]:
# Install the tokenizer dependency. %pip (rather than !pip) installs into the
# environment of the running kernel; restart the kernel if the import below fails.
%pip install tiktoken

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
from dataclasses import dataclass
import numpy as np
import math
import torch
import torch.nn as nn
from torch.nn import functional as F

import tiktoken
# Shared tokenizer (tiktoken's 'gpt2' encoding). ShakeMgr reads this module-level
# name to turn the raw corpus text into token ids.
enc = tiktoken.get_encoding('gpt2')

In [3]:
# Choose where tensors and the model will live: prefer CUDA, then Apple's MPS
# backend (only present in newer torch builds, hence the attribute check),
# and fall back to the CPU when neither accelerator is usable.
if torch.cuda.is_available():
    device = 'cuda'
elif getattr(torch.backends, 'mps', None) is not None and torch.backends.mps.is_available():
    device = 'mps'
else:
    device = 'cpu'
print(f'using device: {device}')

using device: cuda


In [4]:
from torch.utils.data import Dataset, DataLoader

# data manager for tinyshakespeare data
class ShakeMgr:
    """Load, tokenize and chunk the tinyshakespeare corpus.

    The raw text is taken from the module-level global ``tinyshakespeare_data``
    when it exists; otherwise it is downloaded once and stored in that global so
    later instances (e.g. after re-running a cell) do not hit the network again.

    Attributes:
        seq_len: number of tokens in each model input sequence.
        text:    the raw corpus as a single string.
        tokens:  token ids produced by the module-level ``enc`` tokenizer.
        buffer:  integer tensor of shape ``(n_rows, seq_len + 1)``; each row holds
                 one input sequence plus the extra token needed for the shifted target.
    """

    def __init__(self, seq_len: int = 32):
        """Build the token buffer.

        Args:
            seq_len: tokens per input sequence (each buffer row stores ``seq_len + 1``).

        Raises:
            ValueError: if ``seq_len`` is smaller than 1, or the corpus has fewer
                than ``seq_len + 1`` tokens (no complete row can be formed).
        """
        if seq_len < 1:
            raise ValueError(f"seq_len must be >= 1, got {seq_len}")

        # Look the cache up through globals() instead of a bare name: assigning a
        # bare `tinyshakespeare_data` inside this method would make it a local
        # variable, so the cache would never be written and reading it when the
        # download is skipped would raise UnboundLocalError.
        module_globals = globals()
        text = module_globals.get('tinyshakespeare_data')
        if text is None:
            import urllib.request
            url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"

            # read text in from url (bounded wait so a dead connection cannot hang the kernel)
            with urllib.request.urlopen(url, timeout=30) as response:
                text = response.read().decode('utf-8')
            module_globals['tinyshakespeare_data'] = text  # cache for later instances

        self.seq_len = seq_len

        self.text = text
        self.tokens = enc.encode(self.text)

        # stack token sequences into data buffer for dataloader;
        # trailing tokens that do not fill a complete row are dropped
        row_len = seq_len + 1
        n_rows = len(self.tokens) // row_len
        if n_rows == 0:
            raise ValueError(
                f"corpus has {len(self.tokens)} tokens, fewer than the {row_len} "
                f"needed for one sequence of seq_len={seq_len}"
            )
        self.buffer = torch.tensor(
            self.tokens[:n_rows * row_len], dtype=torch.long
        ).reshape(n_rows, row_len)

    def train_test_split(self, train_pct: float = 0.8):
        """Randomly partition the buffer rows into train and test datasets.

        Rows are shuffled with NumPy's global RNG (seed it beforehand for a
        reproducible split). The first ``int(n_rows * train_pct)`` shuffled rows
        go to the train set and the rest to the test set.

        Args:
            train_pct: fraction of rows assigned to the train set, in ``[0, 1]``.

        Returns:
            ``(train, test)`` as two ``ShakeDS`` instances.

        Raises:
            ValueError: if ``train_pct`` is outside ``[0, 1]``.
        """
        if not 0.0 <= train_pct <= 1.0:
            raise ValueError(f"train_pct must be within [0, 1], got {train_pct}")

        n_rows = len(self.buffer)
        n_train = int(n_rows * train_pct)  # computed once so both slices share the cut point
        order = np.random.permutation(n_rows)

        train_data = self.buffer[order[:n_train]]
        test_data = self.buffer[order[n_train:]]

        return ShakeDS(train_data), ShakeDS(test_data)  # return train-test split

# dataset wrapper around the tinyshakespeare token buffer, usable with a pytorch DataLoader
class ShakeDS(Dataset):
    """Serve (input, target) pairs from rows of a token buffer.

    Each row of ``data`` holds one more token than the model consumes, so the
    target is simply the same row shifted one position to the left.
    """

    def __init__(self, data):
        """Keep a reference to ``data``, an indexable 2-D collection of token rows."""
        super().__init__()
        self.data = data

    def __len__(self):
        """Number of rows (sequences) available."""
        return len(self.data)

    def __getitem__(self, idx: int):
        """Return ``(x, y)`` for row ``idx``: all but the last token, and all but the first."""
        row = self.data[idx]
        inputs = row[..., :-1]   # everything except the final token
        targets = row[..., 1:]   # same row advanced by one token
        return inputs, targets

In [1]:
# Load the project-local model code. The explicit reload re-imports the module
# so that edits made to it since the first import take effect without
# restarting the kernel.
# NOTE(review): gpt_homebrew must be importable from the kernel's working
# directory / sys.path -- the saved output of this cell is a ModuleNotFoundError.
import gpt_homebrew as gh
from importlib import reload
reload(gh)

# Alternative kept for reference: presumably starts from pretrained GPT-2
# weights (with LoRA settings) instead of a freshly constructed model -- confirm in gpt_homebrew.
# gpt = gh.GPT.from_pretrained('gpt2', lora_rank=0, lora_alpha=16)
# Build the model from the default GPTConfig and move it to the device selected earlier.
gpt = gh.GPT(gh.GPTConfig())
gpt.to(device)

ModuleNotFoundError: No module named 'gpt_homebrew'

In [6]:
# Set up new training data using long sequences.
# NOTE: seq_len is the number of tokens per sequence (context length), not the
# batch size -- the batch size is chosen when the DataLoader is built.
# 1024 is presumably the model's maximum context length; confirm against gh.GPTConfig.
data = ShakeMgr(seq_len=1024)

# train_test_split shuffles rows with NumPy's global RNG; seed it so the same
# rows land in train/test on every Restart & Run All.
np.random.seed(1337)
train, test = data.train_test_split(train_pct=0.8)

In [None]:
dl = DataLoader(train, batch_size=12, shuffle=True)

# try adjusting precision of pytorch float32 operations
# setting to 'high' lets float32 matmuls use TensorFloat-32 (TF32), which subtly lowers the
# internal precision of the operation while tensors keep their float32 dtype
# increases TFLOPS on GPUs with tensor cores (like A100) by ~8X -- may improve other GPUs too
# TF32 is available on "Ampere"-series GPUs (of which A100 is one) and newer
torch.set_float32_matmul_precision('high')  # 'highest' is default

optimizer = torch.optim.AdamW(gpt.parameters(), lr=3e-4)

import time

# Keep ONE iterator alive across steps. Calling next(iter(dl)) every step would
# rebuild the iterator (and reshuffle the whole dataset) just to take its first batch.
batches = iter(dl)
for i in range(10):
    # perf_counter is monotonic and high-resolution, unlike wall-clock time.time()
    t0 = time.perf_counter()

    try:
        xb, yb = next(batches)
    except StopIteration:
        # epoch exhausted: start a new (freshly shuffled) pass over the data
        batches = iter(dl)
        xb, yb = next(batches)

    # send to device during training / inference instead of during dataset creation
    # this is much more efficient for memory usage and dataloader speed
    xb, yb = xb.to(device), yb.to(device)

    logits, loss = gpt(xb, yb)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # this waits for the GPU to finish operations - otherwise CPU REPL stuff may finish before GPU
    # causing us to print time EARLY!
    # only valid on CUDA: the device cell may also have selected 'cpu' or 'mps'
    if device == 'cuda':
        torch.cuda.synchronize()
    t1 = time.perf_counter()
    dt = (t1 - t0) * 1000  # time in milliseconds
    # count the tokens actually processed (the last batch of an epoch can be smaller than dl.batch_size)
    tokens_per_sec = (xb.shape[0] * xb.shape[1]) / (t1 - t0)
    print(f"step {i}, loss: {loss.item():,.6f}, dt: {dt:,.2f}ms, tok/sec: {tokens_per_sec:,.2f}")

step 0, loss: 6.181458950042725, dt: 294.39ms, tok/sec: 41,741.06
step 1, loss: 6.43180513381958, dt: 287.68ms, tok/sec: 42,714.21
step 2, loss: 6.429523468017578, dt: 287.77ms, tok/sec: 42,700.16
step 3, loss: 6.273110866546631, dt: 287.57ms, tok/sec: 42,729.79
step 4, loss: 6.255056858062744, dt: 287.65ms, tok/sec: 42,719.03
step 5, loss: 6.231906414031982, dt: 287.64ms, tok/sec: 42,720.41
step 6, loss: 6.144219875335693, dt: 287.59ms, tok/sec: 42,727.21
step 7, loss: 6.330781936645508, dt: 287.70ms, tok/sec: 42,711.52
step 8, loss: 6.293788433074951, dt: 287.59ms, tok/sec: 42,727.17
step 9, loss: 6.168107986450195, dt: 287.69ms, tok/sec: 42,713.29


Switching matmul operations to TF32 increased throughput from roughly 15.7k to roughly 42.7k tokens per second (about 2.7x).\
This is less than the ~8x improvement suggested by the TFLOPS figures because training is also limited by memory, not only by raw compute.

We can do better by improving memory efficiency, i.e. using a smaller floating-point format for the parameter data itself, not just for the operation (a bit arcane...). The trade-off is reduced precision and, depending on the format, a reduced range of representable values. BF16 is a good middle ground: it keeps the same range as FP32 (the format behind the default `float32` dtype) but lowers the precision, so each value costs fewer bits in memory.

[This tutorial](https://docs.pytorch.org/tutorials/recipes/recipes/amp_recipe.html) explains the PyTorch implementation for adjusting float formats, particularly using `torch.autocast`. This can be used to adjust the formats automatically during the forward pass.\
Autocast only converts some datatypes. The implementation used here won't convert the embedding weights. It sounds like the `nn.Linear` layers will be converted, which is good since they tend to be the most expensive part here.

**Again**: this is only possible on Ampere or newer GPUs; older GPUs don't support BF16 (I guess).

In [11]:
# add BF16 autocast
# (reuses dl, optimizer, gpt and the time module from the previous training cell)

# Keep ONE iterator alive across steps. Calling next(iter(dl)) every step would
# rebuild the iterator (and reshuffle the whole dataset) just to take its first batch.
batches = iter(dl)
for i in range(10):
    # perf_counter is monotonic and high-resolution, unlike wall-clock time.time()
    t0 = time.perf_counter()

    try:
        xb, yb = next(batches)
    except StopIteration:
        # epoch exhausted: start a new (freshly shuffled) pass over the data
        batches = iter(dl)
        xb, yb = next(batches)

    # send to device during training / inference instead of during dataset creation
    # this is much more efficient for memory usage and dataloader speed
    xb, yb = xb.to(device), yb.to(device)

    optimizer.zero_grad()
    # only the forward pass (which also produces the loss) runs under autocast;
    # backward() and the optimizer step stay outside the context
    with torch.autocast(device_type=device, dtype=torch.bfloat16):
        logits, loss = gpt(xb, yb)

    loss.backward()
    optimizer.step()

    # this waits for the GPU to finish operations - otherwise CPU REPL stuff may finish before GPU
    # causing us to print time EARLY!
    # only valid on CUDA: the device cell may also have selected 'cpu' or 'mps'
    if device == 'cuda':
        torch.cuda.synchronize()
    t1 = time.perf_counter()
    dt = (t1 - t0) * 1000  # time in milliseconds
    # count the tokens actually processed (the last batch of an epoch can be smaller than dl.batch_size)
    tokens_per_sec = (xb.shape[0] * xb.shape[1]) / (t1 - t0)
    print(f"step {i}, loss: {loss.item():,.6f}, dt: {dt:,.2f}ms, tok/sec: {tokens_per_sec:,.2f}")

step 0, loss: 6.168858, dt: 414.19ms, tok/sec: 29,667.73
step 1, loss: 6.237749, dt: 257.91ms, tok/sec: 47,645.27
step 2, loss: 6.250380, dt: 257.77ms, tok/sec: 47,671.32
step 3, loss: 6.187717, dt: 257.81ms, tok/sec: 47,663.38
step 4, loss: 6.121225, dt: 257.66ms, tok/sec: 47,691.17
step 5, loss: 6.128353, dt: 258.01ms, tok/sec: 47,626.83
step 6, loss: 6.067403, dt: 257.78ms, tok/sec: 47,669.29
step 7, loss: 6.074363, dt: 258.20ms, tok/sec: 47,591.73
step 8, loss: 6.043103, dt: 257.84ms, tok/sec: 47,657.96
step 9, loss: 6.045007, dt: 257.63ms, tok/sec: 47,696.60
