In [None]:
! ls

# Branchy GPT

In this notebook we train a custom BranchyGPT on the `shakespeare_char` dataset for experimental purposes. It may later be scaled up to OpenWebText.

First, please run:

    python data/shakespeare_char/prepare.py


In [None]:
import torch
import os
import numpy as np
import time
from contextlib import nullcontext

from model import GPTConfig, GPT
torch.manual_seed(1337)

In [None]:
# Setting up checkpoint saving directory
out_dir = "./BranchyGPT_save"
dataset = "shakespeare_char"
# Autocast precision used on GPU; kept as a torch.dtype (not a string)
dtype = torch.float16

# Get device between GPU or CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Mixed-precision context: a no-op on CPU, autocast to `dtype` on GPU
ctx = nullcontext() if device == 'cpu' else torch.amp.autocast(device_type=device, dtype=dtype)
# Loss scaling is only needed for fp16 on GPU. `dtype` is a torch.dtype, so it
# has to be compared with torch.float16: comparing it with the string 'float16'
# is always False and silently disables the scaler.
scaler = torch.cuda.amp.GradScaler(enabled=(device == 'cuda' and dtype == torch.float16))

# Batching configuration
batch_size = 128
gradient_accumulation_steps = 1 # values > 1 simulate a larger effective batch size

# Open both binary split files under data/<dataset>/ as read-only uint16 memory maps
data_dir = os.path.join('data', dataset)
train_data, val_data = (
    np.memmap(os.path.join(data_dir, bin_name), dtype=np.uint16, mode='r')
    for bin_name in ('train.bin', 'val.bin')
)


In [None]:
# Start from the GPTConfig defaults and override the size-related fields
gptconf = GPTConfig()
size_overrides = {"block_size": 256, "n_layer": 6, "n_head": 6, "n_embd": 384}
for field_name, field_value in size_overrides.items():
    setattr(gptconf, field_name, field_value)
# NOTE(review): vocab_size is left at the GPTConfig default here, while the
# later BranchyGPT cell passes vocab_size=65 explicitly; confirm the default
# is what is wanted for the character-level dataset.

# Build the model and move it to the selected device
model = GPT(gptconf)
model.to(device)
print(gptconf)

In [None]:
# adamw optimizer
learning_rate = 1e-3 # max learning rate
weight_decay = 1e-1
beta1 = 0.9
beta2 = 0.99
grad_clip = 1.0 # clip gradients at this value, or disable if == 0.0
# Explicit switch for torch.compile. The bare name `compile` resolves to the
# Python builtin function (always truthy) unless another cell rebinds it, so it
# cannot serve as a flag; a dedicated variable makes the choice visible here.
compile_model = True # set to False to skip compilation (requires PyTorch 2.0)

optimizer = model.configure_optimizers(weight_decay, learning_rate, (beta1, beta2), device)

if compile_model:
    print("compiling the model... (takes a ~minute)")
    # keep a handle on the uncompiled module
    unoptimized_model = model
    model = torch.compile(model) # requires PyTorch 2.0


In [None]:
def get_batch(split, block_size, batch_size, device):
    """Draw a random batch of input/target windows from one data split.

    Args:
        split: 'train' selects ``train_data``; any other value selects ``val_data``.
        block_size: number of consecutive entries in each window.
        batch_size: number of windows to draw.
        device: destination device; the pinned, non-blocking copy path is used
            only when this equals the string 'cuda'.

    Returns:
        A pair ``(x, y)`` of int64 tensors, each stacking ``batch_size`` windows
        of length ``block_size``; ``y`` is ``x`` shifted one position ahead.
    """
    source = train_data if split == 'train' else val_data
    starts = torch.randint(len(source) - block_size, (batch_size,))

    def _stack_windows(shift):
        # one window per sampled start, offset by `shift` positions
        windows = [
            torch.from_numpy((source[begin + shift:begin + shift + block_size]).astype(np.int64))
            for begin in starts
        ]
        return torch.stack(windows)

    inputs = _stack_windows(0)
    targets = _stack_windows(1)
    if device != 'cuda':
        return inputs.to(device), targets.to(device)
    # pinned host memory lets the host-to-GPU copy run asynchronously (non_blocking=True)
    inputs = inputs.pin_memory().to(device, non_blocking=True)
    targets = targets.pin_memory().to(device, non_blocking=True)
    return inputs, targets

# Prime X, Y with a training batch: the training loop runs its forward pass on
# X, Y before it draws the next batch.
X, Y = get_batch('train', gptconf.block_size, batch_size, device) # fetch the very first batch

eval_iters = 200 # number of random batches averaged per split during evaluation
@torch.no_grad()
def estimate_loss():
    """Estimate the mean loss on the 'train' and 'val' splits.

    Puts the model in eval mode, averages the loss over ``eval_iters`` random
    batches per split (forward passes run under ``ctx``), then puts the model
    back in train mode.

    Returns:
        dict mapping 'train' and 'val' to a 0-d tensor holding the mean loss.
    """
    model.eval()
    mean_losses = {}
    for split in ('train', 'val'):
        batch_losses = torch.zeros(eval_iters)
        for idx in range(eval_iters):
            xb, yb = get_batch(split, gptconf.block_size, batch_size, device)
            with ctx:
                _logits, batch_loss = model(xb, yb)
            batch_losses[idx] = batch_loss.item()
        mean_losses[split] = batch_losses.mean()
    model.train()
    return mean_losses

In [None]:
# Training loop
iter_num = 0
eval_interval = 100
# Start at +inf so the first evaluation registers as an improvement. With an
# initial value of 0 the test `losses['val'] < best_val_loss` can never pass
# unless the loss is negative, so no checkpoint would ever be written.
best_val_loss = float('inf')
max_iters = 2000
log_interval = 10
# torch.save does not create missing directories, so make sure out_dir exists
os.makedirs(out_dir, exist_ok=True)
t0 = time.time()
while True:

    # constant learning rate, re-applied to every param group each iteration
    lr = learning_rate
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

    # every eval_interval iterations: evaluate both splits, checkpoint on improvement
    if iter_num % eval_interval == 0:
        losses = estimate_loss()
        print(f"step {iter_num}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
        if losses['val'] < best_val_loss:
            best_val_loss = losses['val']
            # the untrained model at iteration 0 is never checkpointed
            if iter_num > 0:
                # torch.compile wraps the module and exposes the original one as
                # `_orig_mod`; saving the unwrapped module keeps the state_dict
                # keys free of the `_orig_mod.` prefix.
                raw_model = getattr(model, '_orig_mod', model)
                # NOTE(review): 'model_args' stores the GPTConfig object itself,
                # whereas the loading cell further down unpacks
                # checkpoint['model_args'] with ** (i.e. expects a mapping) -
                # confirm which format consumers of this checkpoint need.
                checkpoint = {
                    'model': raw_model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'model_args': gptconf,
                    'iter_num': iter_num,
                    'best_val_loss': best_val_loss,
                }
                print(f"saving checkpoint to {out_dir}")
                torch.save(checkpoint, os.path.join(out_dir, 'ckpt.pt'))
    for micro_step in range(gradient_accumulation_steps):
        with ctx:
            logits, loss = model(X, Y)
            loss = loss / gradient_accumulation_steps # scale the loss to account for gradient accumulation
        # immediately async prefetch next batch while model is doing the forward pass on the GPU
        X, Y = get_batch('train', gptconf.block_size, batch_size, device)
        # backward pass, with gradient scaling if training in fp16
        scaler.scale(loss).backward()
    # clip the gradient
    if grad_clip != 0.0:
        # gradients must be unscaled before their norm is clipped
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
    # step the optimizer and scaler if training in fp16
    scaler.step(optimizer)
    scaler.update()
    # flush the gradients as soon as we can, no need for this memory anymore
    optimizer.zero_grad(set_to_none=True)

    # timing and logging
    t1 = time.time()
    dt = t1 - t0
    t0 = t1
    if iter_num % log_interval == 0:
        # get loss as float. note: this is a CPU-GPU sync point
        # scale up to undo the division above, approximating the true total loss (exact would have been a sum)
        lossf = loss.item() * gradient_accumulation_steps
        print(f"iter {iter_num}: loss {lossf:.4f}, time {dt*1000:.2f}ms")

    iter_num += 1

    # stop once iteration max_iters has been executed
    if iter_num > max_iters:
        break


In [None]:
import pickle

start = "\n" # or "<|endoftext|>" or etc. Can also specify a file, use as: "FILE:prompt.txt"
num_samples = 10 # number of samples to draw
max_new_tokens = 500 # number of tokens generated in each sample
temperature = 0.8 # 1.0 = no change, < 1.0 = less random, > 1.0 = more random, in predictions
top_k = 200 # retain only the top_k most likely tokens, clamp others to have 0 probability
seed = 1337
# The unused `dtype = 'bfloat16'` and `compile = False` overrides were dropped:
# nothing in this cell reads them, `dtype` would replace the torch.dtype chosen
# in the setup cell with a string, and `compile` would shadow the Python builtin.
# Generation reuses the autocast context `ctx` created in the setup cell.

# `seed` used to be defined but never applied; seed the RNG so that the samples
# are reproducible from run to run
torch.manual_seed(seed)

meta_path = "./data/shakespeare_char/meta.pkl"
print(f"Loading meta from {meta_path}...")
# NOTE: pickle.load can execute arbitrary code - only load a meta.pkl from a
# trusted source (presumably the one written by data/shakespeare_char/prepare.py).
with open(meta_path, 'rb') as f:
    meta = pickle.load(f)
# TODO want to make this more general to arbitrary encoder/decoder schemes
stoi, itos = meta['stoi'], meta['itos']
encode = lambda s: [stoi[c] for c in s]
decode = lambda l: ''.join([itos[i] for i in l])

# encode the beginning of the prompt
if start.startswith('FILE:'):
    with open(start[5:], 'r', encoding='utf-8') as f:
        start = f.read()
start_ids = encode(start)
# add a leading batch dimension of size 1
x = (torch.tensor(start_ids, dtype=torch.long, device=device)[None, ...])

# run generation
# estimate_loss() leaves the model in train mode, so switch to eval mode first
model.eval()
with torch.no_grad():
    with ctx:
        for k in range(num_samples):
            y = model.generate(x, max_new_tokens, temperature=temperature, top_k=top_k)
            print(decode(y[0].tolist()))
            print('---------------')


In [None]:
import torch
# Sanity check: draw 10000 single samples from a uniform 4-way distribution
# with torch.multinomial and count how often each index comes up.
uniform_probs = torch.tensor([0.25, 0.25, 0.25, 0.25])
a = dict.fromkeys(range(4), 0)
for trial in range(10000):
    drawn = torch.multinomial(uniform_probs, 1).item()
    a[drawn] = a[drawn] + 1
print(a)

In [None]:
import torch
# Use the GPU when one is available and fall back to the CPU otherwise. A
# hard-coded 'cuda' fails on machines without a GPU and makes the non-CUDA
# branch of get_batch below unreachable.
device_type = 'cuda' if torch.cuda.is_available() else 'cpu'
# NOTE: this rebinds batch_size (128 in the training cells) and turns `device`
# into a torch.device object instead of a plain string.
batch_size = 64
device = torch.device(device_type)

train_data = np.memmap(os.path.join(data_dir, 'train.bin'), dtype=np.uint16, mode='r')
# calibration split: the first 1000 entries of the training data
calib_data = train_data[:1000]

def get_batch(split, block_size=128):
    """Draw a random batch of input/target windows from a named split.

    This definition replaces the earlier four-argument ``get_batch``; it reads
    ``batch_size``, ``device`` and ``device_type`` from module scope.

    Args:
        split: 'train', 'val' or 'calib'.
        block_size: number of consecutive entries per window. Defaults to 128,
            the value that was previously hard-coded.

    Returns:
        A pair ``(x, y)`` of int64 tensors on ``device``, each stacking
        ``batch_size`` windows of length ``block_size``; ``y`` is ``x`` shifted
        one position ahead.

    Raises:
        ValueError: if ``split`` is not one of the three known names.
    """
    if split == 'train':
        data = train_data
    elif split == 'val':
        data = val_data
    elif split == 'calib':
        data = calib_data
    else:
        raise ValueError(f"invalid split: {split}")
    # random window starts; the upper bound keeps the shifted target window in range
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([torch.from_numpy((data[i:i+block_size]).astype(np.int64)) for i in ix])
    y = torch.stack([torch.from_numpy((data[i+1:i+1+block_size]).astype(np.int64)) for i in ix])
    if device_type == 'cuda':
        # pin arrays x,y, which allows us to move them to GPU asynchronously (non_blocking=True)
        x, y = x.pin_memory().to(device, non_blocking=True), y.pin_memory().to(device, non_blocking=True)
    else:
        x, y = x.to(device), y.to(device)
    return x, y

In [None]:
from rejectOption import RejectOption
from model import BranchyGPT, GPTConfig
import numpy as np
import os
import torch

data_dir = "data/shakespeare_char"

# one batch drawn from the calibration split
X,Y = get_batch('calib')

# baby GPT model :)
n_layer = 6
n_head = 6
n_embd = 384
dropout = 0.2
block_size = 256 # context of up to 256 previous characters
bias = False # do we use bias inside LayerNorm and Linear layers?

model_args = dict(n_layer=n_layer, n_head=n_head, n_embd=n_embd, block_size=block_size,
                  bias=bias, vocab_size=65, dropout=dropout)
gptconf = GPTConfig(**model_args)
# Place the model on the same device as the batch instead of a hard-coded
# 'cuda:0', so that the forward pass below cannot hit a device mismatch.
model = BranchyGPT(gptconf).to(X.device)
model = torch.compile(model)
# the two printed devices should match
print(X.device)
print(next(model.parameters()).device)
# smoke test: shape of the first element returned by a forward pass
print(model(X)[0].shape)
reject_option = RejectOption(dataset=X, model=model)



# Reject Option sample test

In [None]:
import os
import torch
import pickle
from contextlib import nullcontext

from model import GPTConfig, BranchyGPT
from rejectOption import get_device, LLMRejectOption

out_dir = "out-shakespeare-char"
model_name = "mini-gpt"

device = get_device()

# Load model from checkpoint

ckpt_path = os.path.join(out_dir, 'ckpt_' + model_name + '.pt')
checkpoint = torch.load(ckpt_path , map_location=device)

# the checkpoint stores the constructor keyword arguments under 'model_args'
gptconf = GPTConfig(**checkpoint['model_args'])
model = BranchyGPT(gptconf).to(device)

model.load_state_dict(checkpoint['model'])
# this cell only runs inference, so switch to evaluation mode
model.eval()

# load encoder and decoder

meta_path = os.path.join('data', checkpoint['config']['dataset'], 'meta.pkl')
# Fail loudly when the vocabulary file is missing: otherwise encode/decode would
# be undefined here (or silently left over from an earlier cell).
if not os.path.exists(meta_path):
    raise FileNotFoundError(f"meta file not found: {meta_path}")
print(f"Loading meta from {meta_path}...")
# NOTE: pickle.load can execute arbitrary code - only load a trusted meta.pkl
with open(meta_path, 'rb') as f:
    meta = pickle.load(f)
# TODO want to make this more general to arbitrary encoder/decoder schemes
stoi, itos = meta['stoi'], meta['itos']
encode = lambda s: [stoi[c] for c in s]
decode = lambda l: ''.join([itos[i] for i in l])

# load reject option
reject_option_path = os.path.join(out_dir, 'reject_option_' + model_name + '.pt')
reject_repartition = torch.load(reject_option_path)
reject_option = LLMRejectOption()
# the stored object is transposed before being used as the calibration set
reject_option.calibration_set = reject_repartition.T


dtype = 'float16' # 'float32' or 'bfloat16' or 'float16'
ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
num_samples = 1
max_new_tokens = 50
temperature = 1
top_k = 40
# epsilon is not set here: the sweep below supplies every value that is used

seed = 42
torch.manual_seed(seed)

# torch.cuda.amp.autocast accepts no `device_type` argument (passing one raises
# TypeError); the generic torch.amp.autocast takes the device type explicitly.
if "cuda" in str(device):
    ctx = torch.amp.autocast(device_type="cuda", dtype=ptdtype)
else:
    ctx = nullcontext()



# actual inference
# make sure the model is in evaluation mode before generating
model.eval()
with torch.no_grad():
    with ctx:
        # sweep 9 evenly spaced epsilon values from 0.1 to 1.0
        for epsilon in torch.linspace(0.1, 1., 9):
            # encoded prompt with a leading batch dimension of size 1
            x = (torch.tensor(encode("The war"), dtype=torch.long, device=device)[None, ...])
            print(epsilon)
            for k in range(num_samples):
                y = model.generate(x, max_new_tokens, reject_option, temperature=temperature, top_k=top_k, epsilon=epsilon, decoder=decode)
                print(decode(y[0].tolist()))
                print('---------------')


In [None]:
import pandas as pd
import glob
# read from csv
# Summarise every results*.csv in the working directory, in sorted filename order
result_paths = glob.glob("results*.csv")
result_paths.sort()
for csv_path in result_paths:
    df = pd.read_csv(csv_path)
    # label of the run: the part of the file name between "results" and ".csv"
    run_label = csv_path.split(".csv")[0].split("results")[1]
    print(run_label)
    # 6 units are charged per row without rejection
    # NOTE(review): presumably the 6 layers of the model - confirm
    full_budget = len(df)*6
    # with rejection each row is charged `head` + 1 units
    used_budget = (df[['head']]+1).sum().to_numpy()
    print(f"Budget without reject = {full_budget}, with reject : {used_budget}, percentage : {used_budget/full_budget}")


In [None]:
import torch
from contextlib import nullcontext
dtype = 'float16' # 'float32' or 'bfloat16' or 'float16'
ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
num_samples = 1
max_new_tokens = 50
temperature = 1
top_k = 40
# epsilon is not set here: the sweep below supplies every value that is used

seed = 42
torch.manual_seed(seed)

# torch.cuda.amp.autocast accepts no `device_type` argument (passing one raises
# TypeError); the generic torch.amp.autocast takes the device type explicitly.
if "cuda" in str(device):
    ctx = torch.amp.autocast(device_type="cuda", dtype=ptdtype)
else:
    ctx = nullcontext()



# actual inference
# make sure the model is in evaluation mode before generating
model.eval()
with torch.no_grad():
    with ctx:
        # finer sweep: 9 evenly spaced epsilon values from 0.9 to 1.0
        for epsilon in torch.linspace(0.9, 1., 9):
            # encoded prompt with a leading batch dimension of size 1
            x = (torch.tensor(encode("The war"), dtype=torch.long, device=device)[None, ...])
            print(epsilon)
            for k in range(num_samples):
                y = model.generate(x, max_new_tokens, reject_option, temperature=temperature, top_k=top_k, epsilon=epsilon, decoder=decode)
                print(decode(y[0].tolist()))
                print('---------------')


# Evaluate Branchy GPT on dataset

In [None]:
# With perplexity
# Open the validation split of the selected dataset for the evaluation below.

dataset = "shakespeare_char"
data_dir = os.path.join("data", dataset)

# read-only memory map over data/<dataset>/val.bin, interpreted as uint16 values
val_data = np.memmap(os.path.join(data_dir, "val.bin"), dtype=np.uint16, mode="r")
