In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

 EX3: Find a dataset that is very large, so large that you can't see a gap between train and val loss. Pretrain the transformer on this data, then initialize with that model and finetune it on tiny shakespeare with a smaller number of steps and lower learning rate. Can you obtain a lower validation loss by the use of pretraining?

In [2]:
""" Global Variables """
block_size = 32
batch_size = 16
embed_dim = 64
num_heads = 4
num_blocks = 4
lr = 1e-3
epochs = 5000

Let's get a baseline with just the tiny shakespeare data

In [3]:
""" Create Dataset """
with open('video_7_dependencies/input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

chars = sorted(list(set(text)))
vocab_size = len(chars)
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s] 
decode = lambda l: ''.join([itos[i] for i in l]) 

data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9*len(data))
train_data = data[:n]
val_data = data[n:]

def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y

In [4]:
device = "mps" if torch.backends.mps.is_available() else "cpu"
print(device)

mps


In [19]:
""" Create Model """

class MaskedMultiHeadAttention(nn.Module):
    def __init__(self, num_heads, embed_dim, block_size):
        super().__init__()
        self.num_heads = num_heads
        self.proj_dim = int(embed_dim/num_heads)
        self.w_q = nn.Linear(embed_dim, num_heads*self.proj_dim, bias = False)
        self.w_k = nn.Linear(embed_dim, num_heads*self.proj_dim, bias = False)
        self.w_v = nn.Linear(embed_dim, num_heads*self.proj_dim, bias = False)
        self.register_buffer("tril", torch.tril(torch.ones(block_size, block_size).to(device)))
        self.dp = nn.Dropout(0.0)
        self.out_linear = nn.Linear(embed_dim, embed_dim)
        
    def forward(self, input):
        B, T, C = input.shape #C == embed_dim
        query = self.w_q(input).view(B, T, self.num_heads, self.proj_dim) #B, T, H, proj_dim
        key = self.w_k(input).view(B, T, self.num_heads, self.proj_dim) #B, T, H proj_dim
        wei = (query.permute(0,2,1,3) @ key.permute(0,2,3,1))*(C**-0.5) #B, H, T,T
        wei = wei.masked_fill(self.tril[:T,:T] == 0, float("-inf")) #Note: subset the mask self.tril[:T,:T] in case input sequence is less then block_size; the mask needs to be broadcastable with wei
        wei = F.softmax(wei, dim = -1)

        value = self.w_v(input).view(B, T, self.num_heads, self.proj_dim) #B, T, H, proj_dim
        
        out = wei @ value.permute(0,2,1,3) #B, H, T, proj_dim
        out = out.permute(0,2,1,3).contiguous().view(B,T,C) #B, T, C
        out = self.out_linear(out)  #B, T, C
        return self.dp(out)
    
class FeedForward(nn.Module):
    def __init__(self, embed_dim):
        super().__init__()
        self.ffn = nn.Sequential(nn.Linear(embed_dim, embed_dim*4), nn.ReLU(), nn.Linear(embed_dim*4, embed_dim))
        self.dp = nn.Dropout(0.0)

    def forward(self, input):
        out = self.ffn(input)
        return self.dp(out)
    

class Block(nn.Module):
    def __init__(self, num_heads, embed_dim, block_size):
        super().__init__()
        self.attention = MaskedMultiHeadAttention(num_heads, embed_dim, block_size)
        self.ffn = FeedForward(embed_dim)
        self.ln1 = nn.LayerNorm(embed_dim)
        self.ln2 = nn.LayerNorm(embed_dim)

    def forward(self, input):
        x = self.attention(self.ln1(input)) + input
        out = self.ffn(self.ln2(x)) + x
        return out
    

class Transformer(nn.Module):
    def __init__(self, vocab_size, embed_dim, block_size, num_heads, num_blocks):
        super().__init__()
        self.content_embedding = nn.Embedding(vocab_size, embed_dim)
        self.position_embedding = nn.Embedding(block_size, embed_dim)
        self.blocks = nn.ModuleList([Block(num_heads, embed_dim, block_size) for _ in range(num_blocks)])
        self.output = nn.Linear(embed_dim, vocab_size)
        self.dp = nn.Dropout(0.0)

    def forward(self, input):
        B, T = input.shape
        con_embed = self.content_embedding(input) #B,T,embed_dim
        pos_embed = self.position_embedding(torch.arange(T, device = device)) #1,T,embed_dim
        x = con_embed + pos_embed #B,T,embed_dim
        x = self.dp(x)

        for block in self.blocks:
            x = block(x) #B,T,embed_dim
        out = self.output(x)
        return out

Note: When transferring models to device, any instances in which tensors were hard coded (i.e torch.ones or torch.arange), need to be explicitly moved to device

In [20]:
model = Transformer(vocab_size, embed_dim, block_size, num_heads, num_blocks)

In [21]:
model.to(device)

Transformer(
  (content_embedding): Embedding(65, 64)
  (position_embedding): Embedding(32, 64)
  (blocks): ModuleList(
    (0-3): 4 x Block(
      (attention): MaskedMultiHeadAttention(
        (w_q): Linear(in_features=64, out_features=64, bias=False)
        (w_k): Linear(in_features=64, out_features=64, bias=False)
        (w_v): Linear(in_features=64, out_features=64, bias=False)
        (dp): Dropout(p=0.0, inplace=False)
        (out_linear): Linear(in_features=64, out_features=64, bias=True)
      )
      (ffn): FeedForward(
        (ffn): Sequential(
          (0): Linear(in_features=64, out_features=256, bias=True)
          (1): ReLU()
          (2): Linear(in_features=256, out_features=64, bias=True)
        )
        (dp): Dropout(p=0.0, inplace=False)
      )
      (ln1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (ln2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
    )
  )
  (output): Linear(in_features=64, out_features=65, bias=True)
  (dp): Dro

In [22]:
@torch.no_grad()
def estimate_loss(eval_iters):
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            X,Y = X.to(device), Y.to(device)
            logits  = model(X)
            loss = F.cross_entropy(logits.permute(0, 2, 1), Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [23]:
estimate_loss(100)

{'train': tensor(4.6563), 'val': tensor(4.6507)}

In [24]:
optimizer = torch.optim.AdamW(model.parameters(), lr = lr)

In [25]:
for i in range(epochs):
    x,y = get_batch('train')
    x,y = x.to(device), y.to(device)
    logits = model(x)
    loss = F.cross_entropy(logits.permute(0, 2, 1), y)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if (i % 100) == 0:
        estimated_loss = estimate_loss(200)
        print(f"Iteration {i}, Train Loss {estimated_loss['train']}, Val Loss {estimated_loss['val']}")

Iteration 0, Train Loss 4.3266777992248535, Val Loss 4.323242664337158
Iteration 100, Train Loss 2.6200339794158936, Val Loss 2.6297202110290527
Iteration 200, Train Loss 2.503884792327881, Val Loss 2.512178659439087
Iteration 300, Train Loss 2.426879644393921, Val Loss 2.432345390319824
Iteration 400, Train Loss 2.3615517616271973, Val Loss 2.3792293071746826
Iteration 500, Train Loss 2.304750919342041, Val Loss 2.301269292831421
Iteration 600, Train Loss 2.266355276107788, Val Loss 2.2765989303588867
Iteration 700, Train Loss 2.215177536010742, Val Loss 2.2469358444213867
Iteration 800, Train Loss 2.1655590534210205, Val Loss 2.2056145668029785
Iteration 900, Train Loss 2.1372628211975098, Val Loss 2.173919200897217
Iteration 1000, Train Loss 2.1057562828063965, Val Loss 2.143547296524048
Iteration 1100, Train Loss 2.0912489891052246, Val Loss 2.1403238773345947
Iteration 1200, Train Loss 2.054708242416382, Val Loss 2.092014789581299
Iteration 1300, Train Loss 2.0232534408569336, Val

Now Let's use the "AG News Classification Dataset" for pretraining the model

In [39]:
"Create Pretrain Data, also since Vocab is differed, need to redefine the OG data"
import pandas as pd
news = pd.read_csv("video_7_dependencies/news.csv")
pretrain_text = " ".join([description for description in news.Description.values])
vocab = sorted(list(set(list(set(pretrain_text)) + list(set(text)))))
stoi = {token: idx for idx, token in enumerate(vocab)}
itos = {idx: token for token, idx in stoi.items()}
encode = lambda input_text: [stoi[i] for i in input_text]
pretrain_data = torch.tensor(encode(pretrain_text)).to(torch.long)

def get_pretrain_batch(split):
    if split == "train":
        idxs = torch.randint(0, int(len(pretrain_data)*0.9) - block_size, (batch_size,))
    else:
        idxs = torch.randint(int(len(pretrain_data)*0.9), len(pretrain_data) - block_size, (batch_size,))
    X = []
    Y = []
    for idx in idxs:
        X.append(pretrain_data[idx:idx+block_size].tolist())
        Y.append(pretrain_data[idx+1:idx+block_size+1].tolist())
    X = torch.tensor(X).to(torch.long)
    Y = torch.tensor(Y)
    return X, Y

data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9*len(data))
train_data = data[:n]
val_data = data[n:]

def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y

In [40]:
model = Transformer(len(vocab), embed_dim, block_size, num_heads, num_blocks)
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr = lr)

In [41]:
@torch.no_grad()
def estimate_pretrain_loss(eval_iters):
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_pretrain_batch(split)
            X,Y = X.to(device), Y.to(device)
            logits  = model(X)
            loss = F.cross_entropy(logits.permute(0, 2, 1), Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out


Pretrain for 5000 epochs

In [42]:
for i in range(epochs):
    x,y = get_pretrain_batch('train')
    x,y = x.to(device), y.to(device)
    logits = model(x)
    loss = F.cross_entropy(logits.permute(0, 2, 1), y)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if (i % 100) == 0:
        estimated_losses = estimate_pretrain_loss(200)
        print(f"Iteration {i}, Train Loss {estimated_losses['train']}, Val Loss {estimated_losses['val']}")

Iteration 0, Train Loss 4.5866522789001465, Val Loss 4.584536552429199
Iteration 100, Train Loss 2.776937961578369, Val Loss 2.797126531600952
Iteration 200, Train Loss 2.679657220840454, Val Loss 2.677330732345581
Iteration 300, Train Loss 2.611804246902466, Val Loss 2.618142604827881
Iteration 400, Train Loss 2.5495996475219727, Val Loss 2.5565340518951416
Iteration 500, Train Loss 2.4924263954162598, Val Loss 2.4919979572296143
Iteration 600, Train Loss 2.4207353591918945, Val Loss 2.435755729675293
Iteration 700, Train Loss 2.3772237300872803, Val Loss 2.3657279014587402
Iteration 800, Train Loss 2.3333449363708496, Val Loss 2.3267691135406494
Iteration 900, Train Loss 2.288182258605957, Val Loss 2.2962470054626465
Iteration 1000, Train Loss 2.259556531906128, Val Loss 2.2608225345611572
Iteration 1100, Train Loss 2.222048282623291, Val Loss 2.230478286743164
Iteration 1200, Train Loss 2.204315185546875, Val Loss 2.194192409515381
Iteration 1300, Train Loss 2.1645803451538086, Val 

Now let's train it for 1000 epochs on the Shakespeare dataset

In [43]:
optimizer = torch.optim.AdamW(model.parameters(), lr = lr/10)

In [44]:
@torch.no_grad()
def estimate_loss(eval_iters):
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            X,Y = X.to(device), Y.to(device)
            logits  = model(X)
            loss = F.cross_entropy(logits.permute(0, 2, 1), Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [45]:
for i in range(1000):
    x,y = get_batch('train')
    x,y = x.to(device), y.to(device)
    logits = model(x)
    loss = F.cross_entropy(logits.permute(0, 2, 1), y)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if (i % 100) == 0:
        estimated_loss = estimate_loss(200)
        print(f"Iteration {i}, Train Loss {estimated_loss['train']}, Val Loss {estimated_loss['val']}")

Iteration 0, Train Loss 2.999953508377075, Val Loss 3.138110876083374
Iteration 100, Train Loss 2.227320671081543, Val Loss 2.2652769088745117
Iteration 200, Train Loss 2.0891001224517822, Val Loss 2.124817371368408
Iteration 300, Train Loss 2.0378313064575195, Val Loss 2.060847759246826
Iteration 400, Train Loss 1.9866249561309814, Val Loss 2.033018112182617
Iteration 500, Train Loss 1.9645189046859741, Val Loss 2.0126116275787354
Iteration 600, Train Loss 1.939063549041748, Val Loss 1.9961519241333008
Iteration 700, Train Loss 1.9237667322158813, Val Loss 1.9802175760269165
Iteration 800, Train Loss 1.9072626829147339, Val Loss 1.9655234813690186
Iteration 900, Train Loss 1.8930515050888062, Val Loss 1.9440083503723145


After pretraining, takes much less iterations to converge to "low loss", took around twice as many without pretraining. The final loss was not lower, but this could be simply due not allowing for enough iterations either during pretraining or finetuning

EX4: Read some transformer papers and implement one additional feature or change that people seem to use. Does it improve the performance of your GPT?

In a recent family lightweight LLM's built by Google, called Gemma, https://arxiv.org/abs/2403.08295, the authors used multiple alterations, but three important ones was the use of
- Multi-Query Attention (Use Same Keys and Values across Heads)
- GeGLU Activations (Use GELU non-linearity + linear layer)
- RMSNorm (Unlike LayerNorm, which subtracts mean and divides by STD for each element in a output vector Y, RMSNorm divides each element root mean squared of vector Y; reduces computational overhead of LayerNorm)

In [46]:
""" Create Model """

class GeGLU_FeedForward(nn.Module):
    """FFN with GeGLU"""
    def __init__(self, embed_dim):
        super().__init__()
        self.gelu_ffn = nn.Sequential(nn.Linear(embed_dim, int(8/3)*embed_dim, bias = False), nn.GELU()) #OG paper has inner-layer projection of embed_dim*4, but the GeGELU paper reduces the projection by 2/3rd
        self.glu_ffn = nn.Linear(embed_dim, int(8/3)*embed_dim, bias = False)
        self.out_ffn = nn.Linear(int(8/3)*embed_dim, embed_dim,  bias = False)
        self.dp = nn.Dropout(0.0)

    def forward(self, input):
        x1 = self.gelu_ffn(input)
        x2 = self.glu_ffn(input)
        x = x1 * x2
        out = self.out_ffn(x)
        return self.dp(out)
    

class MaskedQueryHeadAttention(nn.Module):
    def __init__(self, num_heads, embed_dim, block_size):
        super().__init__()
        self.num_heads = num_heads
        self.proj_dim = int(embed_dim/num_heads)
        self.w_q = nn.Linear(embed_dim, num_heads*self.proj_dim, bias = False)
        self.w_k = nn.Linear(embed_dim, self.proj_dim, bias = False)
        self.w_v = nn.Linear(embed_dim, self.proj_dim, bias = False)
        self.register_buffer("tril", torch.tril(torch.ones(block_size, block_size).to(device)))
        self.dp = nn.Dropout(0.0)
        self.out_linear = nn.Linear(embed_dim, embed_dim)
        
    def forward(self, input):
        B, T, C = input.shape #C == embed_dim
        query = self.w_q(input).view(B, T, self.num_heads, self.proj_dim) #B, T, H, proj_dim
        key = self.w_k(input).view(B, T, 1, self.proj_dim) #B, H, 1, proj_dim
        wei = (query.permute(0,2,1,3) @ key.permute(0,2,3,1))*(C**-0.5) #B, H, T,T
        wei = wei.masked_fill(self.tril[:T,:T] == 0, float("-inf")) 
        wei = F.softmax(wei, dim = -1)

        value = self.w_v(input).view(B, T, 1, self.proj_dim) #B, H, 1, proj_dim
        
        out = wei @ value.permute(0,2,1,3) #B, H, T, proj_dim
        out = out.permute(0,2,1,3).contiguous().view(B,T,C) #B, T, C
        out = self.out_linear(out)  #B, T, C
        return self.dp(out)
    

class RMS_Norm(nn.Module):
    def __init__(self, embed_dim):
        super().__init__()
        self.gain = torch.nn.Parameter(torch.ones(embed_dim))

    def forward(self, input):
        rms = (torch.mean(input**2, dim = -1, keepdim = True))**(-1/2) #B,T,1
        out = input * rms #B,T,C
        out = out * self.gain #B,T,C * C
        return out
    
class Gemma_Block(nn.Module):
    def __init__(self, num_heads, embed_dim, block_size):
        super().__init__()
        self.attention = MaskedQueryHeadAttention(num_heads, embed_dim, block_size)
        self.ffn = GeGLU_FeedForward(embed_dim)
        self.rn1 = RMS_Norm(embed_dim)
        self.rn2 = RMS_Norm(embed_dim)

    def forward(self, input):
        x = self.attention(self.rn1(input)) + input
        out = self.ffn(self.rn2(x)) + x
        return out
    
class Modified_Transformer(nn.Module):
    def __init__(self, vocab_size, embed_dim, block_size, num_heads, num_blocks):
        super().__init__()
        self.content_embedding = nn.Embedding(vocab_size, embed_dim)
        self.position_embedding = nn.Embedding(block_size, embed_dim)
        self.blocks = nn.ModuleList([Gemma_Block(num_heads, embed_dim, block_size) for _ in range(num_blocks)])
        self.output = nn.Linear(embed_dim, vocab_size)
        self.dp = nn.Dropout(0.0)

    def forward(self, input):
        B, T = input.shape
        con_embed = self.content_embedding(input) #B,T,embed_dim
        pos_embed = self.position_embedding(torch.arange(T, device = device)) #1,T,embed_dim
        x = con_embed + pos_embed #B,T,embed_dim
        x = self.dp(x)

        for block in self.blocks:
            x = block(x) #B,T,embed_dim
        out = self.output(x)
        return out

In [47]:
model = Modified_Transformer(len(vocab), embed_dim, block_size, num_heads, num_blocks)
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr = lr)

In [48]:
@torch.no_grad()
def estimate_loss(eval_iters):
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            X,Y = X.to(device), Y.to(device)
            logits  = model(X)
            loss = F.cross_entropy(logits.permute(0, 2, 1), Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [49]:
for i in range(epochs):
    x,y = get_batch('train')
    x,y = x.to(device), y.to(device)
    logits = model(x)
    loss = F.cross_entropy(logits.permute(0, 2, 1), y)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if (i % 100) == 0:
        estimated_loss = estimate_loss(200)
        print(f"Iteration {i}, Train Loss {estimated_loss['train']}, Val Loss {estimated_loss['val']}")

Iteration 0, Train Loss 4.6836838722229, Val Loss 4.679835319519043
Iteration 100, Train Loss 2.628729820251465, Val Loss 2.656503915786743
Iteration 200, Train Loss 2.5119457244873047, Val Loss 2.5252857208251953
Iteration 300, Train Loss 2.4421651363372803, Val Loss 2.4577879905700684
Iteration 400, Train Loss 2.3663156032562256, Val Loss 2.3756864070892334
Iteration 500, Train Loss 2.2952089309692383, Val Loss 2.320103645324707
Iteration 600, Train Loss 2.2566213607788086, Val Loss 2.2725915908813477
Iteration 700, Train Loss 2.211203098297119, Val Loss 2.2292425632476807
Iteration 800, Train Loss 2.168210506439209, Val Loss 2.209113359451294
Iteration 900, Train Loss 2.1260242462158203, Val Loss 2.155752182006836
Iteration 1000, Train Loss 2.0863001346588135, Val Loss 2.1410882472991943
Iteration 1100, Train Loss 2.0450620651245117, Val Loss 2.105604887008667
Iteration 1200, Train Loss 2.0286126136779785, Val Loss 2.09820556640625
Iteration 1300, Train Loss 1.9987938404083252, Val 

We get a much improved 1.82 Val Loss in Iteration 4800, which is much lower then any Loss generated in the prior applications of the Transformer