In [28]:
import requests
from datasets import load_dataset
from typing import List, Optional
import torch
from torch import nn
from torch.nn import functional as F
from tqdm.notebook import tqdm

In [None]:
wiki = load_dataset("wikimedia/wikipedia", "20231101.en")

In [None]:
wiki['train']['text'][100:122]

In [29]:
with open('tinyshakespeare.txt',
          'r') as file:
    data = file.read()

In [30]:
chars = set()
for i in range(0, len(data), 100):
    chars |= set(data[i:i+100])  

chars = sorted(list(chars))

In [31]:
len(chars), ''.join(chars)

(65, "\n !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz")

In [32]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

### "Tokenizer"

In [35]:
class Tokenizer:
    def __init__(self):
        with open('tinyshakespeare.txt', 'r') as file:
            data = file.read()
            
        chars = set()
        for i in range(0, len(data), 100):
            chars |= set(data[i:i+100])  

        self.chars = sorted(list(chars))
        self.itos = {i:c for i,c in enumerate(chars)}
        self.stoi = {c:i for i,c in enumerate(chars)}
        
    def tokenize(self, text: str):
        return [self.stoi[char] for char in text]
    
    def decode(self, tokens: List[int]):
        return ''.join([self.itos[token] for token in tokens])

In [36]:
tokenizer = Tokenizer()

In [37]:
tokens = tokenizer.tokenize('hello')
tokens

[26, 25, 38, 38, 20]

In [38]:
tokenizer.decode(tokens)

'hello'

In [39]:
data = torch.tensor(tokenizer.tokenize(data))

In [40]:
train_size = int(0.9*len(data))
train = data[:train_size]
test = data[train_size:]

len(train), len(test)

(1003854, 111540)

In [41]:
def get_batches(data, batch_size=16, block_size = 8, device=None):
    if device is None:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        
    rand_starts = torch.randint(len(data)-block_size, (batch_size,))
    x = torch.stack([data[start:start+block_size] for start in rand_starts])
    y = torch.stack([data[start+1:start+block_size+1] for start in rand_starts])    
    
    return x.to(device), y.to(device)

In [12]:
B, T, C = 8, 16, 65
tokens = torch.randn((B, T, C))
tril = torch.tril(torch.ones((T,T)))

In [13]:
tokens[0]

tensor([[-1.0655, -0.4815,  0.8515,  ...,  1.5455,  1.3472, -0.2571],
        [ 0.5135,  0.7321, -0.9683,  ..., -0.1422,  0.6172, -0.7871],
        [ 0.9014,  0.4552,  0.5468,  ...,  0.8927,  1.7480, -0.3781],
        ...,
        [-0.9398,  0.1345, -0.8961,  ...,  3.2920,  0.1951, -0.1791],
        [ 0.1348, -0.6786,  0.4097,  ...,  0.2523, -0.2292, -1.5613],
        [ 0.4031, -1.2502, -0.7428,  ...,  2.1601, -1.0916,  0.0732]])

In [None]:
zeroes = torch.zeros((T,T))
zeroes = zeroes.masked_fill(tril==0, -torch.inf)
F.softmax(zeroes, dim=-1)

In [42]:
embedding_dim = 128
num_heads = 8
max_iters = 2000
eval_interval = 300
eval_iters = 50
lr = 1e-3
dropout = 0.1
vocab_size = len(tokenizer.chars)

batch_size = 16
context_length = 100

device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [43]:
class SelfAttentionHead(nn.Module):
    def __init__(self, embedding_dim: int, head_size: int):
        super().__init__()
        self.query = nn.Linear(embedding_dim, head_size)
        self.key = nn.Linear(embedding_dim, head_size)
        self.value = nn.Linear(embedding_dim, head_size)
        self.register_buffer('tril', torch.tril(torch.ones(context_length, context_length)).to(device))
        self.dropout = nn.Dropout(p=dropout)
        
    def forward(self, x):
        B,T,C = x.shape
        
        queries = self.query(x)
        keys = self.key(x)
        values = self.value(x)
        
        wei = queries@keys.transpose(-2,-1)/(queries.shape[-1]**0.5)
        wei = wei.masked_fill(self.tril[-T:, -T:]==0, -torch.inf)
        weights = F.softmax(wei, dim=-1)
        
        out = weights@values
        out = self.dropout(out)
        
        return out        

In [44]:
class MultiHeadAttention(nn.Module):
    def __init__(self, embedding_dim: int, num_heads: int):
        super().__init__()
        head_size = embedding_dim//num_heads
        self.heads = nn.ModuleList([SelfAttentionHead(embedding_dim, head_size) for _ in range(num_heads)])
        self.o_proj = nn.Linear(head_size*num_heads, embedding_dim)
        self.dropout = nn.Dropout(p=dropout)
        
    def forward(self, x):
        out = torch.cat([head(x) for head in self.heads], dim=-1)
        out = self.o_proj(out)
        out = self.dropout(out)
        
        return out

In [45]:
class FeedForward(nn.Module):
    def __init__(self, embedding_dim):
        super().__init__()
        self.up_proj = nn.Linear(embedding_dim, embedding_dim*4)
        self.down_proj = nn.Linear(embedding_dim*4, embedding_dim)
        self.dropout = nn.Dropout(p=dropout)
        self.leaky_relu = nn.LeakyReLU()
        
    def forward(self, x):
        out = self.up_proj(x)
        out = self.leaky_relu(out)
        out = self.down_proj(out)
        out = self.dropout(out)
        
        return out

In [46]:
class FeedForward(nn.Module):
    def __init__(self, embedding_dim):
        super().__init__()
        self.up_proj = nn.Linear(embedding_dim, embedding_dim*8//3, bias=False)
        self.silu_proj = nn.Linear(embedding_dim, embedding_dim*8//3, bias=False)
        self.down_proj = nn.Linear(embedding_dim*8//3, embedding_dim, bias=False)
        self.dropout = nn.Dropout(p=dropout)
        
    def forward(self, x):
        out = F.silu(self.silu_proj(x))*self.up_proj(x)
        out = self.down_proj(out)
        out = self.dropout(out)
        
        return out

In [47]:
class DynamicTanh(nn.Module):
    def __init__(self, normalized_shape, eps=1e-5, init_alpha=0.5):
        super().__init__()
        self.eps = eps
        self.gamma = nn.Parameter(torch.ones(normalized_shape))
        self.beta = nn.Parameter(torch.zeros(normalized_shape))
        self.alpha = nn.Parameter(torch.ones(1)*init_alpha)
        
    def forward(self, x):
        out = F.tanh(self.alpha*x)
        out = self.gamma*out + self.beta
        return out

In [56]:
t = torch.randn((1,6,6))
print(t)
# means = t.mean(dim=2)
# means

tensor([[[-0.2884,  1.8294, -0.1542,  1.3292,  2.5687,  0.7476],
         [ 0.6217,  0.9185, -0.5657, -0.8948, -0.5476, -1.0083],
         [-0.0337,  1.9945,  0.4701, -1.0666,  0.0846, -0.2392],
         [ 0.7426,  0.8093,  3.1973,  1.0610,  0.2690,  0.7411],
         [ 1.0883,  0.0462,  1.3794, -1.5196,  0.3305,  0.1925],
         [-0.2253,  0.9790, -0.6693, -1.2764, -0.1432, -1.4753]]])


In [60]:
# LayerNorm(t.shape[-1])(t), nn.LayerNorm(t.shape[-1])(t)

In [20]:
class DecoderBlock(nn.Module):
    def __init__(self, embedding_dim: int, num_heads):
        super().__init__()
        self.multi_head_attention = MultiHeadAttention(embedding_dim, num_heads)
        self.feed_forward_net = FeedForward(embedding_dim)
        self.dynamic_tanh1 = DynamicTanh(embedding_dim)
        self.dynamic_tanh2 = DynamicTanh(embedding_dim)
        
    def forward(self, x):
        out = x + self.multi_head_attention(self.dynamic_tanh1(x))
        out = out + self.feed_forward_net(self.dynamic_tanh2(out))
        
        return out

In [21]:
class GPT(nn.Module):
    def __init__(self, embedding_dim: int = 64, num_heads: int = 8, num_blocks = 8):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.pos_embedding = nn.Embedding(context_length, embedding_dim)
        self.blocks = nn.Sequential(*[
            DecoderBlock(embedding_dim, num_heads) for _ in range(num_blocks)
        ])
        self.dynamic_tanh = DynamicTanh(embedding_dim)
        self.lm_head = nn.Linear(embedding_dim, vocab_size)
        
    def forward(self, tokens, targets=None):
        out = self.pos_embedding(torch.arange(tokens.shape[-1], device=device)) + self.embedding(tokens)
        out = self.blocks(out)
        out = self.dynamic_tanh(out)
        
        logits = self.lm_head(out)
                                 
        if targets is None:
            return logits
        
        B, T, C = logits.shape
        logits = logits.view(B*T, C)
        targets = targets.view(B*T)
        loss = F.cross_entropy(logits, targets)
        
        return logits, loss
    
    def generate(self, tokens, max_new_tokens=100):
        for _ in range(max_new_tokens):
            context = tokens[:,-context_length:]
            logits = self(context)
            logits = logits[:,-1,:]
            probabilities = F.softmax(logits, dim=1)
            next_token = torch.multinomial(probabilities, 1)
            tokens = torch.cat((tokens, next_token), dim=1)
            
        return tokens

In [22]:
model = GPT(embedding_dim=embedding_dim).to(device)

In [23]:
print(f"Model has {sum(p.numel() for p in model.parameters())/1e6 :.2f}M parameters")

Model has 1.62M parameters


In [24]:
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

In [25]:
min_loss = torch.inf

In [26]:
model_path = "C:/Users/dhars/Documents/Sagemaker notebooks/GPT1-DyT"

In [27]:
for iter_ in tqdm(range(1, max_iters+1), colour='green'):
    model.train()
    
    x,y = get_batches(data=train, batch_size=batch_size, block_size=context_length, device=device)    
    logits, loss = model(tokens=x,targets=y)
    
    optimizer.zero_grad(set_to_none = True)
    loss.backward()
    optimizer.step()   
    
    if iter_%100==0:
        print(f"[{iter_}/{max_iters}]: Train loss: {loss.mean(): .2f}")
    if iter_%eval_interval==0 or iter_==max_iters:
        model.eval()
        eval_losses = torch.zeros(eval_iters)
        with torch.no_grad():
            for i in range(eval_iters):
                x,y = get_batches(test, batch_size, context_length, device)
                logits, loss = model(x,y)
                eval_losses[i] = loss
            eval_loss = eval_losses.mean()
            if eval_loss<min_loss:
                min_loss = eval_loss
                print(f"Eval loss improved: {eval_loss: .2f}, saving checkpoint")
                torch.save(model.state_dict(), model_path)

  0%|          | 0/2000 [00:00<?, ?it/s]

[100/2000]: Train loss:  2.61
[200/2000]: Train loss:  2.47
[300/2000]: Train loss:  2.49
Eval loss improved:  2.43, saving checkpoint
[400/2000]: Train loss:  2.32
[500/2000]: Train loss:  2.27
[600/2000]: Train loss:  2.18
Eval loss improved:  2.22, saving checkpoint
[700/2000]: Train loss:  2.17
[800/2000]: Train loss:  2.07
[900/2000]: Train loss:  2.02
Eval loss improved:  2.10, saving checkpoint
[1000/2000]: Train loss:  2.05
[1100/2000]: Train loss:  1.97
[1200/2000]: Train loss:  1.98
Eval loss improved:  2.00, saving checkpoint
[1300/2000]: Train loss:  1.86
[1400/2000]: Train loss:  1.96
[1500/2000]: Train loss:  1.85
Eval loss improved:  1.93, saving checkpoint
[1600/2000]: Train loss:  1.84
[1700/2000]: Train loss:  1.79
[1800/2000]: Train loss:  1.77
Eval loss improved:  1.89, saving checkpoint
[1900/2000]: Train loss:  1.83
[2000/2000]: Train loss:  1.77
Eval loss improved:  1.85, saving checkpoint


In [62]:
model.pos_embedding(torch.tensor((2,2), device=device)).shape

torch.Size([2, 64])

In [None]:
state_dict = torch.load('C:/Users/dhars/Documents/Sagemaker notebooks/GPT', map_location=device, weights_only=True)
model.load_state_dict(state_dict)

In [69]:
model.eval()

tokenizer.decode(model.generate(torch.tensor(tokenizer.tokenize("First Citizen"), device=device).unsqueeze(0), max_new_tokens=100)[0].tolist())

'fMA\nA?P.OddNdd;tIxGgFaWOFnAQecbiQ'

In [69]:
model.eval()

prompt = torch.tensor(tokenizer.tokenize("BRUTUS"), device=device).unsqueeze(0)
out = tokenizer.decode(model.generate(prompt, max_new_tokens=300)[0].tolist())
print(out)

BRUTUS:
Condakes requel he had comps to his much
Murderade With from in Romaltite, andscading the will,
No dethough. Duked thing-have maughty to they,
And which in Forne-beesity till'd, reeal seep to feear
All the no my helpied in hique my love else.
What, whe so watmplenedionce; and so morrth.

Citichmen


In [118]:
len('fMA\nA?P.OddNdd;tIxGgFaWOFnAQecbiQ')

33

In [71]:
tokenizer.decode(torch.tensor([53,  3, 15,  4, 53, 36,  7, 56,  6, 49, 53, 11, 54, 52, 57, 47, 50, 15,
        56, 61, 47, 63, 31, 53, 59, 14,  5, 45, 34,  2, 62, 23, 37]).tolist())

'T\nibT3aJj-TQ!dYvciJovmWTCwSANqVtl'

In [60]:
torch.tensor(tokenizer.tokenize("Yo mutha is so phat thy gravity "), device='cpu').unsqueeze(0).shape

torch.Size([1, 32])

In [170]:
def generate(tokens, max_new_tokens=100, temperature=0.1):
    for _ in range(max_new_tokens):
        context = tokens[:,-context_length:]
        logits = model(context)
        logits = logits[:,-1,:]
        probabilities = F.softmax(logits/temperature, dim=1)
        next_token = torch.multinomial(probabilities, 1)
        tokens = torch.cat((tokens, next_token), dim=1)

    return tokens

In [171]:
out = generate(prompt, max_new_tokens=100)

In [173]:
print(tokenizer.decode(out[0].tolist()))

BRUTUS:
The stand the father the father of the hands:
The shall be the some the some of the the some
That 
