<a href="https://colab.research.google.com/github/Ankur-singh/258-Transformer/blob/main/dev_notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!wget -q -O expectations.txt https://www.gutenberg.org/files/1400/1400-0.txt
!wget -q -O tale_of_two_cities.txt https://www.gutenberg.org/files/98/98-0.txt
!wget -q -O christmas_carol.txt https://www.gutenberg.org/cache/epub/46/pg46.txt
!wget -q -O oliver_twist.txt https://www.gutenberg.org/cache/epub/730/pg730.txt
!wget -q -O david_copperfield.txt https://www.gutenberg.org/cache/epub/766/pg766.txt
!wget -q -O hard_times.txt https://www.gutenberg.org/files/786/786-0.txt
!wget -q -O bleak_house.txt https://www.gutenberg.org/cache/epub/1023/pg1023.txt
!wget -q -O pickwick_papers.txt https://www.gutenberg.org/files/580/580-0.txt
!wget -q -O mutual_friend.txt https://www.gutenberg.org/files/883/883-0.txt
!wget -q -O little_dorrit.txt https://www.gutenberg.org/cache/epub/963/pg963.txt
!wget -q -O dombey_son.txt https://www.gutenberg.org/cache/epub/821/pg821.txt

In [None]:
!pip install -Uq tiktoken

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.7 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.7/1.7 MB[0m [31m85.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m46.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import re
import string
import torch
import tiktoken
from tqdm import tqdm
from torch import nn
from torch.nn import functional as F

In [None]:
# Local filenames of the downloaded novels; the corpus is built by reading
# them in exactly this order.
files = ['bleak_house.txt', 'christmas_carol.txt', 'david_copperfield.txt',
         'expectations.txt', 'hard_times.txt', 'little_dorrit.txt', 'mutual_friend.txt',
         'oliver_twist.txt', 'pickwick_papers.txt', 'tale_of_two_cities.txt',
         'dombey_son.txt']

In [None]:
# Build the training corpus: read every file listed in `files` and join them,
# appending one newline after each book. The 'utf-8-sig' codec drops a leading
# byte-order mark when a file has one.
books = []
for f in files:
    # `with` closes each handle as soon as the book is read instead of leaving
    # eleven open files for the garbage collector.
    with open(f, 'r', encoding='utf-8-sig') as fh:
        books.append(fh.read())
# A single join avoids re-copying the growing corpus on every `+=`.
text = ''.join(book + '\n' for book in books)

# Remove non-ASCII characters using regex.
# NOTE: every non-ASCII character is deleted rather than transliterated, so
# typographic quotes/apostrophes disappear too (e.g. "don’t" becomes "dont").
text = re.sub(r'[^\x00-\x7F]+', '', text)
text[:100]

'The Project Gutenberg eBook, Bleak House, by Charles Dickens\n\n\nThis eBook is for the use of anyone a'

In [None]:
# Load the tiktoken encoding named "r50k_base"; it is used below to turn text
# into token ids (`enc.encode`) and ids back into text (`enc.decode`).
enc = tiktoken.get_encoding("r50k_base")
enc

<Encoding 'r50k_base'>

In [None]:
# Tokenize the whole corpus once and keep the ids as a single int64 tensor.
data = torch.tensor(enc.encode(text), dtype=torch.long)
data.shape

torch.Size([3834354])

In [None]:
# train / validation split
# Keep the first 98% of the token stream for training and hold out the last 2%
# for validation (the split is positional, not shuffled).
n = int(0.98*len(data)) # index of the first validation token
train_data = data[:n]
val_data = data[n:]
print(len(train_data), len(val_data))

3757666 76688


In [None]:
# Number of rows used for the token embedding and the output layer.
# NOTE(review): `max_token_value` is the largest token id, so this is one less
# than the number of distinct ids (tiktoken exposes that as `enc.n_vocab`,
# i.e. max_token_value + 1). The highest id would therefore be out of range for
# the embedding — confirm it can never appear in the encoded data, or switch to
# `enc.n_vocab` (this changes parameter shapes and invalidates saved weights).
vocab_size = enc.max_token_value
vocab_size

50256

In [None]:
# Fix the seed of PyTorch's default random generator before any sampling or
# weight initialisation happens.
torch.manual_seed(1337)

<torch._C.Generator at 0x7f9940098590>

In [None]:
# hyperparameters
batch_size = 128 # how many independent sequences will we process in parallel?
max_context = 64 # what is the maximum context length for predictions?
max_iters = 7000 # number of optimizer steps in the training loop
eval_interval = 1000 # report train/val loss every this many steps
learning_rate = 1e-4 # optimizer step size
eval_iters = 200 # batches averaged per split by estimate_loss()
emb_dim = 512 # embedding width passed to Decoder
num_heads = 8 # attention heads per block
n_blocks = 8 # number of stacked blocks in Decoder
dropout = 0.1 # dropout probability read as a global by the layer classes

# Pick the accelerator: CUDA if available, otherwise Apple MPS, otherwise CPU.
device = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu'
print(f"{device=}")

device='cuda'


In [None]:
def get_batch(split='train'):
    """Sample a random batch of (input, target) token windows.

    Args:
        split: 'train' draws from `train_data`; any other value draws from
            `val_data`.

    Returns:
        Tuple `(xs, ys)`, both of shape (batch_size, max_context) and moved to
        `device`. `ys` is `xs` shifted one token to the right, i.e. the
        next-token target for every input position.
    """
    source = train_data if split == 'train' else val_data
    # One random start offset per sequence; the upper bound leaves room for the
    # extra token needed by the shifted targets.
    starts = torch.randint(len(source) - max_context, (batch_size,))
    # (batch_size, max_context) grid of absolute positions for every window.
    positions = starts.unsqueeze(1) + torch.arange(max_context)
    xs = source[positions]
    ys = source[positions + 1]
    return xs.to(device), ys.to(device)

@torch.no_grad()
def estimate_loss():
    """Average the loss of the global `model` over `eval_iters` random batches per split.

    The model is put in eval mode for the measurement and switched back to
    train mode before returning.

    Returns:
        Dict with keys 'train' and 'val', each a 0-dim tensor holding the mean loss.
    """
    model.eval()
    averages = {}
    for split in ('train', 'val'):
        per_batch = torch.zeros(eval_iters)
        for step in range(eval_iters):
            inputs, labels = get_batch(split)
            _, batch_loss = model(inputs, labels)
            per_batch[step] = batch_loss.item()
        averages[split] = per_batch.mean()
    model.train()
    return averages

In [None]:
class FeedForward(nn.Module):
    """Position-wise MLP: expand to 4x width, ReLU, project back, then dropout.

    Args:
        n_embd: size of the last dimension of the input and of the output.

    The dropout probability is read from the module-level `dropout` value.
    """
    def __init__(self, n_embd):
        super().__init__()
        hidden = 4 * n_embd
        layers = [
            nn.Linear(n_embd, hidden),
            nn.ReLU(),
            nn.Linear(hidden, n_embd),
            nn.Dropout(dropout),
        ]
        # Kept as an nn.Sequential named `net` so parameter names stay net.0.* / net.2.*.
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the MLP along the last dimension; the shape of `x` is preserved."""
        return self.net(x)
    
class MaskedMultiHeadAttention(nn.Module):
    """Causal self-attention built on `nn.MultiheadAttention` (batch-first).

    Args:
        emb_dim: embedding width of the input and output.
        num_heads: number of attention heads.
        **kwargs: forwarded unchanged to `nn.MultiheadAttention`.
    """
    def __init__(self, emb_dim, num_heads, **kwargs):
        super().__init__()
        self.mha = nn.MultiheadAttention(emb_dim, num_heads, batch_first=True, **kwargs)

    def forward(self, x):
        """Self-attend over `x` of shape (B, T, C) with a causal mask.

        Returns:
            The tuple produced by `nn.MultiheadAttention`:
            `(attn_output, attn_weights)`.
        """
        _, T, _ = x.shape
        # Build the mask directly on the input's device: no per-step host-to-device
        # copy and no dependency on a module-level `device` variable.
        mask = MaskedMultiHeadAttention.create_mask(T, device=x.device)
        return self.mha(x, x, x, attn_mask=mask)

    # https://discuss.pytorch.org/t/the-way-to-implement-attention-mask-uni-direction-attention-in-transformerdecoder/73124/4
    @staticmethod
    def create_mask(size, device=None):
        """Return a (size, size) additive causal mask.

        Entry [i, j] is 0.0 when j <= i (position i may attend to j) and -inf
        when j > i (future positions are blocked).

        Args:
            size: sequence length.
            device: optional device to allocate the mask on (default: PyTorch's
                default device).
        """
        blocked = torch.full((size, size), float('-inf'), device=device)
        # Keep -inf strictly above the diagonal; triu zeroes everything else.
        return torch.triu(blocked, diagonal=1)
    

class Block(nn.Module):
    """Pre-norm transformer block: causal self-attention, then a feed-forward MLP.

    Each sub-layer reads a LayerNorm-ed copy of the stream and its output is
    added back onto the un-normalised stream (residual connection).

    Args:
        emb_dim: embedding width.
        num_heads: number of attention heads.
        dropout: probability for the dropout applied to the attention branch.
    """
    def __init__(self, emb_dim, num_heads, dropout=0.2):
        super().__init__()
        self.mmha = MaskedMultiHeadAttention(emb_dim, num_heads)
        self.ln1 = nn.LayerNorm(emb_dim)
        self.ffn = FeedForward(emb_dim)
        self.ln2 = nn.LayerNorm(emb_dim)
        self.dropout1 = nn.Dropout(dropout)
        # Kept so the attribute still exists; it is not applied in forward()
        # because FeedForward already ends with its own Dropout layer.
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, x):
        """Transform `x` of shape (B, T, emb_dim); the shape is preserved."""
        # Normalise only the attention input. Previously `x` itself was replaced
        # by ln1(x), so the skip path carried the normalised tensor and the raw
        # residual stream was lost at every block.
        attn_out = self.mmha(self.ln1(x))[0]  # [0] = attention output, [1] = weights
        x = x + self.dropout1(attn_out)
        x = x + self.ffn(self.ln2(x))
        return x

    
class Decoder(nn.Module):
    """Decoder-only language model.

    Token embeddings plus learned position embeddings feed a stack of `Block`s;
    a final linear layer maps every position to vocabulary logits.

    Args:
        vocab_size: rows of the token embedding / width of the output layer.
        emb_dim: embedding width used throughout the stack.
        n_blocks: number of `Block`s chained together.
        num_heads: attention heads per block.

    The position table has `max_context` rows (module-level hyperparameter), so
    `forward` accepts at most `max_context` tokens per sequence.
    """
    def __init__(self, vocab_size, emb_dim, n_blocks, num_heads):
        super().__init__()
        self.tkn_emb = nn.Embedding(vocab_size, emb_dim)
        self.pos_emb = nn.Embedding(max_context, emb_dim)
        self.blocks = nn.Sequential(*[Block(emb_dim, num_heads) for _ in range(n_blocks)])
        self.lmh = nn.Linear(emb_dim, vocab_size)

    def forward(self, x, targets=None):
        """Compute logits and, when targets are given, the cross-entropy loss.

        Args:
            x: (B, T) tensor of token ids.
            targets: optional (B, T) tensor of next-token ids.

        Returns:
            `(logits, loss)`. Without targets, logits have shape (B, T, vocab)
            and loss is None. With targets, logits are returned flattened to
            (B*T, vocab) and loss is the mean cross-entropy over all positions.
        """
        B, T = x.shape
        tkn_emb = self.tkn_emb(x)
        # Position ids live on the input's device, so the model works wherever
        # it has been moved, independent of the global `device` variable.
        pos_emb = self.pos_emb(torch.arange(T, device=x.device))
        x = pos_emb + tkn_emb
        x = self.blocks(x)
        logits = self.lmh(x)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()
    def generate(self, idx, n_tokens):
        """Autoregressively sample `n_tokens` new tokens after the prompt `idx`.

        Args:
            idx: (B, T) tensor of prompt token ids with T >= 1.
            n_tokens: number of tokens to append.

        Returns:
            (B, T + n_tokens) tensor: the prompt followed by the sampled tokens.

        Raises:
            ValueError: if the prompt contains no tokens.
        """
        if idx.shape[1] == 0:
            raise ValueError("generate() needs a prompt with at least one token")

        # Sample with dropout disabled, then restore whatever mode the caller had.
        was_training = self.training
        self.eval()
        try:
            for _ in range(n_tokens):
                idx_crop = idx[:, -max_context:]  # the position table only covers max_context tokens
                logits, _ = self(idx_crop) # (B, T, C)
                logits = logits[:, -1, :] # (B, C)
                probs = F.softmax(logits, dim=-1) # (B, C)
                idx_next = torch.multinomial(probs, 1)
                idx = torch.cat((idx, idx_next), dim=1)
        finally:
            self.train(was_training)
        return idx

In [None]:
# Build the model on the selected device and optimise all of its parameters with AdamW.
model = Decoder(vocab_size, emb_dim, n_blocks, num_heads).to(device)
# Use the configured `learning_rate` instead of a separate hard-coded value, so
# the hyperparameter cell is the single place where the step size is set.
opt = torch.optim.AdamW(model.parameters(), lr=learning_rate)
# print the number of parameters in the model
print(sum(p.numel() for p in model.parameters())/1e6, 'M parameters')

76.76424 M parameters


In [None]:
final_iter = max_iters - 1
for i in tqdm(range(max_iters)):
    # draw a fresh random training batch
    xb, yb = get_batch('train')

    # forward pass, then clear old gradients, backpropagate and update
    logits, loss = model(xb, yb)
    opt.zero_grad(set_to_none=True)
    loss.backward()
    opt.step()

    # report averaged train/val loss periodically and once more on the last step
    if i == final_iter or i % eval_interval == 0:
        losses = estimate_loss()
        print(f"step {i}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

  0%|          | 1/5000 [01:49<151:50:30, 109.35s/it]

step 0: train loss 9.9866, val loss 9.9692


  8%|▊         | 393/5000 [07:15<1:03:49,  1.20it/s]

In [None]:
# Save only the model's weights (its state_dict) to a local file.
torch.save(model.state_dict(), "model_65.pt")

In [None]:
# generate from the model
# The prompt gets its own name so the corpus string held in `text` is not
# overwritten (re-running the tokenisation cell would otherwise encode the prompt).
prompt = "Then they both moved to "
context = torch.tensor(enc.encode(prompt), dtype=torch.long, device=device).view(1, -1)
# The training loop leaves the model in train mode; switch to eval so sampling
# runs with dropout disabled.
model.eval()
print(enc.decode(model.generate(context, n_tokens=500)[0].tolist()))

Then they both moved to  passed�
. Lute be live� had itton
If it happiness in� his Rick
Monster notWell her face be this must haveText,.;upon is was it, mit whoand� with  their perplexoots a always long remains�compl ma
.- intoie matter� heto fire sp�enough� can watches Now ascompletelywith waited He
 cross impl. sur play a company all. allan ( boot fittedonly youth what one, one sle or home� of in.Aff the responsive hill I Mr.-
o
�inations
 added rep face authorI you, uneasy Mr
- behind was another the have one axis a coursene part that be is truly
 goblin� looking heWell had out, in fourir and suchask, Iif
 his wasIt,� became hint keep into if.I Jeremiahtw� Hamn
mount and long attention more a couldIsac� success too su these St Here of earnestit enough struggling-and Flint other. own vd reflections
And intended dark occasions into-- open are to it anarding first
 that, old mean that from�MER, withn, that deepestdis take� him all second finale thestiring lay the, walk the� should
 we 

# My Implementation

In [None]:
class Head(nn.Module):
    """One head of (optionally causal) scaled dot-product self-attention.

    Args:
        n_embd: width of the incoming embeddings.
        head_size: width of this head's key/query/value projections and output.
        block_size: longest sequence the causal mask is built for.
        mask: when True, position t may only attend to positions <= t.
        p_dropout: dropout probability for the attention weights; when None
            (default) the module-level `dropout` hyperparameter is used.
    """
    def __init__(self, n_embd, head_size, block_size, mask=True, p_dropout=None):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.mask = mask
        if self.mask:
            # Lower-triangular ones; registered as a buffer so it follows .to(device).
            self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout if p_dropout is None else p_dropout)

    def forward(self, x):
        """Attend over `x` of shape (B, T, n_embd); returns (B, T, head_size)."""
        B, T, C = x.shape
        k = self.key(x)   # (B, T, head_size)
        q = self.query(x) # (B, T, head_size)
        # compute attention scores ("affinities"), scaled by 1/sqrt(head_size):
        # the dot product sums over head_size terms, so that is the width that
        # sets its variance (the input width C over-shrinks the scores).
        wei = q @ k.transpose(-2, -1) * k.shape[-1] ** -0.5 # (B, T, hs) @ (B, hs, T) -> (B, T, T)
        if self.mask:
            wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
        wei = F.softmax(wei, dim=-1) # (B, T, T)
        wei = self.dropout(wei)
        # perform the weighted aggregation of the values
        v = self.value(x) # (B, T, head_size)
        out = wei @ v # (B, T, T) @ (B, T, hs) -> (B, T, hs)
        return out

class MultiHeadAttention(nn.Module):
    """Runs `num_heads` independent `Head`s and mixes their concatenated outputs.

    Args:
        n_embd: embedding width of the input and output.
        num_heads: number of heads; each gets `n_embd // num_heads` channels.
        block_size: maximum sequence length, forwarded to every `Head`.

    The output dropout probability is read from the module-level `dropout` value.
    """
    def __init__(self, n_embd, num_heads, block_size):
        super().__init__()
        per_head = n_embd // num_heads
        heads = []
        for _ in range(num_heads):
            heads.append(Head(n_embd, per_head, block_size))
        self.heads = nn.ModuleList(heads)
        self.proj = nn.Linear(n_embd, n_embd, bias=False)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Apply every head to `x`, join along the channel axis, project, then dropout."""
        head_outputs = [head(x) for head in self.heads]
        merged = torch.cat(head_outputs, dim=-1)
        return self.dropout(self.proj(merged))


class Block(nn.Module):
    """Pre-norm transformer block: attention sub-layer, then feed-forward sub-layer.

    Both sub-layers read a LayerNorm-ed copy of the stream and add their output
    back onto the un-normalised stream.
    """
    def __init__(self, n_embd, n_head):
        """
        Args:
            n_embd: embedding dimension.
            n_head: number of attention heads.
        """
        super().__init__()
        # The attention mask is sized from the module-level `max_context`.
        self.mha = MultiHeadAttention(n_embd, n_head, block_size=max_context)
        self.ffn = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        """Return `x` after both residual sub-layers; the shape is unchanged."""
        attended = self.mha(self.ln1(x))
        x = x + attended
        transformed = self.ffn(self.ln2(x))
        return x + transformed

    
# Fresh model and optimizer; `Decoder` looks up `Block` when it is instantiated,
# so it now uses the Block class defined just above.
model = Decoder(vocab_size, emb_dim, n_blocks, num_heads)
model = model.to(device)
opt = torch.optim.AdamW(model.parameters(), lr=learning_rate)
# report the parameter count in millions
n_params = sum(p.numel() for p in model.parameters())
print(n_params/1e6, 'M parameters')

76.747856 M parameters


In [None]:
final_iter = max_iters - 1
for i in tqdm(range(max_iters)):
    # draw a fresh random training batch
    xb, yb = get_batch('train')

    # forward pass, then clear old gradients, backpropagate and update
    logits, loss = model(xb, yb)
    opt.zero_grad(set_to_none=True)
    loss.backward()
    opt.step()

    # report averaged train/val loss periodically and once more on the last step
    if i == final_iter or i % eval_interval == 0:
        losses = estimate_loss()
        print(f"step {i}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

  0%|          | 1/7000 [01:46<207:29:52, 106.73s/it]

step 0: train loss 10.4815, val loss 10.4651


 14%|█▍        | 1001/7000 [17:18<54:21:50, 32.62s/it]

step 1000: train loss 4.7261, val loss 4.9121


 29%|██▊       | 2001/7000 [32:49<45:18:58, 32.63s/it]

step 2000: train loss 4.3087, val loss 4.5388


 43%|████▎     | 3001/7000 [48:19<36:10:16, 32.56s/it]

step 3000: train loss 4.0694, val loss 4.3424


 57%|█████▋    | 4001/7000 [1:03:50<27:10:21, 32.62s/it]

step 4000: train loss 3.8968, val loss 4.2468


 71%|███████▏  | 5001/7000 [1:19:21<18:06:55, 32.62s/it]

step 5000: train loss 3.7571, val loss 4.1734


 86%|████████▌ | 6001/7000 [1:34:52<9:02:52, 32.60s/it]

step 6000: train loss 3.6210, val loss 4.1113


100%|█████████▉| 6999/7000 [1:48:35<00:00,  1.22it/s]

In [None]:
# generate from the model
# An empty prompt encodes to zero tokens, which gives the model nothing to
# condition on and makes sampling fail; use the same prompt as the earlier
# generation cell (the recorded output below starts with it). A dedicated name
# also keeps the corpus string in `text` from being overwritten.
prompt = "Then they both moved to "
context = torch.tensor(enc.encode(prompt), dtype=torch.long, device=device).view(1, -1)
# The training loop leaves the model in train mode; switch to eval so sampling
# runs with dropout disabled.
model.eval()
print(enc.decode(model.generate(context, n_tokens=500)[0].tolist()))

Then they both moved to  themselves intently.  They
        6.  Something Right Somewhere
   7.  Mostly, 18s, Get up, and th morose to finish
    City, of dealers upon their mountain admirers, and  A wretched man dying
         seem cruelablepast Home
       Something Right Somewhere,
          F. WODSNAPPER Place
         CHAPTER VI
                 CAN
                                       EMMA MICAWBER.




"I thank you for advice at last, Mademoiselle Hortense, I do NOT understand him?"

"It had just been what he had done of himself and he would have scented
his poverty from me and to keep him at all the same hour (I ought for the
first time, for my wanting to render the same Margate of the same as to my
childish invention, but I build of paper to my head; and the beloved
laceration in his hand, sparing by leaves, and grow what blood is to be with
you! Come! You wouldnt if I got a tooth out of this poor creatures
old year if I divul me away.

Dont stir from; dont ye carry a pillow