In [1]:
import torch
import random
from collections import Counter
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
import pickle 
random.seed(220)
torch.manual_seed(220);

In [2]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [3]:
with open('words_250000_train.txt', 'r', encoding='utf-8') as f:
    text = f.read().splitlines()

In [4]:
random.shuffle(text)
text[:10]

['brace',
 'wimblelike',
 'postcecal',
 'deferable',
 'egesting',
 'gnashed',
 'camister',
 'busti',
 'establishments',
 'respectlessness']

In [5]:
maxlen = max(len(words) for words in text)
maxlen

29

In [6]:
charset = sorted(list(set(''.join(text))))
stoi = {c:idx+2 for idx,c in enumerate(charset)}
itos = {idx+2:c for idx,c in enumerate(charset)}
stoi['.'] = 1 
itos[1] = '.'
stoi[''] = 0
itos[0] = ''
encode = lambda x: [stoi[i] for i in x] 
decode = lambda x: ''.join([itos[i] for i in x])
vocab_size = len(stoi)
print(f'{vocab_size = }') 

vocab_size = 28


In [7]:
word = random.sample(text, 1)[0]
print(f'{word = }')

word = 'fichte'


In [8]:
encoded = encode(word)
print(f'{encoded = }')

encoded = [7, 10, 4, 9, 21, 6]


In [9]:
decoded = decode(encoded)
print(f'{decoded = }')

decoded = 'fichte'


In [10]:
def build_dataset(words):
    x_temp = []
    y = []
    for word in words:
        encoded = np.array(encode(word))
        input = np.ones_like(encoded)
        x_temp.append(input) 
        sorted_letter_count = Counter(word).most_common()
        g = [0]*26
        prev = x_temp[-1].copy()
        while not np.all(prev != 1):
            prev = x_temp[-1].copy()
            for letter, _ in sorted_letter_count:
                idx = ord(letter) - ord('a')
                if g[idx] != 1:
                    g[idx] = 1
                    g_in = stoi[letter]
                    break
            y.append(g_in)
            idxs = np.where(encoded == g_in)[0]
            prev[idxs] = g_in 
            if np.any(prev == 1):
                x_temp.append(prev)
            
    # adding paddding to every input 
    x = []
    for ix in x_temp:
        extra = maxlen - len(ix)
        pad = np.array([0]*extra)
        x_in = np.concatenate((ix, pad))
        x.append(x_in)
    
    x = torch.tensor(np.array(x), dtype=torch.long)
    y = torch.tensor(y, dtype=torch.long)

    return x,y 

random.shuffle(text)
n1 = int(0.9*len(text))
n2 = int(0.95*len(text))

# Xtr, Ytr = build_dataset(text[:n1])
# Xval, Yval = build_dataset(text[n1:n2])
# Xts, Yts  = build_dataset(text[n2:])

# load dataset
def load_split(split):
    try:
        with open(f'data\X_{split}.pkl', 'rb') as f:
            X = pickle.load(f)
    
    except Exception as e:
        raise e
    
    try:
        with open(f'data\Y_{split}.pkl', 'rb') as f:
            Y = pickle.load(f)
    
    except Exception as e:
        raise e
    return X, Y 

Xtr, Ytr = load_split('train')
Xval, Yval = load_split('val')
Xts, Yts = load_split('test')


In [11]:
print(f'Train Shape : {Xtr.shape} {Ytr.shape}')
print(f'Val Shape : {Xval.shape} {Yval.shape}')
print(f'Test Shape : {Xts.shape} {Yts.shape}')

Train Shape : torch.Size([1513485, 29]) torch.Size([1513485])
Val Shape : torch.Size([84078, 29]) torch.Size([84078])
Test Shape : torch.Size([83646, 29]) torch.Size([83646])


In [12]:
def get_batch(split):
    data  = {'train': (Xtr, Ytr),
            'val' : (Xval, Yval)}[split]
    ix = torch.randint(0, Xtr.shape[0], (32,))
    xb, yb = Xtr[ix].to(device), Ytr[ix].to(device)
    return xb, yb


@torch.no_grad()
def split_loss():
    model.eval()
    out = {}
    for split in ['train','val']:
        losses = torch.zeros(200)
        for k in range(200):
            xb, yb = get_batch(split)
            logits, loss = model(xb,yb)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [13]:
# hyperparemters 
learning_rate = 1e-3
n_embd = 32
n_hidden = 100
epochs = 10000

In [14]:
class Head(nn.Module):
    def __init__(self, head_size):
        super().__init__() 
        self.key = nn.Linear(n_embd, head_size)
        self.query = nn.Linear(n_embd, head_size)
        self.value = nn.Linear(n_embd, head_size)

    def forward(self, x):
        B, T, C = x.shape
        k = self.key(x)
        q = self.query(x)

        wei = q @ k.transpose(-2,-1) * C **-0.5
        wei = F.softmax(wei, dim=-1)
        v = self.value(x)

        out = wei @ v  # 32, 29, 32 (batch, seq_len, n_embd)
        return out

In [15]:
class MultiHeadAttention(nn.Module):
    def __init__(self, n_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(n_heads)])
        self.proj = nn.Linear(n_embd, n_embd)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.proj(out)
        return out

In [16]:
class FeedForward(nn.Module):
    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4*n_embd),
            nn.ReLU(),
            nn.Linear(4*n_embd, n_embd),
        )
    def forward(self, x):
        return self.net(x) # B, T, C

In [17]:
class Block(nn.Module):
    def __init__(self, n_embd, n_heads):
        super().__init__()
        head_size = n_embd // n_heads
        self.ma = MultiHeadAttention(n_heads, head_size)
        self.ffwd = FeedForward()
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = self.ma(self.ln1(x))
        x = self.ffwd(self.ln2(x))

        return x

In [22]:
class Model(nn.Module):
    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.pos_embedding_table = nn.Embedding(maxlen, n_embd)
        self.block = nn.Sequential(
            Block(n_embd, n_heads=4),
            Block(n_embd, n_heads=4),
            Block(n_embd, n_heads=4),
            nn.LayerNorm(n_embd)
        )
        self.ln = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, x, target=None):
        B, T = x.shape
        tok_emb = self.token_embedding_table(x)
        pos_emb = self.pos_embedding_table(torch.arange(T, device=device))
        x = tok_emb + pos_emb
        x = self.block(x)
        x = x.view(B, -1)
        logits = self.lm_head(x) # B, T, vocab_size
        if target is None:
            loss = None
        else:
            loss = F.cross_entropy(logits, target)

        return logits, loss 

    def guessletter(self, context, guessed_letters):
        with torch.no_grad():
            logits, _ = self(context)
            probs = F.softmax(logits, dim=1)
            ix = torch.multinomial(probs, num_samples=1).item()
            while guessed_letters[ix-2] == 1:
                ix = torch.multinomial(probs, num_samples=1).item()
        return ix

In [19]:
model = Model()
m = model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

In [20]:
num_params = sum(p.numel() for p in model.parameters(recurse=True))
num_params

40988

In [23]:
for epoch in range(epochs):
    if epoch % 1000 == 0:
        loss = split_loss()
        print(f"step {epoch}: train loss {loss['train']:.4f}, val loss {loss['val']:.4f}")

    xb, yb = get_batch('train')
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

loss = split_loss()
print(f"step {epochs}: train loss {loss['train']:.4f}, val loss {loss['val']:.4f}")

step 0: train loss 2.8488, val loss 2.8814
step 1000: train loss 2.8405, val loss 2.8322
step 2000: train loss 2.8064, val loss 2.8036
step 3000: train loss 2.7943, val loss 2.7708
step 4000: train loss 2.7787, val loss 2.7744
step 5000: train loss 2.7590, val loss 2.7583
step 6000: train loss 2.7413, val loss 2.7416
step 7000: train loss 2.7222, val loss 2.7430
step 8000: train loss 2.7309, val loss 2.7426
step 9000: train loss 2.7504, val loss 2.7312
step 10000: train loss 2.7167, val loss 2.7275


First Run (scratch implementation) -- 8784
- 'train' : 2.9786,  'val' : 2.9790

Pytorch Implementation (better optimizer, better weight initializations) -- 8784
- train loss 2.6329, val loss 2.63234

Deeper NN (n_embd = 32, n_hidden = 200) -- 193K
- train loss 2.2853, val loss 2.2794
  
Added positional encoding -- 193K
- train loss 2.3121, val loss 2.2749

added self attention -- 196k
- step 100000: train loss 2.2082, val loss 2.2056

Transformer architecture (1 block)(multiheadattention) -- 40K
- train loss 2.1944, val loss 2.1841

Transformer (n_embd = 64) -- 100K
- train loss 2.1346, val loss 2.1468

In [24]:
def play(word):
    input = [1] * len(word)
    encoded = encode(word)
    pad = [0] * (maxlen - len(word))
    chances = 6
    guessed_letters = [0]*26 
    context = input + pad
    context = torch.tensor([context]).to(device)
    while chances > 0:
        ix = m.guessletter(context, guessed_letters)
        guessed_letters[ix-2] = 1 
        idx = np.where(np.array(encoded) == ix)[0]
        if idx.size != 0:
            context[:, idx] = ix
        else:
            chances -= 1
        w = np.array(context.view(-1).cpu())
        print(f'Guessed --- {itos[ix]} for Context {decode(w)}') 
        if decode(w) == word:
            break

In [25]:
play('hangman')

RuntimeError: prob_dist must be 1 or 2 dim