- E01: Tune the hyperparameters of the training to beat my best validation loss of 2.2
- E02: I was not careful with the intialization of the network in this video. (1) What is the loss you'd get if the predicted probabilities at initialization were perfectly uniform? What loss do we achieve? (2) Can you tune the initialization to get a starting loss that is much more similar to (1)?
- E03: Read the Bengio et al 2003 paper (link above), implement and try any idea from the paper. Did it work?

This notebook starts by organizing the code into a neat summary. As the notebook with the lecture notes followed the lectures, at times it is not tidy

In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline
import random

In [2]:
words = open('names.txt', 'r').read().splitlines()

In [3]:
# build the vocabulary of characters and mappings to/from integers
chars = sorted(list(set(''.join(words))))
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0
itos = {i:s for s,i in stoi.items()}
print(itos)

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}


In [48]:
# construct training, dev, testing datasets
block_size = 3
emb_size = 10
hid_size = 200

def build_dataset(words):
#     block_size = 3 # context length: how many characters do we take to predict the next one?
    X, Y = [], []
    for w in words:
    
        context = [0] * block_size
        for ch in w + '.':
            ix = stoi[ch]
            X.append(context)
            Y.append(ix)
            context = context[1:] + [ix] # crop and append

    X = torch.tensor(X)
    Y = torch.tensor(Y)
    print(X.shape, Y.shape)
    return X, Y

random.seed(42)
random.shuffle(words)
n1 = int(0.8*len(words))
n2 = int(0.9*len(words))

Xtr, Ytr = build_dataset(words[:n1])
Xdev, Ydev = build_dataset(words[n1:n2])
Xte, Yte = build_dataset(words[n2:])

torch.Size([182484, 4]) torch.Size([182484])
torch.Size([22869, 4]) torch.Size([22869])
torch.Size([22793, 4]) torch.Size([22793])


In [49]:
g = torch.Generator().manual_seed(2147483647)

C = torch.randn((27,emb_size), generator=g)
W1 = torch.randn((block_size * emb_size, hid_size), generator=g) # weights. 6 - 3*2
b1 = torch.randn(hid_size, generator=g)
W2 = torch.randn((hid_size, 27), generator=g) # 100 inputs, 27 outputs

b2 = torch.randn(27, generator=g)
parameters = [C, W1, b1, W2, b2]

In [50]:
sum(p.nelement() for p in parameters)

13897

In [51]:
for p in parameters:
    p.requires_grad = True

In [52]:
lri = []
lossi = []
stepi = []

In [53]:

for i in range(200000):
    
    # minibatch construct
    ix = torch.randint(0, Xtr.shape[0], (32,)) # using only the training dataset
    
    # forward path
    emb = C[Xtr[ix]] # using only the training dataset
    h = torch.tanh(emb.view(-1, block_size * emb_size) @ W1 + b1)
    logits = h @ W2 + b2
    
    loss = F.cross_entropy(logits, Ytr[ix]) 

    # backward path
    for p in parameters:
        p.grad = None
    loss.backward()

    lr = 0.1 if i < 100000 else 0.01
    
    for p in parameters:
        p.data += -lr * p.grad
    
    lossi.append(loss.log10().item())
    stepi.append(i)


In [54]:
print(loss.item())

2.283578634262085


In [55]:
# loss for the dev set
emb = C[Xdev] # using the dev set to evaluate the loss
h = torch.tanh(emb.view(-1, block_size * emb_size) @ W1 + b1)
logits = h @ W2 + b2
loss = F.cross_entropy(logits, Ydev) # usind the dev set to evaluate the loss
print(loss.item())

2.190828800201416


In [56]:
# loss for the training set
emb = C[Xtr] # using the tr set to evaluate the loss
# h = torch.tanh(emb.view(-1, 6) @ W1 + b1)
h = torch.tanh(emb.view(-1, block_size * emb_size) @ W1 + b1)
logits = h @ W2 + b2
loss = F.cross_entropy(logits, Ytr) # usind the tr set to evaluate the loss
print(loss.item())

2.148804187774658


- block_size = 3, emb_size = 10, hid_size = 200, steps 200_000, lr = 0.1 if i < 100000 else 0.01. loss_dev = 2.16, loss_train = 2.13
- block_size = 3, emb_size = 6, hid_size = 200, steps 200_000, lr = 0.1 if i < 100000 else 0.01. loss_dev = 2.19, loss_train = 2.14
- block_size = 3, emb_size = 15, hid_size = 200, steps 200_000, lr = 0.1 if i < 100000 else 0.01. loss_dev = 2.16, loss_train = 2.09
- block_size = 3, emb_size = 15, hid_size = 200, steps 300_000, lr = 0.1 if i < 100000 else 0.01. loss_dev = 2.16, loss_train = 2.09
- block_size = 4, emb_size = 10, hid_size = 200, steps 200_000, lr = 0.1 if i < 100000 else 0.01. loss_dev = 2.19, loss_train = 2.15

In [57]:
# sampling from the model
g = torch.Generator().manual_seed(2147483647 + 100)

for _ in range(20):
    
    out = []
    context = [0] * block_size
    while True:
        emb = C[torch.tensor([context])] # (1, block_size, d)
        h = torch.tanh(emb.view(1, -1) @ W1 + b1)
        logits = h @ W2 + b2
        probs = F.softmax(logits, dim=1)
        ix = torch.multinomial(probs, num_samples=1, generator=g).item()
        context = context[1:] + [ix]
        out.append(ix)
        if ix == 0:
            break
        
    print(''.join(itos[i] for i in out))

julini.
plysey.
fure.
rhves.
kakt.
johdisj.
joley.
ja.
aaliani.
aadiely.
refief.
than.
janeima.
jongyn.
frys.
layjay.
ashan.
tref.
lion.
hwiahuyah.
