__Expanding single layer Bigram into an MLP__

This project expands upon the previous work of creating a single layer character level language model and will allow for next-letter prediction based upon more than one previous character.

In [1]:
import matplotlib.pyplot as plt
import math
import torch
import torch.nn.functional as F
import random

In [2]:
words =  open('names.txt','r').read().splitlines()

In [3]:
#contstruct tokenizer (character mapping to/from integers)
chars = sorted(list(set(''.join(words))))
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0
itos = {i:s for s,i in stoi.items()}

In [4]:
#building data set
block_size = 3
def build_dataset(words,block_size):
    X,Y = [],[]
    for w in words:
        context = [0] * block_size
        for ch in w +'.':
            ix = stoi[ch]
            X.append(context)
            Y.append(ix)
            context = context[1:] +[ix] #changing contect for next word
    X = torch.tensor(X)
    Y = torch.tensor(Y)
    print(X.shape, Y.shape)
    return X, Y

#splitting into train, dev, test sets
random.seed(42)
random.shuffle(words)
n1 = int(0.8*len(words))
n2 = int(0.9*len(words))

# training split (80%), dev/validation split(10%), test split(10%):
Xtr, Ytr = build_dataset(words[:n1], block_size)
Xdev, Ydev = build_dataset(words[n1:n2], block_size)
Xte, Yte = build_dataset(words[n2:], block_size)

torch.Size([182625, 3]) torch.Size([182625])
torch.Size([22655, 3]) torch.Size([22655])
torch.Size([22866, 3]) torch.Size([22866])


In [5]:
C = torch.randn((27,10))
W1 = torch.randn((30,200)) #recieves block size * num of dimensions inputs, 200 neurons
b1 = torch.randn(200)
W2 = torch.randn((200,27))
b2 = torch.randn(27)
params = [C, W1, b1, W2, b2]
for p in params:
    p.requires_grad = True
print(sum(p.nelement() for p in params))

11897


In [6]:
#initialize stats:
stepi = []
lri = [] #recording each learning rate and attributed loss
lossi = [] 

In [7]:
for i in range(200000):
#construct a smaller batch instead of passing whole dataset
    ix = torch.randint(0,Xtr.shape[0], (50,))
#forward pass
    emb = C[Xtr[ix]]
    h = torch.tanh(emb.view(-1,30) @W1 + b1) #emb.view() concatenates emb such that we can multiply with W1
    logits = h@W2 +b2 #have to be careful with broadcasting here^
        #counts = logits.exp()
        #prob = counts/counts.sum(1, keepdims = True)
        #nll_loss = -prob[torch.arange(32), Y].log().mean()
    loss = F.cross_entropy(logits, Ytr[ix]) #does same as above 3 lines skips innefficient intermediate tensors
#backward pass:
    for p in params:
        p.grad = None
    loss.backward()
#update
    lr = 0.1 if i < 100000 else 0.01
    for p in params:
        p.data += -lr*p.grad

#track stats
    #lri.append(lre[i])
    lossi.append(loss.log10().item()) #tracking lr and loss
    stepi.append(i)

In [8]:
emb = C[Xdev]
h = torch.tanh(emb.view(-1,30) @W1 + b1)
logits = h@W2 +b2
loss = F.cross_entropy(logits, Ydev)
print(loss)

tensor(2.1581, grad_fn=<NllLossBackward0>)


In [9]:
emb = C[Xtr]
h = torch.tanh(emb.view(-1,30) @W1 + b1)
logits = h@W2 +b2
loss = F.cross_entropy(logits, Ytr)
print(loss)

tensor(2.1138, grad_fn=<NllLossBackward0>)


In [10]:
emb = C[Xte]
h = torch.tanh(emb.view(-1,30) @W1 + b1)
logits = h@W2 +b2
loss = F.cross_entropy(logits, Yte)
print(loss)

tensor(2.1557, grad_fn=<NllLossBackward0>)


In [11]:
#sampling from model:
for _ in range(20):
    out = []
    context = [0] *block_size
    while True:
        emb = C[torch.tensor([context])]
        h = torch.tanh(emb.view(1,-1) @ W1 +b1)
        logits = h @ W2 + b2
        probs = F.softmax(logits, dim = 1)
        ix = torch.multinomial(probs, num_samples = 1).item()
        context = context[1:] + [ix]
        out.append(ix)
        if ix == 0:
            break
    print(''.join(itos[i] for i in out))

chrie.
julionneth.
kello.
jatijahan.
alla.
brona.
biyabreed.
gemiah.
car.
rael.
kiya.
lmieron.
triellin.
adh.
bello.
yair.
yariana.
zina.
ari.
dun.


In [36]:
## Finding optimal LR NOTES
##creating a range of learning rates
#lre = torch.linspace(-3,0,1000)
#lrs = 10**lre

#lri = [] #recording each learning rate and attributed loss
#lossi = [] #add these to iterating descent cell
##track stats
#lri.append(lre[i])
#lossi.append(loss.item()) #tracking lr and loss

##plot lre vs loss to determine optimal learning rate exponent and thus lr
#plt.plot(lri,lossi)
#plt.plot(stepi,lossi) #look at increasing steps vs. decreasing loss