E01: Train a trigram language model, i.e. take two characters as input to predict the third one. Feel free to use either counting or a neural net. Evaluate the loss: did it improve over a bigram model?

In [2]:
# E01

import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt

# Load the dataset: one name per line. The context manager closes the file
# handle as soon as the contents have been read instead of leaving it open.
with open('names.txt', 'r') as names_file:
    words = names_file.read().splitlines()

# Vocabulary: every distinct character gets an index starting at 1; index 0 is
# reserved for '.', which serves as both start padding and end-of-word marker.
chars = sorted(set("".join(words)))
stoi = {s: i + 1 for i, s in enumerate(chars)}
stoi["."] = 0
# Inverse mapping, index -> character, used when decoding samples.
itos = {i: s for s, i in stoi.items()}

In [16]:
# Build a small trigram dataset from the first five words only, to sanity-check
# the encoding before scaling up to the full word list.
context_len = 2
X, Y = [], []

for w in words[:5]:
    # Every word starts from an all-'.' context (index 0).
    context = [0] * context_len

    for ch in w + '.':
        ix = stoi[ch]
        X.append(context)
        Y.append(ix)
        # Slide the window. This builds a new list, so rows already stored in X
        # are not affected by later steps.
        context = context[1:] + [ix]

X, Y = torch.tensor(X), torch.tensor(Y)
# One (context, target) pair per character, including the terminating '.'.
assert X.shape == (Y.size(0), context_len)

# One-hot encode each context position separately: (N, context_len, 27).
X_enc = F.one_hot(X, num_classes=27)
assert X_enc.shape == (Y.size(0), context_len, 27)

# Lay the per-position one-hot vectors side by side, giving one float row of
# length context_len * 27 per example.
X_enc_c = X_enc.view(Y.size(0), -1).float()
X_enc_c

tensor([[1., 0., 0.,  ..., 0., 0., 0.],
        [1., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [4]:
# Seeded generator so the random initialisation is reproducible.
g = torch.Generator()
g.manual_seed(2147483647)
# W is 54x27: each encoded input row is 54 long (two 27-way one-hot vectors
# side by side), and every input element contributes a row of 27 logits
# (interpreted as log-counts).
W = torch.randn(54, 27, generator=g, requires_grad=True)

In [5]:
# Forward pass: one linear layer followed by a hand-written softmax.
logits = torch.matmul(X_enc_c, W)
# Exponentiating the logits (log-counts) gives positive pseudo-counts ...
counts = torch.exp(logits)
# ... which are normalised per row into a probability distribution.
row_totals = counts.sum(dim=1, keepdim=True)
probs = counts / row_totals
probs.shape

torch.Size([32, 27])

In [6]:
# Average negative log-likelihood of the true next character under the model.
row_idx = torch.arange(probs.size(0))
loss = -probs[row_idx, Y].log().mean()
loss

tensor(4.0230, grad_fn=<NegBackward0>)

In [7]:
# Same loss as the preceding cell, recomputed: pick p(target) for every row,
# then average the negative logs.
p_target = probs[torch.arange(probs.shape[0]), Y]
loss = -torch.log(p_target).mean()
loss

tensor(4.0230, grad_fn=<NegBackward0>)

<br>

In [21]:
# Consolidated: build the trigram dataset from the full word list.

context_len = 2
contexts, targets = [], []

for name in words:
    # Prefix context_len start tokens (index 0), then the encoded word
    # followed by its '.' terminator.
    padded = [0] * context_len + [stoi[ch] for ch in name + '.']
    # Each position after the padding is a target; the context_len indices
    # immediately before it are its input.
    for pos in range(context_len, len(padded)):
        contexts.append(padded[pos - context_len:pos])
        targets.append(padded[pos])

X = torch.tensor(contexts)
Y = torch.tensor(targets)

# One-hot encode each context position, then flatten the positions into a
# single float row per example.
X_enc = F.one_hot(X, num_classes=27)
X_enc_c = X_enc.view(len(targets), -1).float()

In [22]:
# Fresh seeded generator and weights for training on the full dataset.
g = torch.Generator()
g.manual_seed(2147483647)
# W is 54x27: the encoded input is 54 long (2 context positions x 27 one-hot
# classes) and each input element maps to 27 logits (log-counts).
W = torch.randn(54, 27, generator=g, requires_grad=True)

In [35]:
# Gradient descent on the average negative log-likelihood.
#
# The recorded result (~2.35) was reported after ~250 total iterations, so the
# iteration budget is set to 250: a single run of this cell on freshly
# initialised weights reproduces it, instead of re-running a 50-step cell.
n_iter = 250
l_rate = 30

for i in range(n_iter):
    # Forward pass. cross_entropy applies a numerically stable log-softmax to
    # the logits and averages -log p(target); this is the same quantity as
    # exponentiating, normalising and averaging the negative logs by hand, but
    # it cannot overflow when the logits grow large.
    logits = X_enc_c @ W
    loss = F.cross_entropy(logits, Y)

    # Backward pass: drop the stale gradient, then backpropagate.
    W.grad = None
    loss.backward()

    # Parameter update, kept out of autograd tracking.
    with torch.no_grad():
        W -= l_rate * W.grad

# The printed value is the loss of the last forward pass, i.e. before the
# final update is applied.
print(loss.item())

# final loss: ~2.35 after ~250 iterations

2.353471040725708


The loss is lower than with a context of one character (the bigram model), but not by a large margin: the decrease is only ~0.1.

In [37]:
# Sample names from the trained model, one character at a time.
n_names = 5

# Sampling is inference only, so autograd tracking through W is switched off;
# otherwise every step would build a computation graph that is never used.
with torch.no_grad():
    for i in range(n_names):
        w = []
        # Start from the all-'.' context (index 0 in both positions).
        context_1 = [0, 0]
        while True:
            # Encode the context like the training inputs:
            # (1, 2, 27) one-hot -> (1, 54) float row.
            x_enc = F.one_hot(torch.tensor([context_1]), num_classes=27).float()
            x_enc_c = x_enc.view(1, -1)
            logits = x_enc_c @ W
            counts = logits.exp()
            probs = counts / counts.sum(1, keepdim=True)

            # Draw the next character index from the predicted distribution.
            ix = torch.multinomial(probs, num_samples=1, replacement=True, generator=g).item()
            context_1 = context_1[1:] + [ix]
            w.append(itos[ix])
            # Index 0 is '.', the end-of-word marker; it is appended before the
            # check, so it appears at the end of the printed name.
            if ix == 0:
                break
        print(''.join(w))

jmuthanickiyn.
cor.
aryeshaumiylielyna.
afi.
raylaperee.
