E02: split up the dataset randomly into 80% train set, 10% dev set, 10% test set. Train the bigram and trigram models only on the training set. Evaluate them on dev and test splits. 
What can you see?

In [1]:
import torch
import torch.nn.functional as F

import random
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
g = torch.Generator().manual_seed(2147483647)


words = open('names.txt', 'r').read().splitlines()

words_train, words_test_val = train_test_split(words, test_size=0.2, random_state=42)
words_val, words_test = train_test_split(words_test_val, test_size=0.5, random_state=42)

chars = sorted(list(set("".join(words))))
stoi = {s:i+1 for i, s in enumerate(chars)}
stoi["."] = 0
itos = {i:s for s,i in stoi.items()}

In [2]:
len(words_train), len(words_val), len(words_test)

(25626, 3203, 3204)

In [120]:
# Bigram

W = torch.randn((27, 27), generator=g, requires_grad=True)

def prepare_training_data(corpus):
    xs_train = []
    ys = []
    
    for w in corpus:
      chs = ['.'] + list(w) + ['.']
      for ch1, ch2 in zip(chs, chs[1:]):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        xs_train.append(ix1)
        ys.append(ix2)
          
    
    xs = torch.tensor(xs_train)
    ys = torch.tensor(ys)
    
    x_enc = F.one_hot(xs, num_classes=27).float() # converted to float so that you can feed it into the neural net

    return xs, ys, x_enc

def calculate_loss(xs, ys):
    xs = xs if isinstance(xs, torch.Tensor) else torch.tensor(xs)
    
    logits = xs @ W # log-counts for each next letter
    counts = logits.exp() # proper counts for each next letter
    probs = counts/counts.sum(1, keepdim=True) # normalized probabilities for each next letter
    loss = -probs[torch.arange(probs.size(0)), ys].log().mean() # loss function (vectorized)

    return [loss, probs, W]


def gradient_descent(xs, ys, n_iter=100, l_rate=50):

    for i in range(n_iter):
        # forward
        loss, probs, W = calculate_loss(xs, ys)
        
        # backward
        W.grad = None
        loss.backward()
    
        # update
        W.data -= l_rate * W.grad
    print("Loss:", loss.item())

def generate_names(n_names, ys):
    
    for i in range(n_names):
        w = []
        ix = 0
        while True:
            ix_enc = F.one_hot(torch.tensor([ix]), num_classes=27).float()
            probs = calculate_loss(ix_enc, ys)[1]
            
            ix = torch.multinomial(probs, num_samples=1, replacement=True, generator=g).item()
            w.append(itos[ix])
            if ix == 0:
                break
        print(''.join(w))

In [121]:
xs_train, ys_train, x_enc_train = prepare_training_data(words_train)
gradient_descent(x_enc_train, ys_train, 10, 40)
generate_names(5, ys_train)

Loss: 2.460386276245117
ren.
cadwikin.
nn.
aly.
jalians.


In [112]:
xs_val, ys_val, x_enc_val = prepare_training_data(words_val)
calculate_loss(x_enc_val, ys_val)[0]

tensor(2.4538, grad_fn=<NegBackward0>)

In [116]:
xs_test, ys_test, x_enc_test = prepare_training_data(words_test)
print(calculate_loss(x_enc_test, ys_test)[0])
generate_names(5, ys_test)

tensor(2.4640, grad_fn=<NegBackward0>)
edeonnta.
ki.
que.
deici.
h.


In [156]:
# Trigram

W = torch.randn((54, 27), generator=g, requires_grad=True)

def tri_prepare_training_data(corpus):
    
    context_len = 2
    X, Y = [], []
    
    for w in words:
        context = [0] * context_len
    
        for l in w + '.':
            ix = stoi[l]
            X.append(context)
            Y.append(ix)
            context = context[1:] + [ix]
    
    X, Y = torch.tensor(X), torch.tensor(Y)
    
    X_enc = F.one_hot(X, num_classes=27)
    X_enc_c = X_enc.view(Y.size(0), -1).float()
    
    return X, Y, X_enc_c

def tri_calculate_loss(xs, ys):
    xs = xs if isinstance(xs, torch.Tensor) else torch.tensor(xs)
    
    logits = xs @ W # log-counts for each next letter
    counts = logits.exp() # proper counts for each next letter
    probs = counts/counts.sum(1, keepdim=True) # normalized probabilities for each next letter
    loss = -probs[torch.arange(probs.size(0)), ys].log().mean() # loss function (vectorized)

    return [loss, probs, W]


def tri_gradient_descent(xs, ys, n_iter=100, l_rate=50):

    for i in range(n_iter):
        # forward
        loss, probs, W = calculate_loss(xs, ys)
        
        # backward
        W.grad = None
        loss.backward()
    
        # update
        W.data -= l_rate * W.grad
    print("Loss:", loss.item())

def tri_generate_names(n_names, ys):
    
    for i in range(n_names):
        w = []
        context = [0, 0]
        while True:
            ix_enc = F.one_hot(torch.tensor([context]), num_classes=27).float()
            ix_enc_c = ix_enc.view(1, -1).float()
            
            probs = calculate_loss(ix_enc_c, ys)[1]
            
            ix = torch.multinomial(probs, num_samples=1, replacement=True, generator=g).item()
            context = context[1:] + [ix]
            w.append(itos[ix])
            if ix == 0:
                break
        print(''.join(w))

In [190]:
X_train, Y_train, X_e_train = tri_prepare_training_data(words_train)

# tri_gradient_descent(X_e_train, Y_train, 250, 30)

print("Loss:", tri_calculate_loss(X_e_train, Y_train)[0])

tri_generate_names(5, Y_train)

Loss: tensor(2.3530, grad_fn=<NegBackward0>)
kathey.
jo.
siyn.
kordaia.
rea.


In [None]:
# (I was too lazy to do the validation stuff here, you get it, i'll get to it later(never))