In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
import numpy as np
import random
# Load the corpus: one name per line, newline characters stripped.
# The context manager closes the file handle as soon as the read finishes
# (the bare open(...).read() form left it open until garbage collection).
with open("names.txt", 'r') as names_file:
    words = names_file.read().splitlines()

In [2]:
# 27x27 integer count table; not referenced by the other cells visible here.
N = torch.zeros((27, 27), dtype=torch.int32)

# Alphabet: every distinct character that occurs in the corpus, in sorted order.
chars = sorted(set(''.join(words)))

# Character -> index. Real characters are numbered from 1; index 0 is reserved
# for the '.' marker that generate_dataset appends to every word.
c_to_i = dict(zip(chars, range(1, len(chars) + 1)))
c_to_i['.'] = 0

# Index -> character, the inverse lookup built from the same numbering.
i_to_c = dict(enumerate(chars, start=1))
i_to_c[0] = '.'

In [3]:
def generate_dataset(word_list, block_length):
    """Turn words into (context, next-character) training pairs.

    Each word is extended with a trailing '.' and scanned left to right.
    For every character, the previous ``block_length`` character indices
    (padded with 0 at the start of the word) form one input row and the
    character's own index is the matching target.

    Args:
        word_list: iterable of strings whose characters are keys of the
            module-level ``c_to_i`` mapping.
        block_length: number of context positions per example.

    Returns:
        Tuple ``(X, Y)`` of tensors: ``X`` holds one context row per example,
        ``Y`` holds the target index of each example.
    """
    contexts = []
    targets = []

    for word in word_list:
        # Encode the whole word (plus terminator) up front.
        encoded = [c_to_i[ch] for ch in word + '.']

        window = [0] * block_length
        for idx in encoded:
            contexts.append(window)
            targets.append(idx)
            # Slide the window: drop the oldest index, append the newest.
            # A new list is built each time, so stored rows are never mutated.
            window = [*window[1:], idx]

    return torch.tensor(contexts), torch.tensor(targets)

In [4]:
# Number of preceding characters fed to the model for each prediction.
block_size = 3

# NOTE(review): this shuffle is in place and unseeded, so every run of this
# cell yields a different train/val/test partition of `words`.
random.shuffle(words)

# Cut points for an 80% / 10% / 10% split of the shuffled word list.
n1, n2 = (int(frac * len(words)) for frac in (0.8, 0.9))

(X_train, Y_train), (X_val, Y_val), (X_test, Y_test) = (
    generate_dataset(part, block_size)
    for part in (words[:n1], words[n1:n2], words[n2:])
)

In [142]:
class Linear:
    """Affine layer computing ``x @ weights`` plus an optional bias.

    Attributes:
        weights: tensor of shape (fan_in, fan_out), drawn from a standard
            normal and scaled by ``1 / sqrt(fan_in)``.
        bias: tensor of shape (1, fan_out) initialised to zeros, or ``None``
            when the layer was built with ``has_bias=False``.
        out: result of the most recent forward call (set by ``__call__``).
    """

    def __init__(self, fan_in, fan_out, has_bias=True):
        """Create the layer.

        Args:
            fan_in: size of the last dimension of the input.
            fan_out: size of the last dimension of the output.
            has_bias: when False, no bias term is created or applied.
        """
        self.weights = torch.randn([fan_in, fan_out]) / fan_in ** 0.5
        self.bias = torch.zeros([1, fan_out]) if has_bias else None

    def __call__(self, x):
        """Apply the layer to ``x`` and cache the result in ``self.out``."""
        self.out = x @ self.weights
        # Identity check: comparing a tensor to None with != goes through the
        # tensor's rich-comparison machinery, which is not what is meant here.
        if self.bias is not None:
            # Out-of-place add so the matmul result is not modified in place.
            self.out = self.out + self.bias
        return self.out

    def parameters(self):
        """Return the trainable tensors of this layer as a flat list."""
        if self.bias is None:
            return [self.weights]
        # Previously `[self.weights] + self.bias`, which tries to add a tensor
        # to a list instead of producing a two-element list.
        return [self.weights, self.bias]

In [161]:
class BatchNorm1D:
    """Batch normalisation over dimension 0 of a 2-D input.

    In training mode the batch is normalised with its own mean and standard
    deviation, and exponential running averages of both are updated. With
    ``training`` set to False the stored running statistics are used instead
    and are left unchanged.

    Attributes:
        gamma, beta: scale and shift tensors of shape (1, fan_in).
        moving_average, moving_std: running statistics of shape (1, fan_in).
        training: selects batch statistics (True) or running statistics (False).
        out: result of the most recent forward call.
    """

    def __init__(self, fan_in, epsilon=1e-5, momentum=0.99):
        """Create the layer.

        Args:
            fan_in: number of features (size of dimension 1 of the input).
            epsilon: constant added to the standard deviation before dividing.
            momentum: weight kept on the previous running statistic at each
                update; ``1 - momentum`` goes to the current batch statistic.
        """
        self.epsilon = epsilon
        self.momentum = momentum
        self.training = True

        self.gamma = torch.ones([1, fan_in])
        self.beta = torch.zeros([1, fan_in])

        self.moving_average = torch.zeros([1, fan_in])
        self.moving_std = torch.ones([1, fan_in])

    def __call__(self, x):
        """Normalise ``x`` and cache the result in ``self.out``."""
        if self.training:
            # keepdim is passed by keyword: in std(dim, unbiased, keepdim) a
            # positional True binds to `unbiased`, which drops the kept dim.
            xmean = x.mean(0, keepdim=True)
            xstd = x.std(0, keepdim=True)
        else:
            xmean = self.moving_average
            xstd = self.moving_std

        self.out = self.gamma * (x - xmean) / (xstd + self.epsilon) + self.beta

        if self.training:
            # Running statistics are bookkeeping only; keep them out of autograd.
            with torch.no_grad():
                # Use the instance's momentum, not a same-named global.
                keep = self.momentum
                self.moving_average = keep * self.moving_average + (1 - keep) * xmean
                self.moving_std = keep * self.moving_std + (1 - keep) * xstd

        return self.out

    def parameters(self):
        """Return the trainable tensors (scale and shift) as a list."""
        return [self.gamma, self.beta]

In [170]:
class Tanh:
    """Element-wise hyperbolic tangent activation with no trainable state."""

    def __call__(self, x):
        """Apply tanh to ``x``, cache the result in ``self.out`` and return it."""
        activated = x.tanh()
        self.out = activated
        return activated

    def parameters(self):
        """Return an empty list: this layer has nothing to train."""
        return list()

In [149]:
# Shape check for Linear.
#   weights: (fan_in, fan_out)
#   bias:    (1, fan_out)
# A (3, 7) batch pushed through Linear(7, 8) comes out as (3, 8).
test = Linear(fan_in=7, fan_out=8)
z = torch.randn(3, 7)
test(z)

tensor([[-0.4170,  0.3877, -0.6296, -0.5448, -0.4513,  1.1802, -0.1889,  1.0407],
        [ 0.5638,  0.5003,  0.5314, -0.6617, -0.4820, -0.9803,  0.6434,  1.1088],
        [-1.8971,  0.0853, -0.9836, -1.4692,  0.7974,  2.0882, -1.0930, -0.6465]])

In [105]:
num_emb = 10          # embedding dimensions per character
num_neurons1 = 200    # hidden-layer width

# One row per vocabulary entry (the '.' marker plus every corpus character),
# derived from the lookup table instead of a hard-coded 27.
vocab_size = len(c_to_i)

# Character embedding table.
C = torch.randn([vocab_size, num_emb])

# Hidden layer. The tanh gain (5/3) is divided by sqrt(fan_in), where fan_in is
# the number of inputs actually feeding the layer (num_emb * block_size).
# The scale was previously hard-coded as 10**0.5, which does not match it.
fan_in1 = num_emb * block_size
W1 = torch.randn([fan_in1, num_neurons1]) * (5/3) / fan_in1**0.5
b1 = torch.randn(num_neurons1) * 0.01

# Output layer: small weights and zero bias.
W2 = torch.randn((num_neurons1, vocab_size)) * 0.01
b2 = torch.zeros(vocab_size)

# Batch-norm scale and shift (trainable).
bngain = torch.ones([1, num_neurons1])
bnbias = torch.zeros([1, num_neurons1])

# Running batch-norm statistics; updated by hand during training and not
# part of `parameters`, so they receive no gradient.
moving_average = torch.zeros([1, num_neurons1])
moving_std = torch.ones([1, num_neurons1])

parameters = [C, W1, b1, W2, b2, bngain, bnbias]
for p in parameters:
    p.requires_grad = True

In [111]:
loss_t = []        # per-step loss history; the append in the loop is disabled
momentum = 0.999   # weight kept on the previous running statistic each step
epsilon = 1e-5     # added to the batch std before dividing

for i in range(50000):
    # Drop gradients from the previous step before building a new graph.
    for p in parameters:
        p.grad = None

    # Minibatch: 32 row indices drawn uniformly (with replacement) from the training set.
    x_inds = torch.randint(0, X_train.shape[0], (32,))

    # Forward pass: embed -> flatten -> tanh hidden layer -> batch norm -> logits.
    raw_emb = C[X_train[x_inds]]
    emb = raw_emb.view(len(raw_emb), num_emb * block_size)
    h1 = (emb @ W1 + b1).tanh()
    bnmeani = h1.mean(0, True)
    # NOTE(review): for std the positional True appears to bind to `unbiased`
    # rather than `keepdim`; the result still broadcasts below. Confirm
    # against the torch.std signature.
    bnstdi = h1.std(0, True)
    h1 = bngain * (h1 - bnmeani) / (bnstdi + epsilon) + bnbias
    logits = h1 @ W2 + b2
    loss = F.cross_entropy(logits, Y_train[x_inds])

    # Fold this batch's statistics into the running averages, outside autograd.
    with torch.no_grad():
        moving_average = momentum * moving_average + (1 - momentum) * bnmeani
        moving_std = momentum * moving_std + (1 - momentum) * bnstdi

    #loss_t.append(loss.item())

    loss.backward()

    # Plain SGD step with a fixed learning rate of 0.01.
    for p in parameters:
        p.data -= 0.01 * p.grad

In [112]:
# Hidden-layer statistics over the full training set, computed without autograd.
# `mu` and `sigma` are not read by the other cells visible here; the evaluation
# function uses the running `moving_average` / `moving_std` instead.
with torch.no_grad():
    raw_emb = C[X_train]
    emb = raw_emb.view(len(raw_emb), num_emb * block_size)
    h1 = (emb @ W1 + b1).tanh()
    mu, sigma = h1.mean(0, True), h1.std(0, True)

In [113]:
@torch.no_grad()
def test(split):
    """Print the cross-entropy loss of the trained model on one data split.

    The forward pass mirrors the training loop, except that batch
    normalisation uses the running ``moving_average`` / ``moving_std``
    statistics instead of per-batch ones.

    Args:
        split: one of "train", "val" or "test".

    Raises:
        KeyError: if ``split`` is not one of the names above.
    """
    X_data, Y_data = {
        "train": (X_train, Y_train),
        "test" : (X_test, Y_test),
        "val": (X_val, Y_val)
    }[split]
    raw_emb = C[X_data]
    emb = raw_emb.view(raw_emb.shape[0], num_emb * block_size)
    h1 = torch.tanh(emb @ W1 + b1)
    # Divide by (std + epsilon), exactly as the training loop does, so the
    # evaluation-time normalisation matches what the weights were trained with.
    h1 = bngain * (h1 - moving_average) / (moving_std + epsilon) + bnbias
    logits = h1 @ W2 + b2
    loss = F.cross_entropy(logits, Y_data)

    print(loss)

In [114]:
# Disabled loss plots; `lrs` is not defined in the cells visible here and
# `loss_t` is never filled because the append in the training loop is commented out.
#plt.plot(lrs, loss_t)
#plt.plot(np.arange(10000),loss_t)
# Report the loss on the held-out splits (each call prints one tensor).
test("test")
test("val")

tensor(2.1770)
tensor(2.1896)
