In [1]:
import numpy as np
import torch
import torch.nn.functional as F
import random

In [2]:
# Read the raw names file. The context manager closes the handle as soon as
# the read finishes (or fails) instead of leaving it open for the kernel's lifetime.
with open("../data/names.txt", "r") as my_file:
    data = my_file.read()
# One entry per line of the file.
# NOTE(review): if the file ends with a newline, split('\n') yields a trailing
# empty string, which becomes a bare "." entry below.
name_list = data.split('\n')
# Prefix every entry with "." (the same character build_x_y uses for left-padding).
name_list = ["."+x for x in name_list]

In [3]:
# Vocabulary lookups: every distinct character in name_list gets an integer id
# according to its position in sorted order.
# ixtochar maps id -> character; chartoix is its inverse (character -> id).
ixtochar = dict(enumerate(sorted(set("".join(name_list)))))
chartoix = {char: ix for ix, char in ixtochar.items()}

In [4]:
def build_x_y(name_list, block_size=3, char_to_ix=None):
    """Build (context, target) training pairs from a list of names.

    For every character position in every name, one example is emitted: the
    target is the character at that position and the input is the
    ``block_size`` characters preceding it, left-padded with "." when fewer
    than ``block_size`` characters are available.

    Args:
        name_list: iterable of strings.
        block_size: number of context characters per example (default 3).
        char_to_ix: mapping from character to integer id. Must contain "."
            and every character that occurs in ``name_list``. Defaults to the
            notebook-level ``chartoix`` mapping.

    Returns:
        (X, Y): ``X`` is an int64 tensor of shape (num_examples, block_size)
        holding context character ids; ``Y`` is an int64 tensor of shape
        (num_examples,) holding target character ids. Both are well-formed
        (zero rows) when ``name_list`` is empty.

    Raises:
        KeyError: if a character is missing from ``char_to_ix``.
    """
    if char_to_ix is None:
        char_to_ix = chartoix  # notebook-level vocabulary

    x_list = []
    y_list = []
    for name in name_list:
        for target_ix, target_letter in enumerate(name):
            # Up to block_size characters immediately before the target.
            previous_letters = name[max(target_ix - block_size, 0):target_ix]
            # Left-pad with "." so every context has exactly block_size characters.
            num_pads = block_size - len(previous_letters)
            previous_letters = "." * num_pads + previous_letters
            y_list.append(char_to_ix[target_letter])
            x_list.append([char_to_ix[input_letter] for input_letter in previous_letters])

    # Explicit dtype and shape keep the result consistent even when there are
    # no examples (torch.tensor([]) alone would give a float tensor of shape [0]).
    X = torch.tensor(x_list, dtype=torch.long).view(len(x_list), block_size)
    Y = torch.tensor(y_list, dtype=torch.long)
    return X, Y

In [5]:
# Cut points for an 80% / 10% / 10% split of name_list; int() truncates, so
# each boundary is rounded down.
# NOTE(review): name_list is split in file order - no shuffle is visible in this notebook.
ix_80_pct, ix_90_pct = (int(fraction*len(name_list)) for fraction in (0.8, 0.9))

In [6]:
# Build the (X, Y) tensors for the three contiguous slices of name_list:
# train = [0, ix_80_pct), validation = [ix_80_pct, ix_90_pct), test = [ix_90_pct, end).
(X_train, Y_train), (X_val, Y_val), (X_test, Y_test) = (
    build_x_y(name_list[start:stop])
    for start, stop in ((None, ix_80_pct), (ix_80_pct, ix_90_pct), (ix_90_pct, None))
)

In [7]:
# Shapes of the input/target tensors for the train, validation and test splits.
X_train.shape, Y_train.shape, X_val.shape, Y_val.shape, X_test.shape, Y_test.shape

(torch.Size([182778, 3]),
 torch.Size([182778]),
 torch.Size([22633, 3]),
 torch.Size([22633]),
 torch.Size([22735, 3]),
 torch.Size([22735]))

In [8]:
###### Parameters ########
# Seeded generator: every random draw below passes generator=g so that the
# initial parameters are identical on every fresh run of this cell.
g = torch.Generator().manual_seed(2147483647)
C = torch.randn(27,10, generator=g)   # embedding table: one 10-d row per character id
W1 = torch.randn(30,300, generator=g) # hidden layer: 3 context chars * 10 dims -> 300 units
#b1 = torch.randn(300) not needed when doing batch norm right after!
# Initializing W2 and b2 with scaled down values ensures we start with a reasonable loss.
W2 = torch.randn(300,27, generator=g) * 0.1
b2 = torch.randn(27, generator=g) * 0
# Batch-norm scale and shift (trained), one value per hidden unit.
bngain = torch.ones((1,300))
bnbias = torch.zeros((1,300))
# Running batch-norm statistics; updated outside of autograd, so they are
# deliberately left out of `parameters`.
bnmean_running = torch.zeros((1,300))
bnstd_running = torch.ones((1,300))
parameters = [C, W1, W2, b2, bngain, bnbias]
for p in parameters:
    p.requires_grad = True

In [9]:
losses = []
steps = []

# Loop-invariant settings, hoisted out of the loop body.
minibatch_size = 32
# While True, the loop stops right after the first backward pass so that the
# intermediate tensors and their .grad values can be inspected in later cells.
# Set to False to run the parameter updates below.
stop_after_first_backward = True

for i in range(1000):
    ###### Minibatch #######
    # Sample row indices with the seeded generator `g` so the batch is reproducible.
    minibatch_ix = torch.randint(0,len(X_train),(minibatch_size,),generator=g)

    ###### Forward pass ########
    embs = C[X_train[minibatch_ix]]
    # embs.view(-1,30) flattens the per-character embeddings of each example into one row.
    # No bias (b1) is needed because batch norm follows immediately.
    h = torch.tanh(embs.view(-1,30) @ W1)

    # Per-unit statistics over the examples in the batch (dim 0), each of shape (1,300).
    batch_mean_i = h.mean(0,keepdims=True) # mean over the batch
    batch_std_i = h.std(0,keepdims=True)   # standard deviation over the batch

    # Keep a running tally of the batch norm statistics. This is separate from the optimization.
    with torch.no_grad():
        bnmean_running = 0.999*bnmean_running + 0.001*batch_mean_i
        bnstd_running = 0.999*bnstd_running + 0.001*batch_std_i

    # Batch norm: standardize each hidden unit over the batch, then scale and shift.
    h = bngain*(h-batch_mean_i)/(batch_std_i+1e-5) + bnbias

    # Output layer.
    logits = h @ W2 + b2

    # Cross-entropy written out step by step so that every intermediate tensor
    # is available for the gradient checks.
    logits_exp = torch.exp(logits)
    logits_sum = torch.sum(logits_exp,1,keepdim=True)
    logits_sum_inv = logits_sum**-1
    probs = logits_exp * logits_sum_inv
    logprobs = torch.log(probs)
    neglogprobs = -1*logprobs
    loss = neglogprobs[torch.arange(minibatch_size),Y_train[minibatch_ix]].mean()
    #loss = F.cross_entropy(logits,Y_train[minibatch_ix]) #This makes forward/backward pass more efficient and makes things more well behaved numerically for large vals.

    ###### Backward pass ########
    for p in parameters:
        p.grad = None

    # Non-leaf tensors drop their .grad by default; retain it for inspection.
    for t in [neglogprobs, logprobs, probs, logits_sum_inv, logits_sum, logits_exp, logits]:
        t.retain_grad()

    loss.backward()
    if stop_after_first_backward:
        break

    ###### Update ########
    # With range(1000) the condition i < 100000 always holds, so lr stays at 0.1.
    lr = 0.1 if i < 100000 else 0.01
    with torch.no_grad():
        for p in parameters:
            p -= lr*p.grad
    if i % 100 == 0:
        print(loss.item())
    #steps.append(i)
    #losses.append(loss.item())

In [60]:
# Shape check: logits = h @ W2 + b2.
logits.shape

torch.Size([32, 27])

In [53]:
# Shape check: logits_sum is the sum of logits_exp over dim 1 with keepdim=True.
logits_sum.shape

torch.Size([32, 1])

In [52]:
# Shape check: logits_exp = torch.exp(logits), so it matches the shape of logits.
logits_exp.shape

torch.Size([32, 27])

In [69]:
# Hand-derived backward pass through the step-by-step cross-entropy, from the
# loss back to logits. Each d<name> is the gradient of the loss w.r.t. <name>.
# Run under no_grad: these are plain numeric results and should not themselves
# be recorded in the autograd graph.
with torch.no_grad():
    # loss is the mean of the selected entries, so each selected entry gets 1/minibatch_size.
    dneglogprobs = torch.zeros_like(neglogprobs)
    dneglogprobs[torch.arange(minibatch_size),Y_train[minibatch_ix]] = 1/minibatch_size
    # neglogprobs = -1*logprobs
    dlogprobs = -1 * dneglogprobs
    # logprobs = log(probs)
    dprobs = 1/probs * dlogprobs
    # probs = logits_exp * logits_sum_inv; logits_sum_inv was broadcast across
    # columns, so its gradient is summed back over dim 1.
    dlogits_sum_inv = (logits_exp * dprobs).sum(1,keepdim=True)
    dlogits_exp = (logits_sum_inv * dprobs)
    # logits_sum_inv = logits_sum**-1
    dlogits_sum = -1* logits_sum**-2 * dlogits_sum_inv
    # logits_sum = sum of logits_exp over dim 1: every column receives the row's gradient.
    dlogits_exp += torch.ones_like(logits_exp)*dlogits_sum
    # logits_exp = exp(logits)
    dlogits = logits_exp*dlogits_exp

In [70]:
# First row of the hand-derived gradient of the loss w.r.t. logits.
dlogits[0]

tensor([ 5.1253e-05, -3.1228e-02,  2.9213e-07,  7.9683e-05,  2.1118e-02,
         9.2139e-06,  2.5309e-03,  4.4023e-04,  7.0333e-06,  1.7359e-04,
         4.6698e-05,  9.1256e-05,  2.3230e-05,  3.7644e-04,  9.2248e-05,
         4.0541e-04,  5.0628e-05,  3.1329e-04,  1.6579e-04,  3.8377e-03,
         5.7532e-04,  2.6790e-04,  1.5778e-04,  3.3691e-05,  4.0814e-05,
         3.1798e-06,  3.3632e-04], grad_fn=<SelectBackward0>)

In [71]:
# First row of autograd's gradient for logits (kept because retain_grad() was called on it).
logits.grad[0]

tensor([ 5.1253e-05, -3.1228e-02,  2.9213e-07,  7.9683e-05,  2.1118e-02,
         9.2139e-06,  2.5309e-03,  4.4023e-04,  7.0333e-06,  1.7359e-04,
         4.6698e-05,  9.1256e-05,  2.3230e-05,  3.7644e-04,  9.2248e-05,
         4.0541e-04,  5.0628e-05,  3.1329e-04,  1.6579e-04,  3.8377e-03,
         5.7532e-04,  2.6790e-04,  1.5778e-04,  3.3691e-05,  4.0814e-05,
         3.1798e-06,  3.3632e-04])

In [59]:
# Hand-derived gradient w.r.t. logits_exp: the term through probs plus the term through logits_sum.
dlogits_exp

tensor([[ 6.6801e-05, -9.2738e-02,  6.6801e-05,  6.6801e-05,  6.6801e-05,
          6.6801e-05,  6.6801e-05,  6.6801e-05,  6.6801e-05,  6.6801e-05,
          6.6801e-05,  6.6801e-05,  6.6801e-05,  6.6801e-05,  6.6801e-05,
          6.6801e-05,  6.6801e-05,  6.6801e-05,  6.6801e-05,  6.6801e-05,
          6.6801e-05,  6.6801e-05,  6.6801e-05,  6.6801e-05,  6.6801e-05,
          6.6801e-05,  6.6801e-05],
        [ 1.5977e-04,  1.5977e-04,  1.5977e-04,  1.5977e-04,  1.5977e-04,
         -8.3080e-01,  1.5977e-04,  1.5977e-04,  1.5977e-04,  1.5977e-04,
          1.5977e-04,  1.5977e-04,  1.5977e-04,  1.5977e-04,  1.5977e-04,
          1.5977e-04,  1.5977e-04,  1.5977e-04,  1.5977e-04,  1.5977e-04,
          1.5977e-04,  1.5977e-04,  1.5977e-04,  1.5977e-04,  1.5977e-04,
          1.5977e-04,  1.5977e-04],
        [ 3.4915e-04, -3.0056e-02,  3.4915e-04,  3.4915e-04,  3.4915e-04,
          3.4915e-04,  3.4915e-04,  3.4915e-04,  3.4915e-04,  3.4915e-04,
          3.4915e-04,  3.4915e-04,  3.49

In [50]:
# Autograd's gradient for logits_exp, for comparison with the hand-derived dlogits_exp.
logits_exp.grad

tensor([[ 6.6801e-05, -9.2738e-02,  6.6801e-05,  6.6801e-05,  6.6801e-05,
          6.6801e-05,  6.6801e-05,  6.6801e-05,  6.6801e-05,  6.6801e-05,
          6.6801e-05,  6.6801e-05,  6.6801e-05,  6.6801e-05,  6.6801e-05,
          6.6801e-05,  6.6801e-05,  6.6801e-05,  6.6801e-05,  6.6801e-05,
          6.6801e-05,  6.6801e-05,  6.6801e-05,  6.6801e-05,  6.6801e-05,
          6.6801e-05,  6.6801e-05],
        [ 1.5977e-04,  1.5977e-04,  1.5977e-04,  1.5977e-04,  1.5977e-04,
         -8.3080e-01,  1.5977e-04,  1.5977e-04,  1.5977e-04,  1.5977e-04,
          1.5977e-04,  1.5977e-04,  1.5977e-04,  1.5977e-04,  1.5977e-04,
          1.5977e-04,  1.5977e-04,  1.5977e-04,  1.5977e-04,  1.5977e-04,
          1.5977e-04,  1.5977e-04,  1.5977e-04,  1.5977e-04,  1.5977e-04,
          1.5977e-04,  1.5977e-04],
        [ 3.4915e-04, -3.0056e-02,  3.4915e-04,  3.4915e-04,  3.4915e-04,
          3.4915e-04,  3.4915e-04,  3.4915e-04,  3.4915e-04,  3.4915e-04,
          3.4915e-04,  3.4915e-04,  3.49

In [49]:
# Element-wise exact (==) comparison of the hand-derived and autograd gradients for logits_sum.
dlogits_sum == logits_sum.grad

tensor([[True],
        [True],
        [True],
        [True],
        [True],
        [True],
        [True],
        [True],
        [True],
        [True],
        [True],
        [True],
        [True],
        [True],
        [True],
        [True],
        [True],
        [True],
        [True],
        [True],
        [True],
        [True],
        [True],
        [True],
        [True],
        [True],
        [True],
        [True],
        [True],
        [True],
        [True],
        [True]])

In [48]:
# First row of autograd's gradient for logits_sum.
logits_sum.grad[0]

tensor([6.6801e-05])

In [27]:
# Shapes of the two factors multiplied to form probs (probs = logits_exp * logits_sum_inv).
logits_exp.shape, logits_sum_inv.shape

(torch.Size([32, 27]), torch.Size([32, 1]))

In [28]:
# Shape check: probs = logits_exp * logits_sum_inv.
probs.shape

torch.Size([32, 27])

In [None]:
# First row of autograd's gradient for logprobs (kept because retain_grad() was called on it).
logprobs.grad[0]

In [None]:
# validation loss
# Evaluation only: no_grad avoids building an autograd graph over the whole validation set.
with torch.no_grad():
    embs_val = C[X_val]
    h_val = torch.tanh(embs_val.view(-1,30) @ W1)
    # Normalize with the running statistics rather than per-batch ones, using
    # the same 1e-5 epsilon as the training-time batch norm so both paths
    # apply the same formula.
    h_val = bngain*(h_val-bnmean_running)/(bnstd_running+1e-5) + bnbias
    logits_val = h_val @ W2 + b2
    # F.cross_entropy works directly on logits, which is more efficient and
    # numerically better behaved for large values than a hand-written softmax.
    val_loss = F.cross_entropy(logits_val,Y_val)
print(val_loss.item())