In [1]:
#at initialisation of weights the weights are more likely then not tuned in such a way that the probs
#obtained at the very end is in a distribution where the intended outputs prob is very low and those
#of the other are very high, hence more often then not, at the start when the model is untrained 
#the model is "confidently wrong", so the intention is to set the weights in such a way at the start
#so that the all the logits are close to 0, and not in a way where some are extreme and other are not
#so

In [None]:
#when we start off with a "confidently wrong" model at the very start, we get very high losses
#and for the first few epochs the aim of the model during training indirectly becomes to 
#squash/ make the logits uniform enough and then focus on rearranging the logits to make it match the
#intended output, but the gain with having a model starting off in a intermediate stage(in the sense that
#the initial logits are values close to 0) is that the number of training steps reqd are lesser and 
#we dont end up in a valley in the parameter-cost curve which does not have any local minima in its 
#proximity, which is not what we want, now in this case since we spend way more epochs rearranging the
#logits to get the optimal trained model, and not squashing it for a lot of the first few epochs, 
#training becomes more productive

In [8]:
import torch
import torch.nn.functional as F

In [9]:
words = open('names.txt', 'r').read().splitlines()

In [5]:
words

['emma',
 'olivia',
 'ava',
 'isabella',
 'sophia',
 'charlotte',
 'mia',
 'amelia',
 'harper',
 'evelyn',
 'abigail',
 'emily',
 'elizabeth',
 'mila',
 'ella',
 'avery',
 'sofia',
 'camila',
 'aria',
 'scarlett',
 'victoria',
 'madison',
 'luna',
 'grace',
 'chloe',
 'penelope',
 'layla',
 'riley',
 'zoey',
 'nora',
 'lily',
 'eleanor',
 'hannah',
 'lillian',
 'addison',
 'aubrey',
 'ellie',
 'stella',
 'natalie',
 'zoe',
 'leah',
 'hazel',
 'violet',
 'aurora',
 'savannah',
 'audrey',
 'brooklyn',
 'bella',
 'claire',
 'skylar',
 'lucy',
 'paisley',
 'everly',
 'anna',
 'caroline',
 'nova',
 'genesis',
 'emilia',
 'kennedy',
 'samantha',
 'maya',
 'willow',
 'kinsley',
 'naomi',
 'aaliyah',
 'elena',
 'sarah',
 'ariana',
 'allison',
 'gabriella',
 'alice',
 'madelyn',
 'cora',
 'ruby',
 'eva',
 'serenity',
 'autumn',
 'adeline',
 'hailey',
 'gianna',
 'valentina',
 'isla',
 'eliana',
 'quinn',
 'nevaeh',
 'ivy',
 'sadie',
 'piper',
 'lydia',
 'alexa',
 'josephine',
 'emery',
 'julia'

In [10]:
chars = sorted(list(set(''.join(words))))

atoi = {ch: i+1 for i, ch in enumerate(chars)}
atoi['.'] = 0

itoa = {i: ch for ch, i in atoi.items()}

In [12]:
block_size = 3

def build_dataset(words):
    X, Y = [], []
    for w in words:
        context = [0] * block_size
        for ch in (w+'.'):
            ix = atoi[ch]
            X.append(context)
            Y.append(ix)
            context = context[1:] + [ix]
    X = torch.tensor(X)
    Y = torch.tensor(Y)

    return X, Y

import random
random.seed(42)
random.shuffle(words)
n1 = int(0.8*len(words))   #splitting of the datset of words, then from those split datasets we make the dataset for our use
n2 = int(0.9*len(words))

Xtr, Ytr = build_dataset(words[:n1])

Xval, Yval = build_dataset(words[n1:n2])

Xte, Yte = build_dataset(words[n2:])

In [8]:
Xtr.shape, Ytr.shape

(torch.Size([182625, 3]), torch.Size([182625]))

In [11]:
g = torch.Generator().manual_seed(2147483647)

In [49]:
#init values of weights and biases, since we are also modifying the tensors separately, we
#set req graident to be true later on
C = torch.randn([27, 10], generator=g)
W1 = torch.randn([30, 200], generator=g) * (5/3)/(30)**0.5  #this is the process of kaiming
#init from the paper kaiming et al 2015 which delves into rectifiers/act funcs, here the fundamental
#is that we want the values coming out from any layer at init only to have mean of 0 and std of 1
#now w@x + b might not always have std of 1 but a higher value, assuming x and b are also normal
#so to offset that we multiply the init val of w by gain/rt(fan_in)  where gain depends on the type
#of non linear activation used and fan_in is the number of neurons which feed itself in into the 
#current layer, gain for tanh activation is 5/3, for ReLU its rt(2) and for the others generic ones its 1


#b1 = torch.rand(200, generator=g) * 0.01  #so as to make the num which goes in the tanh func be close to 
#0 which prevents it from hitting the dead state
W2 = torch.randn([200, 27], generator=g) * 0.01
b2 = torch.rand(27, generator=g) * 0

bngain = torch.ones((1, 200))
bnbias = torch.zeros((1, 200))
 
bnmean_running = torch.zeros((1, 200))
bnstd_running = torch.ones((1, 200))

parameters = [C, W1, W2, b2, bnbias, bngain]

#now setting require_grad to be true
for p in parameters:
    p.requires_grad = True;

In [None]:
#gradient descent step
for _ in range(200000):
    #minibatch of 32 batches selected randomly
    ix = torch.randint(0, Xtr.shape[0], [32]) 

    #forawrd pass
    emb = C[Xtr[ix]]     #embedding the context
    hpreact = emb.view(-1, W1.shape[0]) @ W1 #+ b1 no need of this bias as its being subtracted
    #later on while performing batchNorm, instead bnbias takes over its role
    
    #batchnorm step
    bnmean = hpreact.mean(0, keepdim=True)
    bnstd = hpreact.std(0, keepdim=True)
    hpreact = bngain * (hpreact - bnmean / bnstd) + bnbias

    with torch.no_grad():
        bnmean_running = 0.999*bnmean_running + 0.001*bnmean  #only makes sense because if we 
        #just add the new means every time the values would be some blown up number and not even
        #something which might resemble the mean for all of the samples, hence this is done. Also
        #we cant take the final value only as all of them are taken from samples and we cant assume
        #that the final sample is the most important one, hence again, this approach of partial 
        #increment is the most sensible one, and the same goes for std. Here also this value of 0.001
        #is the momentum, where if we have smaller batch sizes, then its good to have smaller 
        #momentum, as for smaller batch sizes the mean and std could vary around a lot, and to 
        #make the val of mean/std converge, we incremenet the changes but by a small step, but for 
        #larger batch sizes, we expect the mean/std to not vary by a lot each time, hence the 
        #momentum can be large enough.
        bnstd_running = 0.999*bnstd_running + 0.001*bnstd

    h = torch.tanh(hpreact)
    logits = h @ W2 + b2
    nll = F.cross_entropy(logits, Ytr[ix])
    if (_%10000 == 0):
        print(f"loss at epoch {_+1}: {nll.item()}")   #loss for just that minibatch
     
    #backward pass
    #setting all the params grads to 0 before backpropagating
    for p in parameters:
        p.grad = None
    nll.backward()  #backpropagating the loss

    #update the parameters
    lr = 0.1 if _ <100000 else 0.01
    for p in parameters: 
        if p.grad is not None:
            p.data += -lr * p.grad

    # break

loss at epoch 1: 3.2742714881896973
loss at epoch 10001: 2.1143321990966797
loss at epoch 20001: 2.3284220695495605
loss at epoch 30001: 2.181339979171753
loss at epoch 40001: 2.179100275039673
loss at epoch 50001: 1.8406212329864502
loss at epoch 60001: 2.0422685146331787
loss at epoch 70001: 2.2556426525115967
loss at epoch 80001: 2.310425043106079
loss at epoch 90001: 2.034607172012329
loss at epoch 100001: 2.2446584701538086
loss at epoch 110001: 1.8314517736434937
loss at epoch 120001: 2.0750114917755127
loss at epoch 130001: 2.1405625343322754
loss at epoch 140001: 2.1285812854766846
loss at epoch 150001: 1.8639581203460693
loss at epoch 160001: 1.9118980169296265
loss at epoch 170001: 1.7090238332748413
loss at epoch 180001: 1.8510971069335938
loss at epoch 190001: 2.0543015003204346


In [51]:
@torch.no_grad
def split_loss(split):
    x, y = {
        'train': (Xtr, Ytr),
        'test': (Xte, Yte),
        'val': (Xval, Yval)
    }[split]
    emb = C[x]
    hpreact = emb.view(-1, W1.shape[0]) @ W1 + b1
    hpreact = bngain * (hpreact - bnmean_running / bnstd_running) + bnbias
    h = torch.tanh(hpreact)
    logits = h @ W2 + b2
    nll = F.cross_entropy(logits, y)
    print(f"{split} loss: {nll}")

split_loss('train')
split_loss('test')
split_loss('val')
    

train loss: 2.0459465980529785
test loss: 2.1061246395111084
val loss: 2.10913348197937


In [1]:
#creating generlisation 

In [41]:
#a general linear layer
class Linear:
    def __init__(self, fan_in, fan_out, bias=True):
        self.weight = torch.randn((fan_in, fan_out), generator=g) / fan_in**0.5  #kaiming init of weights
        self.bias = torch.randn((fan_out), generator=g) if bias else None

    def __call__(self, x):
        self.out = x @ self.weight
        if self.bias is not None:
            self.out += self.bias
        return self.out
    
    def parameters(self):
        return [self.weight] + ([] if self.bias is None else [self.bias]) 
    


#a general batchNorm layer
class BatchNorm1d:
    def __init__(self, dim, momentum=0.1, eps=1e-5):  #dim is the num of neurons in the layer
        self.eps = eps
        self.momentum = momentum
        self.training = True
        self.gamma = torch.ones(dim)
        self.beta = torch.zeros(dim)
        self.running_mean = torch.zeros(dim)
        self.running_var = torch.ones(dim)

    def __call__(self, x):  #x is the samples of the batch 
        if self.training:
            xmean = x.mean(0, keepdim=True)
            xvar = x.var(0, keepdim=True)
        else:  #the case of inference where running mean and std is used for inference
            xmean = self.running_mean
            xvar = self.running_var

        self.out = self.gamma * (x - xmean/torch.sqrt(xvar + self.eps)) + self.beta

        if self.training:  #if training occurs, then keep updating the running mean and std
            with torch.no_grad():
                self.running_mean = (1-self.momentum)*self.running_mean + self.momentum*xmean
                self.running_var = (1-self.momentum)*self.running_var + self.momentum*xvar
        
        return self.out
    
    def parameters(self):
        return [self.gamma, self.beta]



#a general tanh layer
class Tanh:
    def __call__(self, x):
        self.out = torch.tanh(x)
        return self.out
    def parameters(self):
        return []    

In [44]:
vocab_size = 27       #27 chars to encode 
block_size = 3        #3 chars as context 
n_embd = 10           #dimensionality of embedding space
n_hidden = 100        #number of dimensions of hidden layer
g = torch.Generator().manual_seed(2147483644)   #generator for uniformity of randomness across trials

C = torch.rand((vocab_size, n_embd), generator=g)
layers = [
    Linear(n_embd * block_size, n_hidden), 
    BatchNorm1d(n_hidden),
    Tanh(),
    Linear(n_hidden, n_hidden), 
    BatchNorm1d(n_hidden),
    Tanh(),
    Linear(n_hidden, n_hidden),
    BatchNorm1d(n_hidden), 
    Tanh(),
    Linear(n_hidden, n_hidden), 
    BatchNorm1d(n_hidden),
    Tanh(),
    Linear(n_hidden, n_hidden), 
    BatchNorm1d(n_hidden),
    Tanh(),
    Linear(n_hidden, vocab_size), 
    BatchNorm1d(vocab_size)
]
#by default, all of the weights are kaiming initialised, where for the init case, the num of 
#neurons in the saturated region(which makes the neuron after passing through the non linear act func dead)
#are lower

#some more init setting up of weights
with torch.no_grad():
    #letting the logits layer be less confidently wrong by letting the last layer of weights be
    #scaled down by a factor of 0.1
    layers[-1].gamma *= 0.1

    #also since we apply the tanh act func to all the *hidden layer* , we need to multiply the 
    #kaiming init weights by its appropriate gain of 5/3
    for layer in layers[:-1]:  #excluding the last output layer
        if isinstance(layer, Linear):
            layer.weight *= 5/3  #and again 5/3 is used as a gain for the weights used in 
            #the linear layer as the tanh func squishes all the values close to 0, so
            #over multiple layers, this results in more and more squished vals, and so as
            #to combat it to a certain extent, we boost it by the gain
    
parameters = [C] + [p for layer in layers for p in layer.parameters()]

for p in parameters:
    p.requires_grad = True

print(sum(p.nelement() for p in parameters))

47551


In [46]:
max_epochs = 200000
batch_size = 32
lossi = []  #appending the log(nll) for each epoch 

for i in range(max_epochs):
    #making the minibatch
    ix = torch.randint(0, Xtr.shape[0], (batch_size, ), generator=g)  #get index of chosen samples
    Xb, Yb = Xtr[ix], Ytr[ix]  #batch X, Y

    #forward pass
    emb = C[Xb]  #char embedding
    x = emb.view(emb.shape[0], -1)  #this is the input layer essentially of shape (batch_size, )
    for layer in layers:
        x = layer(x)  #egs passed from one layer to the other 
    nll = F.cross_entropy(x, Yb)  #loss

    #backward pass
    for layer in layers: #capturing the grads
        layer.out.retain_grad()
    for p in parameters:
        p.grad = None
    nll.backward()

    #update
    alpha = 0.1 if i <100000 else 0.01
    for p in parameters:
        p.data -= alpha*p.grad

    #track stats
    if i%10000 == 0:
        print(f"{i:7d}/{max_epochs:7d}: {nll.item():.4f}")
    lossi.append(nll.log10().item())   

      0/ 200000: 3.5358
  10000/ 200000: 2.2921
  20000/ 200000: 2.0982
  30000/ 200000: 2.0721
  40000/ 200000: 2.2450
  50000/ 200000: 1.9345
  60000/ 200000: 2.0556
  70000/ 200000: 1.9227
  80000/ 200000: 2.1679
  90000/ 200000: 1.9471
 100000/ 200000: 2.4487
 110000/ 200000: 2.1672
 120000/ 200000: 1.9262
 130000/ 200000: 2.0644
 140000/ 200000: 2.3572
 150000/ 200000: 1.7786
 160000/ 200000: 2.1733
 170000/ 200000: 1.8967
 180000/ 200000: 1.9712
 190000/ 200000: 1.8412
