# Makemore Part 4: Becoming a Backprop Ninja

In [1]:
import torch
import torch.nn.functional as F
import random
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:

# Read one name per line; the context manager closes the file handle
# deterministically instead of leaving it to the garbage collector.
with open('names.txt', 'r') as namesFile:
    words = namesFile.read().splitlines()

In [5]:

# Vocabulary: every distinct character that occurs in the dataset, in sorted order.
chars = sorted(set(''.join(words)))

# Character -> integer id. Real characters start at 1 so that id 0 is
# left free for the '.' marker used as padding / end-of-word token.
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
stoi['.'] = 0

# Integer id -> character (inverse of stoi).
itos = {idx: ch for ch, idx in stoi.items()}
vocabSiz = len(itos)


In [6]:

blocksiz = 3  # context length: how many previous characters feed each prediction


def buildDataset(words):
    """Turn a list of words into (context, next-character) training pairs.

    For every word, a window of `blocksiz` character ids (initially all 0,
    the '.' id) slides over the word followed by a terminating '.'.
    Each position contributes one row: the current window as input and
    the id of the character at that position as the target.

    Prints the shapes of the two resulting tensors and returns them as
    (X, Y), where X holds the context windows and Y the target ids.
    """
    contexts = []
    targets = []
    for word in words:
        window = [0] * blocksiz
        for charId in (stoi[c] for c in word + '.'):
            contexts.append(window)
            targets.append(charId)
            # Slide the window: drop the oldest id, append the newest.
            window = [*window[1:], charId]
    X = torch.tensor(contexts)
    Y = torch.tensor(targets)
    print(X.shape, Y.shape)
    return X, Y


# Shuffle with a fixed seed, then split the words 80% / 10% / 10%.
# NOTE: the shuffle mutates `words` in place, so re-running this cell without
# reloading `words` shuffles an already-shuffled list and yields different splits.
random.seed(42)
random.shuffle(words)

# Cut points at 80% and 90% of the word list.
n1, n2 = (int(frac * len(words)) for frac in (0.8, 0.9))

xTrain, yTrain = buildDataset(words[:n1])    # first 80%
xDev, yDev = buildDataset(words[n1:n2])      # next 10%
xTest, yTest = buildDataset(words[n2:])      # final 10%

torch.Size([182625, 3]) torch.Size([182625])
torch.Size([22655, 3]) torch.Size([22655])
torch.Size([22866, 3]) torch.Size([22866])


In [7]:

def cmp(s, dt, t):
    """Print a one-line report comparing a hand-computed gradient with autograd's.

    s  -- label printed in the first column
    dt -- the manually derived gradient tensor
    t  -- tensor whose `.grad` attribute is used as the reference

    Reports whether the two match exactly, whether they match under
    torch.allclose's default tolerances, and the largest absolute difference.
    """
    ref = t.grad
    ex = torch.all(dt == ref).item()
    app = torch.allclose(dt, ref)
    maxdif = torch.max(torch.abs(dt - ref)).item()
    print(f'{s:15s} | exact:{str(ex):5s} | aproximt: {str(app):5s} | maxDiff{maxdif}')


In [8]:
nEmb = 10     # size of each character embedding vector
nHidden = 64  # number of units in the hidden layer
g = torch.Generator().manual_seed(2147483647)  # single seeded generator for every draw below
C = torch.randn((vocabSiz, nEmb), generator=g)  # embedding table, one row per character id

# Layer 1: scaled by gain / sqrt(fan_in) with gain 5/3 and fan_in = nEmb * blocksiz.
# The parentheses matter: the square root applies to the whole fan-in,
# not to blocksiz alone.
w1 = torch.randn((nEmb * blocksiz, nHidden), generator=g) * (5/3) / ((nEmb * blocksiz)**0.5)
b1 = torch.randn(nHidden, generator=g) * 0.1  # redundant with bnBiase once batch norm follows; kept for the exercise

# Layer 2:
w2 = torch.randn(nHidden, vocabSiz, generator=g) * 0.1
b2 = torch.randn(vocabSiz, generator=g) * 0.1

# Batch-norm parameters (gain centred on 1, bias centred on 0)
bnGain = torch.randn((1, nHidden), generator=g) * 0.1 + 1.0
bnBiase = torch.randn((1, nHidden), generator=g) * 0.1

# The parameters are initialised in non-standard ways (small random values
# rather than exact zeros/ones). Every draw uses the seeded generator `g`,
# so the initialisation is reproducible from a fresh kernel.

para = [C, w1, b1, w2, b2, bnGain, bnBiase]  # all trainable parameters
print(sum(p.nelement() for p in para))  # total number of scalar parameters
for p in para:
    p.requires_grad = True




4137


In [10]:
n = 32  # batch size

# Construct a minibatch: draw n random row indices into the training set
# from the seeded generator, then gather the matching inputs and targets.
ix = torch.randint(0, xTrain.size(0), (n,), generator=g)
xb = xTrain[ix]  # batch inputs
yb = yTrain[ix]  # batch targets


# Forward pass, broken into smaller steps that can each be backpropagated one at a time

 * Terminology:
   + **de-log**: refers to taking the **derivative of a logarithmic function**, usually encountered with logarithmic loss functions or any other function that involves a log operation in the forward pass.

In [None]:
emb = C[xb]  # look up the embedding vector of every character in the batch
embCat = emb.view(emb.shape[0], -1)  # flatten each example's context embeddings into one row

# Linear layer 1
# Kept under its own name so the pre-batch-norm tensor stays reachable
# (and can retain its gradient) after hlPreAct is computed below.
hlPreBn = embCat @ w1 + b1  # hidden-layer pre-activation, before batch norm

# Batch-norm layer, written out one elementary operation per line
bnmeanI = 1/n*hlPreBn.sum(0, keepdim=True)  # per-feature mean over the batch
bndiff = hlPreBn - bnmeanI
bndiff2 = bndiff**2
bnvar = 1/(n-1)*(bndiff2).sum(0, keepdim=True)  # variance, dividing by n-1 (Bessel's correction)
bnvarInv = (bnvar + 1e-5) ** -0.5
bnraw = bndiff * bnvarInv
hlPreAct = bnGain * bnraw + bnBiase  # hidden-layer pre-activation, after batch norm

# Non-linearity: tanh activation
h = torch.tanh(hlPreAct)  # hidden layer

# Linear layer 2
logits = h @ w2 + b2  # output layer

# Cross-entropy loss (manual implementation of F.cross_entropy)
logitMax = logits.max(1, keepdim=True).values
normLogits = logits - logitMax  # subtract the row max for numerical stability
counts = normLogits.exp()
countsSum = counts.sum(1, keepdim=True)
# Invert the per-row SUM, not the counts themselves: counts * counts**-1
# would make every probability 1 and the loss identically 0.
countsSumInv = countsSum**-1
prob = counts * countsSumInv
logProb = prob.log()
loss = -logProb[range(n), yb].mean()

# PyTorch backward pass
for p in para:
    p.grad = None
# Ask autograd to keep .grad on every intermediate tensor so that
# hand-derived gradients can be compared against it afterwards.
for t in [
    logProb, prob, counts, countsSum, countsSumInv, normLogits, logitMax, logits, h, hlPreAct, bnraw,
    bnvar, bnvarInv, bndiff2, bndiff, hlPreBn, bnmeanI, embCat, emb
    ]:
    t.retain_grad()

loss.backward()
loss


