In [1]:
#NEURAL NETWORK APPROACH TO BIGRAMS

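#we train the network to output, for a given character, the probability of each character that could follow it.
#we train by minimising a loss function: the negative log-likelihood (equivalent to maximising the likelihood)
#training set: all the bigrams (pairs of consecutive characters) in the names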

In [2]:
import torch
import torch.nn.functional as F

# Read the whole names file into a list with one entry per line.
# The `with` block closes the file handle as soon as the read is done,
# instead of leaving it open for the lifetime of the kernel.
with open('names.txt', 'r') as names_file:
    words = names_file.read().splitlines()

In [3]:
# 27x27 grid of int32 zeros.
# NOTE(review): presumably the bigram count table (one row/column per symbol
# of the 27-symbol vocabulary); it is not used in the cells visible here.
N = torch.zeros(27, 27, dtype=torch.int32)

In [4]:
# Vocabulary: every distinct character that occurs in any word, sorted.
chars = sorted(set(''.join(words)))

# Character -> integer index. Real characters are numbered from 1 so that
# index 0 stays free for '.', the marker used for the start and end of a word.
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
stoi['.'] = 0

In [5]:
# Build the training set: one (input, target) pair per bigram.
# xs collects the index of the first character of each bigram,
# ys collects the index of the character that follows it.
first_ids, next_ids = [], []

for word in words:
    padded = '.' + word + '.'  # surround the word with the boundary marker
    for prev_ch, next_ch in zip(padded, padded[1:]):
        first_ids.append(stoi[prev_ch])
        next_ids.append(stoi[next_ch])

# Convert the collected Python lists into tensors for the network.
xs = torch.tensor(first_ids)
ys = torch.tensor(next_ids)
  

In [6]:
# One-hot encoding turns each integer index into a vector the neural net can
# consume: e.g. index 13 becomes a 27-dim vector that is all zeros except for
# a 1 at position 13. torch provides this as F.one_hot (F is imported at the
# top of the notebook).
# F.one_hot returns an integer tensor, so cast to float before it is
# multiplied with the float weight matrix.
xenc = F.one_hot(xs, num_classes=27).float()
xenc.shape

torch.Size([228146, 27])

In [24]:
# A single linear layer with 27 inputs and 27 neurons (one column of W per
# neuron). A (27, 1) weight matrix would instead describe a single neuron.
# The weights are drawn from a dedicated, seeded generator so that the random
# initialisation -- and every number computed from it -- is identical on each
# Restart & Run All.
g = torch.Generator().manual_seed(2147483647)
W = torch.randn((27, 27), generator=g, requires_grad=True)
xenc @ W  # `@` is matrix multiplication in torch

tensor([[-0.9334,  0.7547, -0.4723,  ...,  0.0846, -1.3133, -0.3960],
        [ 2.3214,  0.8544,  0.8441,  ..., -0.4428, -1.2350, -0.8293],
        [ 0.6428,  1.6423, -0.6300,  ..., -0.6764,  1.1980,  0.6434],
        ...,
        [-0.4630,  0.8062, -0.8079,  ...,  0.8174, -0.4841,  1.0849],
        [-1.3634, -0.5307, -0.1548,  ...,  0.7487, -0.0275,  1.3942],
        [ 0.9246, -0.9739,  1.1087,  ..., -0.0166, -0.9982, -0.0034]],
       grad_fn=<MmBackward0>)

In [25]:
# The raw outputs of the layer can be any real number, so they cannot be read
# directly as counts or probabilities; we interpret them as log-counts.
# Working with logs also avoids handling very small probabilities directly.
# Exponentiating the log-counts gives positive "counts": negative values land
# in (0, 1) and positive values land above 1. Normalising those counts is the
# softmax activation function.
# The parentheses matter: `xenc @ W.exp()` exponentiates W first and only then
# multiplies, which equals exp(xenc @ W) only because xenc is one-hot.
(xenc @ W).exp()

tensor([[ 0.3932,  2.1270,  0.6236,  ...,  1.0883,  0.2689,  0.6730],
        [10.1904,  2.3501,  2.3260,  ...,  0.6422,  0.2908,  0.4363],
        [ 1.9018,  5.1672,  0.5326,  ...,  0.5085,  3.3135,  1.9029],
        ...,
        [ 0.6294,  2.2395,  0.4458,  ...,  2.2647,  0.6163,  2.9592],
        [ 0.2558,  0.5882,  0.8566,  ...,  2.1142,  0.9728,  4.0319],
        [ 2.5208,  0.3776,  3.0305,  ...,  0.9835,  0.3685,  0.9966]],
       grad_fn=<MmBackward0>)

In [26]:
# Softmax written out by hand, one step per line.
logits = xenc @ W  # raw layer outputs, read as log-counts
counts = torch.exp(logits)  # positive pseudo-counts, playing the role of N in the count table
row_totals = counts.sum(dim=1, keepdim=True)  # one total per training example
probs = counts / row_totals  # each row now sums to 1
probs

tensor([[0.0135, 0.0731, 0.0214,  ..., 0.0374, 0.0092, 0.0231],
        [0.1738, 0.0401, 0.0397,  ..., 0.0110, 0.0050, 0.0074],
        [0.0484, 0.1314, 0.0135,  ..., 0.0129, 0.0842, 0.0484],
        ...,
        [0.0130, 0.0464, 0.0092,  ..., 0.0469, 0.0128, 0.0613],
        [0.0068, 0.0157, 0.0229,  ..., 0.0565, 0.0260, 0.1078],
        [0.0428, 0.0064, 0.0514,  ..., 0.0167, 0.0063, 0.0169]],
       grad_fn=<DivBackward0>)

In [27]:
# Row 0 of probs: the predicted distribution over the 27 possible next
# characters for the first training example. Its input xs[0] is '.', because
# every word is wrapped in '.' markers when the bigrams are built.
probs[0] 

tensor([0.0135, 0.0731, 0.0214, 0.0333, 0.0326, 0.0129, 0.0223, 0.2328, 0.0579,
        0.0605, 0.0415, 0.0390, 0.0110, 0.0152, 0.0346, 0.0362, 0.0135, 0.0304,
        0.0180, 0.0158, 0.0073, 0.0257, 0.0092, 0.0725, 0.0374, 0.0092, 0.0231],
       grad_fn=<SelectBackward0>)

In [39]:
# We now have an output, but it comes from random weights and is not optimised.
# To find good weights we minimise a loss function with gradient descent,
# letting PyTorch's autograd compute the gradients (the same idea as micrograd).

# No bias term is used in this model: the layer is just xenc @ W.

# forward pass: softmax(x @ W)
xenc = F.one_hot(xs, num_classes=27).float()  # input to the network: one-hot encoding
logits = xenc @ W  # predict log-counts
counts = logits.exp()  # counts, equivalent to N
probs = counts / counts.sum(1, keepdims=True)  # probabilities for next character

# Average negative log-likelihood of the correct next character.
# The row index is built from the actual number of examples rather than a
# hard-coded dataset size, so the cell keeps working if names.txt changes.
num_examples = ys.nelement()
loss = -probs[torch.arange(num_examples), ys].log().mean()


In [40]:
# Show the current average negative log-likelihood. For reference, a model that
# gives every one of the 27 characters probability 1/27 would score ln(27) ~ 3.30.
print(loss)

tensor(3.7154, grad_fn=<NegBackward0>)


In [29]:
# probs[0, 5]  -> probability assigned to the 1st bigram ('.' then 'e'); ideally 1
# probs[1, 13] -> probability assigned to the 2nd bigram (target 'm'); ideally 1
# Instead of checking each bigram individually, pick the probability of the
# correct target for every example in one indexing operation. The row index is
# derived from ys so it always matches the size of the training set.
probs[torch.arange(len(ys)), ys]  # ideally a vector of all ones

tensor([0.0129, 0.0599, 0.0434,  ..., 0.0613, 0.0565, 0.0428],
       grad_fn=<IndexBackward0>)

In [37]:
# backward pass
# Gradients accumulate into W.grad across backward() calls, so clear the old
# value first; setting it to None has the same effect as zeroing it.
W.grad = None # set to zero the gradient
# Backpropagate from the scalar loss; afterwards W.grad holds d(loss)/dW.
loss.backward()

In [38]:
# update: one gradient-descent step, moving W against its gradient
LEARNING_RATE = 0.1  # step size of the update

# Modify the weights in place under no_grad so the update itself is not
# recorded by autograd; this is the supported alternative to poking W.data.
with torch.no_grad():
    W -= LEARNING_RATE * W.grad