# Overview
* Pre-process data
* Create vocab
* Define dataset
* Define neural network
* Define data flow inside nn
    * forward pass
        * input
        * get logits
        * softmax
            * exp()
            * normalization
        * calculate loss function
            * negative log likelihood
    * backward pass (gradient-based optimization)
        * update weights
* Combine code together

# Pre-process data

In [1]:
# Read the names dataset, one name per line. The context manager makes sure
# the file handle is closed once the contents have been read.
with open('/kaggle/input/character-lm-without-framework/names.txt', 'r') as f:
    words = f.read().splitlines()
print(len(words))

32033


# Create vocabulary

In [2]:
# Vocabulary: every distinct character found in the words, in sorted order.
chars = sorted(set(''.join(words)))
# Character -> index lookup. Indices start at 1 because index 0 is reserved
# for the special '.' token.
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
stoi['.'] = 0
# Reverse lookup: index -> character.
itos = {idx: ch for ch, idx in stoi.items()}

# Define dataset

In [3]:
import torch

# Build the bigram training set: each example pairs the index of a
# character (x) with the index of the character that follows it (y).
xs, ys = [], []

for w in words[:1]:
    # '.' is added on both sides so the start and end of the word are examples too
    chs = ['.', *w, '.']
    for cur, nxt in zip(chs, chs[1:]):
        cur_ix, nxt_ix = stoi[cur], stoi[nxt]
        print(cur, nxt)
        xs.append(cur_ix)
        ys.append(nxt_ix)

# turn the index lists into tensors: inputs and their labels
xs = torch.tensor(xs)
ys = torch.tensor(ys)
print(xs)
print(ys)

num = xs.numel()
print('number of examples: ', num)

. e
e m
m m
m a
a .
tensor([ 0,  5, 13, 13,  1])
tensor([ 5, 13, 13,  1,  0])
number of examples:  5


# Define Neural Network

In [4]:
# A single layer of 27 neurons, each receiving 27 inputs.
# The weights are drawn from a seeded generator so the run is reproducible.
g = torch.Generator()
g.manual_seed(2147483647)
W = torch.randn(27, 27, generator=g)

# Define data flow of neural network

Here we use the average negative log likelihood as the loss. It is the same loss as in https://www.kaggle.com/code/aisuko/character-lm-with-pytorch?cellIds=19&kernelSessionId=188052333

## Forward pass

In [5]:
import torch.nn.functional as F

# Input to the network: one one-hot row of width 27 per example.
xenc = F.one_hot(xs, num_classes=27).float()
# The layer output is interpreted as log-counts.
logits = xenc @ W
# Exponentiate to get counts (the equivalent of the count matrix N).
counts = logits.exp()
# Normalize each row so it sums to 1: probabilities for the next character.
row_totals = counts.sum(1, keepdims=True)
probs = counts / row_totals

In [6]:
# Show the probability the network assigns to the correct next character
# of every example.
print(xs)
print(ys)

# Each pair is (example row in probs, label of that example): the first
# example has label 5, the next two have label 13, then label 1 and label 0.
for row, label in ((0, 5), (1, 13), (2, 13), (3, 1), (4, 0)):
    print(probs[row, label])

tensor([ 0,  5, 13, 13,  1])
tensor([ 5, 13, 13,  1,  0])
tensor(0.0123)
tensor(0.0181)
tensor(0.0267)
tensor(0.0737)
tensor(0.0150)


In [7]:
# Fetch the same probabilities in one vectorized indexing step: for every
# example i, take probs[i, ys[i]]. The row index is built from `num` (the
# number of examples) rather than a literal 5, so it keeps working when the
# dataset contains more than one word.

probs[torch.arange(num), ys]

tensor([0.0123, 0.0181, 0.0267, 0.0737, 0.0150])

In [8]:
# Average negative log likelihood of the correct next characters.
# `num` replaces the hard-coded 5 so the loss is valid for any dataset size.
loss = -probs[torch.arange(num), ys].log().mean()
print(loss)

tensor(3.7693)


## Backward pass

In [9]:
# Re-create the same seeded 27x27 weight matrix, this time asking autograd to
# track it (requires_grad is False by default) so the loss can be
# differentiated with respect to W.
g = torch.Generator()
g.manual_seed(2147483647)
W = torch.randn((27, 27), generator=g).requires_grad_()

In [10]:
# forward pass
xenc = F.one_hot(xs, num_classes=27).float()  # one-hot inputs, one row per example
logits = xenc @ W  # log-counts
counts = logits.exp()  # counts
probs = counts / counts.sum(1, keepdims=True)  # each row normalized to sum to 1
# Average negative log likelihood of the correct next characters. The row
# index uses `num` instead of a literal 5 so it matches the dataset size.
loss = -probs[torch.arange(num), ys].log().mean()
print(loss.item())

3.7693049907684326


In [11]:
# backward pass
# Clear any gradient stored on W (None plays the role of a zero gradient),
# then backpropagate from the loss to fill W.grad.
W.grad = None
loss.backward()

In [12]:
# Gradient of the loss with respect to each weight, i.e. how strongly every
# entry of W influences the loss.
W.grad

# Example: the gradient for W[0,0] is positive (0.0121), so nudging W[0,0] up
# by a small h increases the loss by roughly 0.0121*h.

tensor([[ 0.0121,  0.0020,  0.0025,  0.0008,  0.0034, -0.1975,  0.0005,  0.0046,
          0.0027,  0.0063,  0.0016,  0.0056,  0.0018,  0.0016,  0.0100,  0.0476,
          0.0121,  0.0005,  0.0050,  0.0011,  0.0068,  0.0022,  0.0006,  0.0040,
          0.0024,  0.0307,  0.0292],
        [-0.1970,  0.0017,  0.0079,  0.0020,  0.0121,  0.0062,  0.0217,  0.0026,
          0.0025,  0.0010,  0.0205,  0.0017,  0.0198,  0.0022,  0.0046,  0.0041,
          0.0082,  0.0016,  0.0180,  0.0106,  0.0093,  0.0062,  0.0010,  0.0066,
          0.0131,  0.0101,  0.0018],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000

In [13]:
# One gradient-descent step with learning rate 0.1: move the weights against
# the gradient so the loss goes down.
W.data -= 0.1 * W.grad

# Combine code together

In [14]:
# Re-create the seeded 27x27 weight matrix (27 neurons, 27 inputs each) so
# training starts from the same initial state as the cells above.
g = torch.Generator().manual_seed(2147483647)
W = torch.randn((27, 27), generator=g, requires_grad=True)

# The inputs and the row index do not change between steps, so build them
# once instead of on every iteration.
xenc = F.one_hot(xs, num_classes=27).float()  # input to the network: one-hot encoding
rows = torch.arange(num)

# gradient descent
for k in range(200):
    # forward pass
    logits = xenc @ W  # predict log-counts
    counts = logits.exp()  # counts, equivalent to N
    probs = counts / counts.sum(1, keepdims=True)  # probabilities for next character
    loss = -probs[rows, ys].log().mean()  # average negative log likelihood
    print(loss.item())

    # backward pass
    W.grad = None  # clear the gradient left over from the previous step
    loss.backward()

    # update: step against the gradient with learning rate 0.1
    W.data += -0.1 * W.grad

3.7693049907684326
3.7492127418518066
3.7291626930236816
3.7091541290283203
3.6891887187957764
3.6692662239074707
3.6493873596191406
3.629552125930786
3.6097614765167236
3.5900158882141113
3.5703155994415283
3.5506606101989746
3.5310521125793457
3.5114905834198
3.491975784301758
3.4725089073181152
3.453089952468872
3.4337196350097656
3.414397716522217
3.3951258659362793
3.375903367996216
3.356731414794922
3.3376102447509766
3.318540096282959
3.2995219230651855
3.2805557250976562
3.26164174079895
3.242781162261963
3.2239737510681152
3.2052199840545654
3.18652081489563
3.1678762435913086
3.1492867469787598
3.1307530403137207
3.1122748851776123
3.09385347366333
3.075488567352295
3.0571811199188232
3.0389316082000732
3.0207395553588867
3.0026066303253174
2.984531879425049
2.9665169715881348
2.948561191558838
2.930665969848633
2.9128310680389404
2.895056962966919
2.8773443698883057
2.8596930503845215
2.842103958129883
2.8245773315429688
2.8071131706237793
2.789712429046631
2.772374868392944

# Sample from the neural net model

In [15]:
# Draw 5 names from the trained model, using a seeded generator so the
# samples are reproducible.
g = torch.Generator().manual_seed(2147483647)

for _ in range(5):
    out = []
    ix = 0  # every name starts from the '.' token (index 0)
    finished = False
    while not finished:
        # Previously the distribution was read from a count table: p=P[ix]
        # https://www.kaggle.com/code/aisuko/character-lm-without-framework?cellIds=21&kernelSessionId=187064505
        # Now it comes from a forward pass of the network on the current character.
        xenc = F.one_hot(torch.tensor([ix]), num_classes=27).float()
        logits = xenc @ W  # predict log-counts
        counts = logits.exp()  # counts, equivalent to N
        p = counts / counts.sum(1, keepdims=True)  # probabilities for next character

        # sample the next character index from that distribution
        drawn = torch.multinomial(p, num_samples=1, replacement=True, generator=g)
        ix = drawn.item()
        out.append(itos[ix])
        # sampling '.' (index 0) ends the name
        finished = ix == 0
    print(''.join(out))

ouwjdjdja.
pgadjufhqyywema.
emahbrltozia.
oggwt.
ema.


# Acknowledgements

* https://www.youtube.com/watch?v=PaCmpygFfXo&t=4649s
* https://github.com/karpathy/nn-zero-to-hero/blob/master/lectures/makemore/makemore_part1_bigrams.ipynb