In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file under the Kaggle input mount, one full path per line.
for folder, _subdirs, entries in os.walk('/kaggle/input'):
    for entry in entries:
        print(os.path.join(folder, entry))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/datasets/ayushuttarwar/makemore-names/names.txt


This notebook implements the model described in the research paper **A Neural Probabilistic Language Model** (Bengio et al., 2003).

The paper predicts the next word; here we stick to character-level prediction for now.

In [2]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Load the names dataset: one name per line.
# The context manager closes the file handle as soon as the text has been read,
# instead of leaving an open handle around for the garbage collector.
with open('/kaggle/input/datasets/ayushuttarwar/makemore-names/names.txt', 'r') as f:
    words = f.read().splitlines()
words[:10]

['emma',
 'olivia',
 'ava',
 'isabella',
 'sophia',
 'charlotte',
 'mia',
 'amelia',
 'harper',
 'evelyn']

In [4]:
len(words) # number of names (lines) loaded from names.txt

32033

In [5]:
# Character vocabulary: every distinct character found in the names, sorted,
# numbered from 1; index 0 is reserved for the '.' boundary marker.
chars = sorted(set(''.join(words)))
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
stoi['.'] = 0

# Reverse lookup (index -> character) to turn indices back into text.
itos = dict((idx, ch) for ch, idx in stoi.items())
print(itos)

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}


In [6]:
# Build (context -> next character) pairs from the first five names only,
# printing each pair so the sliding window is easy to follow.
block_size = 3 # number of preceding characters used to predict the next one

X, Y = [], []
for name in words[:5]:
    print('\n', name)
    window = [0] * block_size # every name starts from an all-'.' context
    for symbol in name + '.':
        target = stoi[symbol]
        X.append(window)
        Y.append(target)
        shown = ''.join(itos[k] for k in window)
        print(shown, '--->', itos[target])
        window = window[1:] + [target] # slide the window one character forward

X = torch.tensor(X)
Y = torch.tensor(Y)


 emma
... ---> e
..e ---> m
.em ---> m
emm ---> a
mma ---> .

 olivia
... ---> o
..o ---> l
.ol ---> i
oli ---> v
liv ---> i
ivi ---> a
via ---> .

 ava
... ---> a
..a ---> v
.av ---> a
ava ---> .

 isabella
... ---> i
..i ---> s
.is ---> a
isa ---> b
sab ---> e
abe ---> l
bel ---> l
ell ---> a
lla ---> .

 sophia
... ---> s
..s ---> o
.so ---> p
sop ---> h
oph ---> i
phi ---> a
hia ---> .


Generated training examples from the first five names only (`words[:5]`), printing each context/target pair.

In [7]:
# sanity check: X holds one row of block_size context indices per example, Y one target index per example
X.shape, X.dtype, Y.shape, Y.dtype

(torch.Size([32, 3]), torch.int64, torch.Size([32]), torch.int64)

These 32 examples come from only the first 5 names.

Now we need a lookup table (embedding matrix) as described in the paper. The authors embedded each word in a 30-dimensional space because their vocabulary had around 17k words. We will use 2-dimensional embeddings, since we have only 27 possible characters.

In [8]:
# Embedding lookup table: one 2-dimensional row for each of the 27 character indices.
# It replaces the one-hot encoding used before; a character's embedding is simply its row.
C = torch.rand(27, 2) # uniform random values in [0, 1)

In [9]:
C[5] # row 5 of the lookup table: the 2-D embedding of character index 5 ('e')

tensor([0.6827, 0.1142])

In [10]:
# previous way: represent index 5 as a length-27 one-hot vector (integer valued)
F.one_hot(torch.tensor(5), num_classes=27)

tensor([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0])

In [11]:
# multiplying the one-hot vector by C picks out row 5 of C, so the result equals C[5];
# .float() is needed because one_hot returns integers while C holds floats
F.one_hot(torch.tensor(5), num_classes=27).float() @ C

tensor([0.6827, 0.1142])

In [12]:
# indexing C with the whole (32, 3) index tensor embeds every context character at once
C[X].shape

torch.Size([32, 3, 2])

In [13]:
X[13, 2] # row index 13, context position 2 (both zero-based): a single character index

tensor(1)

In [14]:
# its embedding: the row of C that was looked up for that context slot
C[X][13, 2]

tensor([0.8997, 0.9250])

In [15]:
# same as looking up row 1 of C directly, because X[13, 2] is 1
C[1]

tensor([0.8997, 0.9250])

In [16]:
# embed every context in the dataset: one 2-D vector per context position per example
embedding = C[X]
embedding.shape

torch.Size([32, 3, 2])

In [17]:
# Hidden layer parameters, drawn from a standard normal distribution.
# Input width is 6: 3 context characters x 2 embedding values each; the layer has 100 units.
W1 = torch.randn(6, 100)
b1 = torch.randn(100) # one bias per hidden unit

In [18]:
# embedding @ W1 + b1 does not work directly: embedding is (32, 3, 2) but W1 expects 6 input features
# embedding[:, k, :] picks the embeddings of the k-th context character for every example -> (32, 2)

# concatenate the three per-position slices along dim 1 to get one 6-wide row per example
torch.cat([embedding[:, 0, :], embedding[:, 1, :], embedding[:, 2, :]], 1).shape

torch.Size([32, 6])

The approach above is brittle: the three slices are hard-coded, so it breaks as soon as the context size (`block_size`) changes.

In [19]:
# solution: torch.unbind splits embedding along dim 1 into one tensor per context position,
# so the concatenation no longer hard-codes how many positions there are
torch.cat(torch.unbind(embedding, 1), 1).shape

torch.Size([32, 6])

In [20]:
# another way: view() reshapes (32, 3, 2) into (32, 6), then the hidden layer's affine map is applied
h = embedding.view(-1, 6) @ W1 + b1

In [21]:
# one row of 100 hidden pre-activations per example
h.shape

torch.Size([32, 100])

In [22]:
# Hidden activations. Recomputed from `embedding` rather than from `h` itself so that
# re-running this cell cannot apply tanh a second time to already-squashed values.
h = torch.tanh(embedding.view(-1, 6) @ W1 + b1)

In [23]:
# Output layer parameters: map the 100 hidden units to one score per character (27 classes).
W2 = torch.randn(100, 27)
b2 = torch.randn(27) # one bias per output class

In [24]:
# output layer scores: one unnormalised logit per example for each of the 27 characters
logits = h @ W2 + b2
logits.shape

torch.Size([32, 27])

In [25]:
# normalization (softmax by hand): turn each row of logits into a probability distribution
counts = logits.exp() # exponentiate (this is exp, not log) so every entry becomes a positive "count"
prob = counts/counts.sum(1, keepdims=True) # divide each row by its own sum so it adds up to 1

In [26]:
# probability the model assigns to the correct next character of every example;
# the row index is derived from Y instead of a hard-coded 32 so it follows the dataset size
prob[torch.arange(Y.shape[0]), Y]

tensor([9.4772e-05, 9.9852e-06, 7.0622e-05, 5.9569e-02, 1.3666e-12, 1.0999e-05,
        2.8833e-14, 9.5393e-07, 6.7495e-11, 2.6604e-11, 2.9674e-01, 2.1823e-11,
        2.4872e-03, 1.6410e-10, 5.1098e-05, 1.6577e-12, 5.2779e-08, 2.0729e-11,
        8.0145e-05, 2.5365e-02, 1.8375e-03, 2.1160e-14, 3.7476e-08, 1.0971e-01,
        5.7546e-12, 1.0968e-11, 4.3021e-05, 3.6654e-10, 3.3014e-11, 3.7980e-03,
        2.4308e-07, 4.4441e-11])

In [27]:
# average negative log-likelihood of the correct characters;
# the row index is derived from Y instead of a hard-coded 32 so it follows the dataset size
loss = - prob[torch.arange(Y.shape[0]), Y].log().mean()
loss

tensor(16.2259)

### Current Flow

In [28]:
X.shape, Y.shape # dataset: 32 contexts of 3 character indices each, and 32 target indices

(torch.Size([32, 3]), torch.Size([32]))

In [29]:
# Re-create every parameter from one seeded generator so the run is reproducible.
# The draw order (C, W1, b1, W2, b2) determines the values, so it must not change.
g = torch.Generator().manual_seed(2147)
C = torch.randn(27, 2, generator=g) # embedding table: 27 characters x 2 dims
W1 = torch.randn(6, 100, generator=g) # hidden layer weights
b1 = torch.randn(100, generator=g) # hidden layer biases
W2 = torch.randn(100, 27, generator=g) # output layer weights
b2 = torch.randn(27, generator=g) # output layer biases

parameters = [C, W1, b1, W2, b2]

In [30]:
# total params: number of scalar values summed over C, W1, b1, W2 and b2
sum(p.nelement() for p in parameters)

3481

In [31]:
# switch on gradient tracking for every trainable tensor so loss.backward() fills .grad
for param in parameters:
    param.requires_grad = True

In [32]:
# Full-batch gradient descent on the 32 examples built from the first five names.
learning_rate = 0.1

for i in range(50):
    # forward pass
    embedding = C[X] # [32, 3, 2]
    h = torch.tanh(embedding.view(-1, 6) @ W1 + b1) # [32, 100]
    logits = h @ W2 + b2 # [32, 27]
    # F.cross_entropy does the same as the manual version
    # (exp -> normalise each row -> mean negative log-probability of the targets)
    loss = F.cross_entropy(logits, Y)
    print(f"loss {i} - {loss.item()}")

    # backward pass: clear the old gradients, then backpropagate
    for p in parameters:
        p.grad = None
    loss.backward()

    # update params in place; no_grad keeps the update out of the autograd graph
    # (the supported replacement for writing through the legacy `.data` attribute)
    with torch.no_grad():
        for p in parameters:
            p += -learning_rate * p.grad

loss 0 - 19.946651458740234
loss 1 - 16.90178108215332
loss 2 - 14.767463684082031
loss 3 - 12.79350757598877
loss 4 - 11.07797622680664
loss 5 - 9.553297996520996
loss 6 - 8.252640724182129
loss 7 - 7.171950817108154
loss 8 - 6.2747883796691895
loss 9 - 5.556882858276367
loss 10 - 4.990991592407227
loss 11 - 4.485724925994873
loss 12 - 4.016568183898926
loss 13 - 3.587977409362793
loss 14 - 3.208704710006714
loss 15 - 2.8841733932495117
loss 16 - 2.6148922443389893
loss 17 - 2.3880841732025146
loss 18 - 2.188617706298828
loss 19 - 2.0080254077911377
loss 20 - 1.8420310020446777
loss 21 - 1.6887472867965698
loss 22 - 1.547699213027954
loss 23 - 1.4193782806396484
loss 24 - 1.303961157798767
loss 25 - 1.2005606889724731
loss 26 - 1.1081064939498901
loss 27 - 1.0257477760314941
loss 28 - 0.9525542259216309
loss 29 - 0.8873440027236938
loss 30 - 0.8289357423782349
loss 31 - 0.7763463854789734
loss 32 - 0.7288233041763306
loss 33 - 0.6857371926307678
loss 34 - 0.646479606628418
loss 35 - 0

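The loss is this low because the model is overfitting: with only 32 training examples and 3,481 parameters, it can essentially memorise every context → target pair. It cannot reach exactly zero, though, because identical contexts (e.g. `...`) are followed by different targets.  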

Therefore, **the model is overfitting**.

In [33]:
# per-example maximum over the 27 classes: `values` is the largest logit, `indices` the predicted character index
logits.max(1)

torch.return_types.max(
values=tensor([ 4.5276,  8.8723, 11.3670, 18.0619, 16.8425,  4.5276,  7.8883, 14.5229,
        13.8298, 13.4347, 12.2948, 16.8088,  4.5276, 13.0339, 17.0011, 17.2538,
         4.5276,  9.1527, 12.9026, 17.0661, 11.8185, 11.6312, 14.5507, 13.0053,
        18.6138,  4.5276,  7.8701, 12.8536, 13.4554, 14.5056, 13.3688, 17.6940],
       grad_fn=<MaxBackward0>),
indices=tensor([19, 13, 13,  1,  0, 19, 12,  9, 22,  9,  1,  0, 19, 22,  1,  0, 19, 19,
         1,  2,  5, 12, 12,  1,  0, 19, 15, 16,  8,  9,  1,  0]))

Let's train our neural net on the whole dataset.

In [34]:
# Build the full training set: every name contributes one (context -> next character)
# pair per character, plus one final pair whose target is the '.' end marker.
block_size = 3 # number of preceding characters used to predict the next one

X, Y = [], []
for w in words:
    # block_size leading zeros ('.' padding) followed by the indices of the name and its closing '.'
    padded = [0] * block_size + [stoi[ch] for ch in w + '.']
    for start in range(len(padded) - block_size):
        X.append(padded[start:start + block_size]) # the context window
        Y.append(padded[start + block_size]) # the character right after it

X = torch.tensor(X)
Y = torch.tensor(Y)

In [35]:
X.shape, Y.shape # dataset: one context row in X and one target in Y per example, now built from all names

(torch.Size([228146, 3]), torch.Size([228146]))

### Model Training

In [36]:
# init weights: start again from the same seed so training on the full dataset
# begins from freshly drawn, untrained parameters. Draw order fixes the values.
g = torch.Generator().manual_seed(2147)
C = torch.randn(27, 2, generator=g) # embedding table
W1 = torch.randn(6, 100, generator=g) # hidden layer weights
b1 = torch.randn(100, generator=g) # hidden layer biases
W2 = torch.randn(100, 27, generator=g) # output layer weights
b2 = torch.randn(27, generator=g) # output layer biases

parameters = [C, W1, b1, W2, b2]

In [37]:
# total params: number of scalar values summed over C, W1, b1, W2 and b2
sum(p.nelement() for p in parameters)

3481

In [38]:
# enable backpropagation: autograd only fills .grad for tensors that track gradients
for param in parameters:
    param.requires_grad = True

In [39]:
# Full-batch gradient descent: every step runs the forward and backward pass over
# all N = X.shape[0] examples (228146 here), which is why each iteration is slow.
learning_rate = 0.1

for i in range(50):
    # forward pass
    embedding = C[X] # [N, 3, 2]
    h = torch.tanh(embedding.view(-1, 6) @ W1 + b1) # [N, 100]
    logits = h @ W2 + b2 # [N, 27]
    # F.cross_entropy does the same as the manual version
    # (exp -> normalise each row -> mean negative log-probability of the targets)
    loss = F.cross_entropy(logits, Y)
    print(f"loss {i} - {loss.item()}")

    # backward pass: clear the old gradients, then backpropagate
    for p in parameters:
        p.grad = None
    loss.backward()

    # update params in place; no_grad keeps the update out of the autograd graph
    # (the supported replacement for writing through the legacy `.data` attribute)
    with torch.no_grad():
        for p in parameters:
            p += -learning_rate * p.grad

loss 0 - 17.469799041748047
loss 1 - 16.451740264892578
loss 2 - 15.551148414611816
loss 3 - 14.729324340820312
loss 4 - 13.973725318908691
loss 5 - 13.286896705627441
loss 6 - 12.662596702575684
loss 7 - 12.090779304504395
loss 8 - 11.574865341186523
loss 9 - 11.112565994262695
loss 10 - 10.695737838745117
loss 11 - 10.323525428771973
loss 12 - 9.991181373596191
loss 13 - 9.688383102416992
loss 14 - 9.407522201538086
loss 15 - 9.144820213317871
loss 16 - 8.898130416870117
loss 17 - 8.666001319885254
loss 18 - 8.447325706481934
loss 19 - 8.241148948669434
loss 20 - 8.046584129333496
loss 21 - 7.862825393676758
loss 22 - 7.689204692840576
loss 23 - 7.525182247161865
loss 24 - 7.3702826499938965
loss 25 - 7.223979949951172
loss 26 - 7.085632801055908
loss 27 - 6.954457759857178
loss 28 - 6.829629421234131
loss 29 - 6.710395812988281
loss 30 - 6.596144676208496
loss 31 - 6.4864068031311035
loss 32 - 6.380835056304932
loss 33 - 6.2791547775268555
loss 34 - 6.181151390075684
loss 35 - 6.086

The loss is now much larger than in the previous run. With 228,146 examples the model can no longer memorise the data, and 50 steps of gradient descent are not enough to converge. Each iteration is also noticeably slower, because the forward and backward passes run over the whole dataset every time. In practice we split the data into mini-batches and take one update step per batch.

That is, at each step we randomly sample a small batch of examples and run the forward pass, backward pass and parameter update on that batch only.

In [40]:
# Mini-batch gradient descent: each step uses a small random sample of the dataset,
# so one iteration is cheap but the printed loss is a noisy per-batch estimate.
batch_size = 32
learning_rate = 0.1

for i in range(50):
    # mini batch: draw row indices (with replacement) from the seeded generator `g`
    # so that a Restart & Run All reproduces the same batches
    ix = torch.randint(0, X.shape[0], (batch_size,), generator=g)

    # forward pass
    embedding = C[X[ix]] # [32, 3, 2]
    h = torch.tanh(embedding.view(-1, 6) @ W1 + b1) # [32, 100]
    logits = h @ W2 + b2 # [32, 27]
    # F.cross_entropy does the same as the manual version
    # (exp -> normalise each row -> mean negative log-probability of the targets Y[ix])
    loss = F.cross_entropy(logits, Y[ix])
    print(f"loss {i} - {loss.item()}")

    # backward pass: clear the old gradients, then backpropagate
    for p in parameters:
        p.grad = None
    loss.backward()

    # update params in place; no_grad keeps the update out of the autograd graph
    # (the supported replacement for writing through the legacy `.data` attribute)
    with torch.no_grad():
        for p in parameters:
            p += -learning_rate * p.grad

loss 0 - 4.913791179656982
loss 1 - 5.341272830963135
loss 2 - 4.621477127075195
loss 3 - 5.644758701324463
loss 4 - 5.568265438079834
loss 5 - 4.749841213226318
loss 6 - 4.427259922027588
loss 7 - 5.607346534729004
loss 8 - 5.613461494445801
loss 9 - 4.537456512451172
loss 10 - 5.81048059463501
loss 11 - 4.529202461242676
loss 12 - 4.69546365737915
loss 13 - 4.563920021057129
loss 14 - 4.200355529785156
loss 15 - 5.380074977874756
loss 16 - 4.172908782958984
loss 17 - 3.766218900680542
loss 18 - 4.428376197814941
loss 19 - 4.415846824645996
loss 20 - 3.9466552734375
loss 21 - 4.537609577178955
loss 22 - 4.314426422119141
loss 23 - 5.616531848907471
loss 24 - 4.168978691101074
loss 25 - 4.18571138381958
loss 26 - 4.3689141273498535
loss 27 - 4.293596267700195
loss 28 - 5.0568437576293945
loss 29 - 4.112702369689941
loss 30 - 4.028636932373047
loss 31 - 5.737414836883545
loss 32 - 5.151851654052734
loss 33 - 4.0257368087768555
loss 34 - 4.844351768493652
loss 35 - 3.7057573795318604
los