In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt 
%matplotlib inline

In [2]:
import numpy as np

In [3]:
import os
os.chdir("../")

In [4]:
import copy

In [5]:
words = open('data/names.txt', 'r').read().splitlines()
words[:8]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [6]:
len(words)

32033

In [7]:
## building vocab of chars & mapping to/from integers

chars = sorted(list(set(''.join(words))))
stoi = {s:i + 1 for i,s in enumerate(chars)}
stoi["."] = 0
itos = {i:s for s,i in stoi.items()}

In [8]:
print(itos)

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}


In [10]:
## Building the data set

block_size = 3 ## context length: How many chars to take to predict the next one?

X, Y = [], []

for w in words:
    context = [0] * block_size # -> 1st Iter [0, 0, 0]
    
    for ch in w + ".":

        ix = stoi[ch] ## Encoded values of character
        
        X.append(context) ## 1st Iter [0, 0, 0]
        Y.append(ix) ## 1st Iter [itos[ch]]

        context = context[1:] + [ix] ## Rolling Operation

X = torch.tensor(X)
Y = torch.tensor(Y)

#### Understanding Generated X Shape & Y Shape

1. X shape is (N, Context Size). Where N is number of input samples generated while creating X, Y Pairs and Context Size is defined above

2. Y Shape is (N). For each Input Sample, it contains the next character Output

In [11]:
X.shape, X.dtype, 

(torch.Size([228146, 3]), torch.int64)

In [12]:
Y.shape, Y.dtype

(torch.Size([228146]), torch.int64)

## Projecting characters into 2 Dimensional Space (Embedding Look Up Table)

Number of Chars: 27

Dimensions: 2

In [13]:
C = torch.randn((27, 2))

### Accessing entries of Look-Up Matrix C

In [14]:
## For 5 encoded char "e"
## itos[5]
C[5]

tensor([0.5470, 0.7325])

In [15]:
## For 0 encoded char "a"
## itos[5]
C[0]

tensor([0.2060, 1.3260])

In [16]:
## Is there a better way to index Look Up Matrix?
## Yes One Hot Encoding
F.one_hot(torch.tensor(5), num_classes=27).float() @ C

tensor([0.5470, 0.7325])

In [17]:
## First Layer of Neural Language Model is essentially, look up into Embedding matrices for Provide Input

### Getting values for 5th Row & 0th Row simultaneously
C[[5, 0]]

tensor([[0.5470, 0.7325],
        [0.2060, 1.3260]])

In [18]:
### Getting values for 5th Row , 6th Row & 7th Row ,7th Row, 7th Row simultaneously

C[torch.tensor([5, 6, 7, 7, 7])]

tensor([[ 0.5470,  0.7325],
        [ 2.5901,  0.8280],
        [-0.5639, -0.2117],
        [-0.5639, -0.2117],
        [-0.5639, -0.2117]])

In [19]:
X.shape, X.dtype, 

(torch.Size([228146, 3]), torch.int64)

### How can I get Embedding Look Up entries for X Input `[N, Context Size]`

In [20]:
## Slicing X Tensor for Single word emma (1st name in data)
X_single_name = X[0:4, :]

In [21]:
X_single_name

tensor([[ 0,  0,  0],
        [ 0,  0,  5],
        [ 0,  5, 13],
        [ 5, 13, 13]])

In [22]:
## In each row of X
## Number of encoded chars == context sizr
## For each encodeed chars -> (, 2) shape Look-Up Embedding Table output

## End Shape would look like
## (N, Context_Size, Embedding Dim)
C[X_single_name], C[X_single_name].shape

(tensor([[[ 0.2060,  1.3260],
          [ 0.2060,  1.3260],
          [ 0.2060,  1.3260]],
 
         [[ 0.2060,  1.3260],
          [ 0.2060,  1.3260],
          [ 0.5470,  0.7325]],
 
         [[ 0.2060,  1.3260],
          [ 0.5470,  0.7325],
          [-0.5350, -0.9500]],
 
         [[ 0.5470,  0.7325],
          [-0.5350, -0.9500],
          [-0.5350, -0.9500]]]),
 torch.Size([4, 3, 2]))

In [23]:
X[13, 2]

tensor(1)

In [24]:
C[X][13,2]

tensor([-1.5832, -0.8523])

In [25]:
C[1]

tensor([-1.5832, -0.8523])

In [26]:
## We can embedd directly by passing X into C and it should work!

emb = C[X]
emb.shape

torch.Size([228146, 3, 2])

In [27]:
### Constructing Hidden Layer

W1 = torch.randn((6, 100))
b1 = torch.randn(100)

In [28]:
### Embedding which is input to Hidden Layer has shape of [N, context, embedding dimension] -> (32, 3, 2)
### While Hidden layer neuron shape is (context * embedding dimension, hidden_dim) -> (6, 100)

### Last 2 dimensions of embedding layer needs to combined together so that shape matches for matmul

### One way to solve is to use torch cat function

torch.cat([emb[:, 0, :], emb[:, 1, :], emb[:, 2, :]], dim=1).shape

torch.Size([228146, 6])

In [29]:
torch.cat([emb[:, 0, :], emb[:, 1, :], emb[:, 2, :]], dim=1).shape

torch.Size([228146, 6])

In [30]:
### More generic way would be use torch unbind and then use torch cat
### Unbinding is done on dimension where context size is defined

torch.cat(torch.unbind(emb, dim=1), dim=1)

tensor([[ 0.2060,  1.3260,  0.2060,  1.3260,  0.2060,  1.3260],
        [ 0.2060,  1.3260,  0.2060,  1.3260,  0.5470,  0.7325],
        [ 0.2060,  1.3260,  0.5470,  0.7325, -0.5350, -0.9500],
        ...,
        [-0.5245,  0.7753, -0.5245,  0.7753,  1.6295, -2.1128],
        [-0.5245,  0.7753,  1.6295, -2.1128, -0.5245,  0.7753],
        [ 1.6295, -2.1128, -0.5245,  0.7753, -0.4277, -0.2231]])

In [31]:
Input_Embedding_Proj = copy.deepcopy(C[X])

In [32]:
Input_Embedding_Proj.shape

torch.Size([228146, 3, 2])

In [33]:
#### Easier way to do this

a = torch.arange(18)
a 

tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17])

In [34]:
a.view(2, 9)

tensor([[ 0,  1,  2,  3,  4,  5,  6,  7,  8],
        [ 9, 10, 11, 12, 13, 14, 15, 16, 17]])

In [35]:
a.view(9, 2)

tensor([[ 0,  1],
        [ 2,  3],
        [ 4,  5],
        [ 6,  7],
        [ 8,  9],
        [10, 11],
        [12, 13],
        [14, 15],
        [16, 17]])

In [36]:
emb.shape

torch.Size([228146, 3, 2])

In [38]:
h = torch.tanh(emb.view(emb.shape[0], 6) @ W1 + b1)

In [39]:
h

tensor([[-0.9998,  0.9995,  0.9913,  ..., -1.0000, -0.9991, -0.9838],
        [-1.0000,  0.9998,  0.9988,  ..., -1.0000, -0.9992, -0.9913],
        [-1.0000,  0.9727,  0.9998,  ..., -0.9930, -0.9999, -0.9981],
        ...,
        [-1.0000,  0.9999,  1.0000,  ..., -0.1978, -0.9654, -0.9983],
        [-0.5329, -0.0233, -0.9484,  ...,  0.9999,  0.9862,  0.8674],
        [ 0.9943, -0.0609, -0.8679,  ...,  0.7585, -1.0000,  0.8208]])

In [40]:
### Output Layer

W2 = torch.randn((100, 27))
b2 = torch.randn(27)

In [41]:
logits = h @ W2 + b2

In [42]:
loss = F.cross_entropy(logits, Y)

In [43]:
loss

tensor(18.9258)

#### Training Loop 

In [44]:
X.shape, Y.shape ## DataSet

(torch.Size([228146, 3]), torch.Size([228146]))

In [87]:
g = torch.Generator().manual_seed(2147483647)

C = torch.randn((27, 2), generator=g, requires_grad=True)

W1 = torch.randn((6, 100), generator=g, requires_grad=True)
b1 = torch.randn(100, generator=g, requires_grad=True)

W2 = torch.randn((100, 27), generator=g, requires_grad=True)
b2 = torch.randn(27, generator=g, requires_grad=True)

parameters = [C, W1, b1, W2, b2]

In [88]:
sum(p.nelement() for p in parameters)

3481

In [89]:
lre = torch.linspace(-3, 0, 1000)
lrs = 10 ** lre

In [None]:
for i in range(100):

    # Adding MiniBatch Gradient Descent
    ix = torch.randint(0, X.shape[0], (32,))
    
    # Forward Pass
    emb = C[X[ix]]

    h = torch.tanh(emb.view(emb.shape[0], 6) @ W1 + b1)

    logits = h @ W2 + b2

    loss = F.cross_entropy(logits, Y[ix])

    print(loss.item())


    ## Backward pass 

    for p in parameters:
        p.grad = None

    loss.backward()

    ## Update

    ## Trying to find optimal learning rate
    for p in parameters:
        p.data += -10 * p.grad

print(loss.item())


17.666305541992188
30.291290283203125
31.603822708129883
50.273311614990234
49.95822525024414
87.09912109375
101.19480895996094
122.49085235595703
115.34754943847656
90.31035614013672
87.30307006835938
88.55764770507812
67.53502655029297
78.29459381103516
92.36698913574219
60.767738342285156
63.85088348388672
88.06515502929688
65.9170913696289
59.66213607788086
48.11647033691406
59.767967224121094
98.26689910888672
59.89116668701172
53.84268569946289
65.12677764892578
59.58260726928711
53.20188522338867
44.91386413574219
49.427120208740234
53.59870147705078
62.19932174682617
72.96302032470703
62.54657745361328
53.9007453918457
47.845558166503906
51.19365310668945
81.28524017333984
39.595436096191406
54.926246643066406
61.035823822021484
34.24104309082031
58.86170196533203
47.409637451171875
48.2187614440918
44.01480484008789
83.16985321044922
52.79801940917969
58.72315979003906
69.07988739013672
77.57203674316406
77.05485534667969
45.63193893432617
77.07035827636719
41.51633834838867
6

In [72]:
emb = C[X]

h = torch.tanh(emb.view(emb.shape[0], 6) @ W1 + b1)

logits = h @ W2 + b2

loss = F.cross_entropy(logits, Y)

loss

tensor(2.7823, grad_fn=<NllLossBackward0>)

In [50]:
torch.randint(0, X.shape[0], (32,))

tensor([ 48692,  67586,  66684, 214780, 102751, 115361, 105660,  41623, 115015,
         43001, 128232,  80338,  15714, 129476, 136848, 118974,  97678,  43229,
         83395,  96898,  46489, 187445,   4345, 116265,   9145, 120745, 227244,
          2695, 139178,  92462,  27427,  15323])

In [91]:
torch.randint(0, X.shape[0], (32,))

tensor([106901,  77036, 133933,   3455, 225203, 207970, 173065,  53266,  38111,
        181991,  17254,  37707, 218742,  11206,   5349, 220088, 208875, 132827,
         38117, 118856, 123681,  35081, 181288,  97089,  23358, 131305, 116122,
         25171, 197165, 173775, 173288, 112720])

In [None]:
## Training Split, Validation Split, Test Split 
# 80%, 10%, 10%