In [90]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt

In [91]:
# Read the dataset: one name per line. The context manager guarantees the
# file handle is closed as soon as the contents have been read.
with open('names.txt', 'r') as names_file:
    words = names_file.read().splitlines()
len(words)

32033

In [92]:
# Vocabulary: every distinct character found in the names, in sorted order,
# with '.' placed at index 0 (it is used as the padding / end-of-word marker).
chars = ['.'] + sorted(set(''.join(words)))
# character -> integer index
stoi = {ch: ix for ix, ch in enumerate(chars)}
# integer index -> character (inverse of stoi)
itos = {ix: ch for ch, ix in stoi.items()}

In [93]:
def build_dataset(words, block_size=3, char_to_index=None):
    """Turn a list of words into (context, next-character) training pairs.

    Each word is terminated with '.', and every character of the terminated
    word becomes one target. Its input is the `block_size` characters that
    precede it, left-padded with '.' at the start of the word.

    Args:
        words: iterable of strings whose characters are all keys of the vocabulary.
        block_size: number of preceding characters used as context (default 3).
        char_to_index: mapping from character to integer index; must contain '.'.
            Defaults to the notebook-level `stoi` mapping.

    Returns:
        X: int64 tensor of shape (num_examples, block_size) holding context indices.
        Y: int64 tensor of shape (num_examples,) holding target indices.

    Raises:
        ValueError: if `block_size` is smaller than 1.
        KeyError: if a character is missing from the vocabulary.
    """
    if block_size < 1:
        raise ValueError("block_size must be a positive integer")
    if char_to_index is None:
        char_to_index = stoi  # vocabulary defined at notebook level
    pad_ix = char_to_index['.']

    X, Y = [], []
    for w in words:
        # Rolling window of indices; starts as all padding for every new word.
        context = [pad_ix] * block_size
        for ch in w + '.':
            target_ix = char_to_index[ch]
            X.append(context)
            Y.append(target_ix)
            # Slide the window: drop the oldest index, append the newest.
            # A new list is built each time, so rows already in X are never mutated.
            context = context[1:] + [target_ix]

    # Explicit dtype/shape keep the result well-formed even when `words` is empty.
    X = torch.tensor(X, dtype=torch.long).view(-1, block_size)
    Y = torch.tensor(Y, dtype=torch.long)
    return X, Y

# X = torch.tensor(X)
# Y = torch.tensor(Y)
# X[0], X.shape, Y.shape

import random

# Shuffle a *copy* so that `words` keeps its original order. Shuffling `words`
# in place made this cell non-idempotent: every re-run reshuffled an already
# shuffled list and silently produced a different train/dev/test split.
words_shuffled = list(words)
random.seed(40)
random.shuffle(words_shuffled)

# Split boundaries: first 80% -> train, next 10% -> dev, final 10% -> test.
n1 = int(0.8 * len(words_shuffled))
n2 = int(0.9 * len(words_shuffled))

Xtr, Ytr = build_dataset(words_shuffled[:n1])
Xdev, Ydev = build_dataset(words_shuffled[n1:n2])
Xte, Yte = build_dataset(words_shuffled[n2:])

Xtr.shape, Ytr.shape

(torch.Size([182469, 3]), torch.Size([182469]))

In [6]:
# C = torch.randn((len(chars), 2))

In [7]:
# emb = C[X]
# emb.shape

In [8]:
# W1 = torch.randn((6, 100), requires_grad=True)
# b1 = torch.randn(100)
# W1.shape, b1.shape

In [9]:
### Exploring pytorch ###

# torch.cat(torch.unbind(emb, 1), 1)
# torch.unbind(emb, 1)
# emb.storage()
# emb.view(12, 6)
    # emb.view(-1, 6) # -> here -1 lets PyTorch infer the remaining dimension, e.g. 12*6/6 = 12 rows

In [10]:
# A1 = torch.tanh(emb.view(-1, 6) @ W1 + b1)
# A1.shape

In [11]:
# W2 = torch.randn((100, len(chars)), requires_grad=True)
# b2 = torch.randn(len(chars))

In [12]:
######## my try ##########

# logits = A1 @ W2 + b2
# counts = logits.exp()
# probs = counts / counts.sum(1, keepdims=True)
# loss = -probs[torch.arange(12), Y].log().mean()
# print(loss)

# W2.grad = None
# W1.grad = None
# loss.backward(retain_graph=True)

# W2.data += -10*W2.grad
# W1.data += -10*W1.grad

###########################

In [13]:
######## Making respectable :) ########

In [94]:
# Sanity check: display the shapes of the training inputs and targets.
Xtr.shape, Ytr.shape

(torch.Size([182469, 3]), torch.Size([182469]))

In [95]:
# parameters
# Seeded generator: a fresh "Restart & Run All" always starts from the same
# initial weights, so results can be compared across runs.
param_rng = torch.Generator().manual_seed(2147483647)

EMB_DIM = 2                 # size of each character embedding vector
HIDDEN_DIM = 100            # number of units in the tanh hidden layer
context_len = Xtr.shape[1]  # characters of context per example (columns of Xtr)

C = torch.randn((len(chars), EMB_DIM), generator=param_rng)                # embedding table
W1 = torch.randn((context_len * EMB_DIM, HIDDEN_DIM), generator=param_rng)  # hidden weights
b1 = torch.randn(HIDDEN_DIM, generator=param_rng)                          # hidden bias
W2 = torch.randn((HIDDEN_DIM, len(chars)), generator=param_rng)            # output weights
b2 = torch.randn(len(chars), generator=param_rng)                          # output bias
parameters = [C, W1, b1, W2, b2]

In [96]:
# Turn on gradient tracking for every parameter tensor so that
# loss.backward() fills in their .grad fields.
for p in parameters:
    p.requires_grad_(True)
# total number of scalar values across all parameters
sum(param.numel() for param in parameters)

3481

In [48]:
# for _ in range(10):
#     ## forward
#     emb = C[X]
#     A1 = torch.tanh(emb.view(-1, 6) @ W1 + b1)
#     logits = A1 @ W2 + b2

#     # # now applying softmax
#     # counts = logits.exp()
#     # probs = counts / counts.sum(1, keepdims=True)
#     # # calculating loss by mean of negative log likelihoods
#     # loss = -probs[torch.arange(12), Y].log().mean()

#     # The lines above compute the softmax and then the loss by hand.
#     # This is so common that PyTorch has a built-in for it: torch.nn.functional.cross_entropy(),
#     # >> which computes the cross entropy between input and target (here logits and Y)...

#     loss = F.cross_entropy(logits, Y)

#     ## backward
#     for p in parameters:
#         p.grad = None
#     loss.backward()

#     ## updation
#     for p in parameters:
#         p.data += -0.1 * p.grad

#     print(loss.item())

In [97]:
## Exploring learning rates

# Candidate learning rates for a sweep: 1000 exponents spaced evenly over
# [-3, 0], turned into rates 10**exponent (i.e. from 1e-3 up to 1).
lr_exponential = torch.linspace(start=-3, end=0, steps=1000)
lrates = 10 ** lr_exponential

In [102]:
## Mini-batch SGD training loop

NUM_STEPS = 10000          # number of gradient steps taken by this cell
BATCH_SIZE = 32            # examples sampled per step
LEARNING_RATE = 10 ** -1   # exponent -1 looked best in the learning-rate sweep

## keeping track of learning rates and respective losses
# Only filled during a learning-rate sweep, i.e. when `lr` is taken from `lrates`
# and (exponent, loss) pairs are appended at each step; left empty in a normal run.
lr_index, loss_index = [], []

# Seeded generator so the sequence of sampled minibatches is reproducible.
batch_rng = torch.Generator().manual_seed(2147483647)

for i in range(NUM_STEPS):
    ## training through batches - batch construct
    ix = torch.randint(0, Xtr.shape[0], (BATCH_SIZE,), generator=batch_rng)

    ## forward
    emb = C[Xtr[ix]]
    # Flatten each example's context embeddings into one row; the row width is
    # inferred, so nothing here depends on a hard-coded context/embedding size.
    A1 = torch.tanh(emb.view(emb.shape[0], -1) @ W1 + b1)
    logits = A1 @ W2 + b2

    # F.cross_entropy (torch.nn.functional.cross_entropy) fuses the manual
    # "softmax -> mean negative log-likelihood" computation into a single call.
    loss = F.cross_entropy(logits, Ytr[ix])

    ## backward
    for p in parameters:
        p.grad = None
    loss.backward()

    ## update
    lr = LEARNING_RATE
    # Update in place under no_grad instead of writing through `.data`:
    # the step is kept out of the autograd graph without bypassing its checks.
    with torch.no_grad():
        for p in parameters:
            p -= lr * p.grad

# Loss of the last minibatch only (a noisy estimate of the training loss).
print(loss.item())

2.3479912281036377


In [84]:
# plt.plot(lr_index, loss_index)

# ## from graph we can observe that, -1.0 is a pretty good exponential power for our learning rate

In [103]:
## calculating loss on the dev/validation split (data not used for the gradient updates)

# Evaluation only: no_grad avoids building an autograd graph over the whole split.
with torch.no_grad():
    emb = C[Xdev]
    # Row width is inferred, so this matches whatever context/embedding size was trained.
    A1 = torch.tanh(emb.view(emb.shape[0], -1) @ W1 + b1)
    logits = A1 @ W2 + b2
    loss = F.cross_entropy(logits, Ydev)
print(loss.item())

2.3685553073883057


In [None]:

## In some cases, neural networks are powerful enough to memorize the entire training set - overfitting

# Due to these reasons, we do care a lot about dividing total dataset into 3
# >> Training Split, Dev/Validation Split, Test Split
# >> 80 % , 10 % , 10 %

# We should check the loss on the dev/validation set (unseen data for the model) only sparingly,
# and run on the test set just once - that single result is the performance metric reported for the model (in a paper, etc.).

# If the training and dev losses are nearly equal, we are not overfitting the model - that's good.