# Multilayer Perceptron (MLP) for the names dataset 

In [39]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt

In [40]:
words = open('data/names.txt', 'r').read().split()
words[:5]

['emma', 'olivia', 'ava', 'isabella', 'sophia']

In [41]:
len(words)

32033

## Tokenization

In [42]:
chars = sorted(list(set(''.join(words))))
stringToIndex = {char:index + 1 for index, char in enumerate(chars)}
stringToIndex['.'] = 0
indexToString = {index:char for char, index in stringToIndex.items()}
print(indexToString)

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}


In [43]:
# Building the dataset
block_size = 3
X, Y = [], []
for word in words:
    # print(word)
    context = [0] * block_size
    for ch in word + '.':
        ix = stringToIndex[ch]
        X.append(context)
        Y.append(ix)
        # print(''.join(indexToString[i] for i in context), '------->', indexToString[ix])
        context = context[1:] + [ix] # Crop and append
        
X = torch.tensor(X)
Y = torch.tensor(Y)


In [44]:
# Embedding layer
C = torch.randn((27, 2))
emb = C[X]

# Weights to the hidden layer
W1 = torch.randn((6, 100))
b1 = torch.randn((100,))

## 3 ways to reshape the embedding layer

In [45]:
# Not useful, hard coded. What if my block_size changes?
torch.cat([emb[:, 0, :], emb[:, 1, :], emb[:, 2, :]], dim=1)

tensor([[-0.7281, -0.1343, -0.7281, -0.1343, -0.7281, -0.1343],
        [-0.7281, -0.1343, -0.7281, -0.1343, -0.2918,  1.2143],
        [-0.7281, -0.1343, -0.2918,  1.2143, -0.7647,  0.3684],
        ...,
        [ 0.0138, -1.1718,  0.0138, -1.1718,  1.2702, -0.3712],
        [ 0.0138, -1.1718,  1.2702, -0.3712,  0.0138, -1.1718],
        [ 1.2702, -0.3712,  0.0138, -1.1718,  0.0697,  1.3834]])

In [46]:
# Not efficient, we are creating another tensor
torch.cat(torch.unbind(emb, dim = 1), dim = 1)

tensor([[-0.7281, -0.1343, -0.7281, -0.1343, -0.7281, -0.1343],
        [-0.7281, -0.1343, -0.7281, -0.1343, -0.2918,  1.2143],
        [-0.7281, -0.1343, -0.2918,  1.2143, -0.7647,  0.3684],
        ...,
        [ 0.0138, -1.1718,  0.0138, -1.1718,  1.2702, -0.3712],
        [ 0.0138, -1.1718,  1.2702, -0.3712,  0.0138, -1.1718],
        [ 1.2702, -0.3712,  0.0138, -1.1718,  0.0697,  1.3834]])

In [47]:
# Efficient way
emb.view(-1, block_size * C.shape[1])

tensor([[-0.7281, -0.1343, -0.7281, -0.1343, -0.7281, -0.1343],
        [-0.7281, -0.1343, -0.7281, -0.1343, -0.2918,  1.2143],
        [-0.7281, -0.1343, -0.2918,  1.2143, -0.7647,  0.3684],
        ...,
        [ 0.0138, -1.1718,  0.0138, -1.1718,  1.2702, -0.3712],
        [ 0.0138, -1.1718,  1.2702, -0.3712,  0.0138, -1.1718],
        [ 1.2702, -0.3712,  0.0138, -1.1718,  0.0697,  1.3834]])

In [48]:
h = torch.tanh(emb.view(-1, block_size * C.shape[1])@(W1) + b1)
h.shape

torch.Size([228146, 100])

### Output layer

In [49]:
W2 = torch.randn((100, 27))
b2 = torch.randn(27)
logits = h@W2 + b2
counts = logits.exp()
prob = counts/counts.sum(1, keepdim=True)
loss = prob[torch.arange(len(Y)), Y].log().neg().mean()

In [50]:
loss

tensor(17.2676)

In [51]:
prob[torch.arange(prob.shape[0]), Y]

tensor([8.3029e-08, 1.8941e-12, 5.0899e-10,  ..., 9.3329e-11, 3.4140e-05,
        5.0707e-05])

In [52]:
prob.shape[0]

228146

# Rewriting everything:

In [53]:
g = torch.Generator().manual_seed(42)
C = torch.randn((27,2))

# Generating random parameters
W1 = torch.randn((block_size * C.shape[1], 100))
b1 = torch.randn((100,))
W2 = torch.randn((100, 27))
b2 = torch.randn(27)
params = [C, W1, b1, W2, b2]
print("number of parameters: " + str(sum(p.numel() for p in params)))
# Forward pass:

# Embedding layer
emb = C[X]

# First hidden layer
h = torch.tanh(emb.view(-1, block_size * C.shape[1])@W1 + b1)

# Output layer
logits = h@W2 + b2

# Softmax layer (We can use cross_entropy easily. But we are doing it manually)
counts = logits.exp()
prob = counts/counts.sum(1, keepdim=True)
loss = prob[torch.arange(len(Y)), Y].log().neg().mean()
print(loss.item())
# Use cross_entropy
loss2 = F.cross_entropy(logits, Y) 
print(loss2.item())

if loss.item() == loss2.item():
    print("Both losses are the same")
else:
    print("The algorithm is estimating the loss in a different way")

number of parameters: 3481
19.002046585083008
19.002044677734375
The algorithm is estimating the loss in a different way


In [54]:
for p in params:
    p.requires_grad_()

In [55]:
# Forward pass:
for _ in range(100):
    # Embedding layer
    emb = C[X]

    # First hidden layer
    h = torch.tanh(emb.view(-1, block_size * C.shape[1])@W1 + b1)

    # Output layer
    logits = h@W2 + b2
    loss = F.cross_entropy(logits, Y)
    print(loss.item())
    # Backward pass
    for p in params:
        p.grad = None

    loss.backward()

    for p in params:
        # print(p.grad)
        p.data -= .2 * p.grad

19.002044677734375
15.969666481018066
14.42932415008545
13.266266822814941
12.3394193649292
11.692951202392578
11.033001899719238
10.41457748413086
9.779646873474121
9.237674713134766
8.751880645751953
8.359370231628418
7.953746318817139
7.686043739318848
7.310216426849365
7.117526531219482
6.761959552764893
6.577325344085693
6.305647850036621
6.150050163269043
5.909390926361084
5.7665557861328125
5.56456995010376
5.447741985321045
5.270220756530762
5.17670202255249
5.005305767059326
4.929655075073242
4.769920349121094
4.716645240783691
4.564993381500244
4.5351457595825195
4.3869948387146
4.379265785217285
4.233891487121582
4.246975421905518
4.103461742401123
4.134256362915039
3.9933578968048096
4.038576126098633
3.9004664421081543
3.956969738006592
3.82228684425354
3.8878374099731445
3.7558107376098633
3.827773094177246
3.6978602409362793
3.7737820148468018
3.6449334621429443
3.7230238914489746
3.5956618785858154
3.675748586654663
3.549621820449829
3.631850242614746
3.5073556900024414

## Minibatch + Learning rate

In [56]:
g = torch.Generator().manual_seed(42)
C = torch.randn((27,2))

# Generating random parameters
W1 = torch.randn((block_size * C.shape[1], 100))
b1 = torch.randn((100,))
W2 = torch.randn((100, 27))
b2 = torch.randn(27)
params = [C, W1, b1, W2, b2]
print("number of parameters: " + str(sum(p.numel() for p in params)))

# Hyperparams: 
learning_rate = .5
n_epochs = 10000
batch_size = 128

for p in params:
    p.requires_grad_()
    
for _ in range(n_epochs):
    # Mini-batch construction:
    ix = torch.randint(0, X.shape[0], (batch_size,), generator=g)
    
    # Forward pass:
    # Embedding layer
    emb = C[X[ix]]

    # First hidden layer
    h = torch.tanh(emb.view(-1, block_size * C.shape[1])@W1 + b1)

    # Output layer
    logits = h@W2 + b2
    loss = F.cross_entropy(logits, Y[ix])
    # print(loss.item())
    
    # Backward pass
    for p in params:
        p.grad = None
    loss.backward()
    
    # Update
    for p in params:
        # print(p.grad)
        p.data -= learning_rate * p.grad

# Print loss with the whole dataset
# Forward pass:
# Embedding layer
emb = C[X]
h = torch.tanh(emb.view(-1, block_size * C.shape[1])@W1 + b1)
logits = h@W2 + b2
loss = F.cross_entropy(logits, Y)
print(loss.item())

number of parameters: 3481
2.4081270694732666


In [57]:
# lre = torch.linspace(-3, 0, 1000)
# lrs = 10**lre

# lri = []
# lossi = []

# for i in range(1000):
#     ix = torch.randint(0, X.shape[0], (batch_size,), generator=g)
#     emb = C[X[ix]]
#     h = torch.tanh(emb.view(-1, block_size * C.shape[1])@W1 + b1)
#     logits = h@W2 + b2
#     loss = F.cross_entropy(logits, Y[ix])
        
#     for p in params:
#         p.grad = None
#     loss.backward()
#     lr = lrs[i]
#     for p in params:
#         p.data -= lr * p.grad
    
#     lri.append(lr)
#     lossi.append(loss.item())
    
# plt.plot(lre, lossi)

## Train/dev/test split

In [None]:
# Train/dev/test split
# 80% train, 10% dev, 10% test
def build_dataset(words, block_size):
    