In [82]:
import math
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from graphviz import Digraph
import torch
import torch.nn.functional as F



In [61]:
words = open('names.txt', 'r').read().splitlines()

for word in words[:10]:
  print(word)

emma
olivia
ava
isabella
sophia
charlotte
mia
amelia
harper
evelyn


In [62]:

# build the vocabulary of characters and mappings to/from integers
chars = sorted(list(set(''.join(words))))
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0
itos = {i:s for s,i in stoi.items()}
# print(itos)

def build_dataset(words, block_size):  
  X, Y = [], []
  for w in words:

    #print(w)
    context = [0] * block_size
    for ch in w + '.':
      ix = stoi[ch]
      X.append(context)
      Y.append(ix)
      #print(''.join(itos[i] for i in context), '--->', itos[ix])
      context = context[1:] + [ix] # crop and append

  X = torch.tensor(X)
  Y = torch.tensor(Y)
  # print(X.shape, Y.shape)
  return X, Y

X, Y = build_dataset(words[:1], block_size=3)
print(X.shape, Y.shape)
print(X)
print(Y)


torch.Size([5, 3]) torch.Size([5])
tensor([[ 0,  0,  0],
        [ 0,  0,  5],
        [ 0,  5, 13],
        [ 5, 13, 13],
        [13, 13,  1]])
tensor([ 5, 13, 13,  1,  0])


In [63]:
C = torch.randn((27,2))
emb = C[X]
print(C.shape)
print(X.shape)
print(emb.shape)



torch.Size([27, 2])
torch.Size([5, 3])
torch.Size([5, 3, 2])


In [64]:
[emb[:, 0, :], emb[:, 1, :], emb[:, 2, :]]


[tensor([[-0.1854,  1.2178],
         [-0.1854,  1.2178],
         [-0.1854,  1.2178],
         [-0.5137,  0.9807],
         [ 0.7629, -0.9416]]),
 tensor([[-0.1854,  1.2178],
         [-0.1854,  1.2178],
         [-0.5137,  0.9807],
         [ 0.7629, -0.9416],
         [ 0.7629, -0.9416]]),
 tensor([[-0.1854,  1.2178],
         [-0.5137,  0.9807],
         [ 0.7629, -0.9416],
         [ 0.7629, -0.9416],
         [ 1.0453, -0.1175]])]

In [65]:
torch.cat([emb[:, 0, :], emb[:, 1, :], emb[:, 2, :]], dim=1)

tensor([[-0.1854,  1.2178, -0.1854,  1.2178, -0.1854,  1.2178],
        [-0.1854,  1.2178, -0.1854,  1.2178, -0.5137,  0.9807],
        [-0.1854,  1.2178, -0.5137,  0.9807,  0.7629, -0.9416],
        [-0.5137,  0.9807,  0.7629, -0.9416,  0.7629, -0.9416],
        [ 0.7629, -0.9416,  0.7629, -0.9416,  1.0453, -0.1175]])

In [66]:
torch.unbind(emb, dim=1)

(tensor([[-0.1854,  1.2178],
         [-0.1854,  1.2178],
         [-0.1854,  1.2178],
         [-0.5137,  0.9807],
         [ 0.7629, -0.9416]]),
 tensor([[-0.1854,  1.2178],
         [-0.1854,  1.2178],
         [-0.5137,  0.9807],
         [ 0.7629, -0.9416],
         [ 0.7629, -0.9416]]),
 tensor([[-0.1854,  1.2178],
         [-0.5137,  0.9807],
         [ 0.7629, -0.9416],
         [ 0.7629, -0.9416],
         [ 1.0453, -0.1175]]))

In [67]:
torch.cat(torch.unbind(emb, dim=1), dim=1)

tensor([[-0.1854,  1.2178, -0.1854,  1.2178, -0.1854,  1.2178],
        [-0.1854,  1.2178, -0.1854,  1.2178, -0.5137,  0.9807],
        [-0.1854,  1.2178, -0.5137,  0.9807,  0.7629, -0.9416],
        [-0.5137,  0.9807,  0.7629, -0.9416,  0.7629, -0.9416],
        [ 0.7629, -0.9416,  0.7629, -0.9416,  1.0453, -0.1175]])

In [68]:
emb.view(emb.shape[0], -1)

tensor([[-0.1854,  1.2178, -0.1854,  1.2178, -0.1854,  1.2178],
        [-0.1854,  1.2178, -0.1854,  1.2178, -0.5137,  0.9807],
        [-0.1854,  1.2178, -0.5137,  0.9807,  0.7629, -0.9416],
        [-0.5137,  0.9807,  0.7629, -0.9416,  0.7629, -0.9416],
        [ 0.7629, -0.9416,  0.7629, -0.9416,  1.0453, -0.1175]])

In [None]:
W1 = torch.randn((6, 100))
b1 = torch.randn(100)

In [70]:
h = torch.tanh(emb.view(emb.shape[0], -1) @ W1 + b1)
print(h.shape)

torch.Size([5, 100])


In [74]:
W2 = torch.randn((100, 27))
b2 = torch.randn(27)

logits = h @ W2 + b2
counts = logits.exp()
probs = counts / counts.sum(1, keepdim=True)
loss = -probs[torch.arange(probs.shape[0]), Y].log().mean()
print(loss)

tensor(17.8618)


In [93]:
# Made it clean
block_size = 3  
batch_size = 32
emd_dim = 2
hidden_dim = 100
learning_rate = 1e-2
learning_steps = 10000

X, Y = build_dataset(words, block_size=block_size)


g = torch.Generator().manual_seed(2147483647)
C = torch.randn((27, emd_dim), generator=g)
W1 = torch.randn((6, hidden_dim), generator=g)
b1 = torch.randn(hidden_dim, generator=g)
W2 = torch.randn((hidden_dim, 27), generator=g)
b2 = torch.randn(27, generator=g)

parameters = [C, W1, b1, W2, b2]
for p in parameters:
    p.requires_grad = True

for i in range(learning_steps):

    # minibatch
    ix = torch.randint(0, X.shape[0], (batch_size,))

    # forward pass
    emb = C[X[ix]]
    h = torch.tanh(emb.view(emb.shape[0], -1) @ W1 + b1)
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, Y[ix])
    print(i, loss.data)

    # backward pass
    for p in parameters:
        p.grad = None
    loss.backward()

    # update
    for p in parameters:
        p.data += -learning_rate * p.grad

# last forward pass
emb = C[X]
h = torch.tanh(emb.view(emb.shape[0], -1) @ W1 + b1)
logits = h @ W2 + b2
loss = F.cross_entropy(logits, Y)
print("last", loss.data)

0 tensor(21.4013)
1 tensor(19.4396)
2 tensor(18.7556)
3 tensor(17.0685)
4 tensor(15.5378)
5 tensor(17.5943)
6 tensor(17.3678)
7 tensor(16.7079)
8 tensor(16.0178)
9 tensor(16.3646)
10 tensor(16.6561)
11 tensor(14.5724)
12 tensor(16.7936)
13 tensor(16.3687)
14 tensor(17.6835)
15 tensor(13.8506)
16 tensor(19.1857)
17 tensor(16.6285)
18 tensor(15.0798)
19 tensor(18.5472)
20 tensor(15.9413)
21 tensor(15.4341)
22 tensor(15.2496)
23 tensor(16.1558)
24 tensor(14.4413)
25 tensor(12.2623)
26 tensor(18.1428)
27 tensor(13.3566)
28 tensor(17.5096)
29 tensor(17.8404)
30 tensor(15.2039)
31 tensor(15.7762)
32 tensor(14.3392)
33 tensor(12.4270)
34 tensor(14.3555)
35 tensor(13.6888)
36 tensor(14.4935)
37 tensor(15.1042)
38 tensor(13.3682)
39 tensor(14.0967)
40 tensor(15.8743)
41 tensor(11.7262)
42 tensor(14.9002)
43 tensor(16.3596)
44 tensor(12.4668)
45 tensor(11.7021)
46 tensor(14.5181)
47 tensor(12.7879)
48 tensor(14.1457)
49 tensor(12.3710)
50 tensor(13.9647)
51 tensor(15.3177)
52 tensor(13.2340)
53 

In [None]:
def build_dataset(words, block_size):  
  X, Y = [], []
  for w in words:

    #print(w)
    context = [0] * block_size
    for ch in w + '.':
      ix = stoi[ch]
      X.append(context)
      Y.append(ix)
      #print(''.join(itos[i] for i in context), '--->', itos[ix])
      context = context[1:] + [ix] # crop and append

  X = torch.tensor(X)
  Y = torch.tensor(Y)
  # print(X.shape, Y.shape)
  return X, Y

def build_loos(C, X, Y, W1, b1, W2, b2, print_label=""):
    emb = C[X]
    h = torch.tanh(emb.view(emb.shape[0], -1) @ W1 + b1)
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, Y)
    print(print_label, loss.data)
    return emb, h, logits, loss

# Made it clean
block_size = 3  
batch_size = 1000
emd_dim = 10
hidden_dim = 300
learning_rate = 1e-1
learning_steps = 30000

# splits of training, dev and test
# 80%, 10%, 10%
import random
random.seed(42)
random.shuffle(words)
n1 = int(0.8*len(words))
n2 = int(0.9*len(words))

Xtr, Ytr = build_dataset(words[:n1], block_size=block_size)
Xdev, Ydev = build_dataset(words[n1:n2], block_size=block_size)
Xte, Yte = build_dataset(words[n2:], block_size=block_size)

g = torch.Generator().manual_seed(2147483647)
C = torch.randn((27, emd_dim), generator=g)
W1 = torch.randn((block_size * emd_dim, hidden_dim), generator=g)
b1 = torch.randn(hidden_dim, generator=g)
W2 = torch.randn((hidden_dim, 27), generator=g)
b2 = torch.randn(27, generator=g)

parameters = [C, W1, b1, W2, b2]
for p in parameters:
    p.requires_grad = True

for i in range(learning_steps):

    # minibatch
    ix = torch.randint(0, Xtr.shape[0], (batch_size,))

    # forward pass
    _, _, _, loss = build_loos(C, Xtr[ix], Ytr[ix], W1, b1, W2, b2, print_label=f"step {i}")

    # backward pass
    for p in parameters:
        p.grad = None
    loss.backward()

    # update
    for p in parameters:
        p.data += -learning_rate * p.grad

# last forward pass
emb, h, logits, loss = build_loos(C, Xdev, Ydev, W1, b1, W2, b2, print_label="dev")


step 0 tensor(31.3937)
step 1 tensor(29.0813)
step 2 tensor(27.3901)
step 3 tensor(25.9449)
step 4 tensor(25.0752)
step 5 tensor(22.9265)
step 6 tensor(23.6179)
step 7 tensor(22.5498)
step 8 tensor(21.5585)
step 9 tensor(20.4806)
step 10 tensor(19.9518)
step 11 tensor(20.7376)
step 12 tensor(18.6339)
step 13 tensor(18.0924)
step 14 tensor(17.9407)
step 15 tensor(16.7314)
step 16 tensor(16.5513)
step 17 tensor(16.5904)
step 18 tensor(16.0858)
step 19 tensor(15.4034)
step 20 tensor(15.0945)
step 21 tensor(16.3174)
step 22 tensor(15.5379)
step 23 tensor(14.6358)
step 24 tensor(14.6863)
step 25 tensor(14.2877)
step 26 tensor(14.8119)
step 27 tensor(14.2823)
step 28 tensor(13.9353)
step 29 tensor(13.6859)
step 30 tensor(13.3537)
step 31 tensor(13.9935)
step 32 tensor(13.8197)
step 33 tensor(12.0490)
step 34 tensor(13.1498)
step 35 tensor(12.0812)
step 36 tensor(12.0853)
step 37 tensor(11.8709)
step 38 tensor(11.8508)
step 39 tensor(11.3863)
step 40 tensor(11.9641)
step 41 tensor(11.3630)
st

In [130]:
print(Xdev.shape)
print(emb.shape)
print(emb.view(emb.shape[0], -1).shape)
print(W1.shape)
print(W2.shape)
print(logits.shape)
print(probs.shape)

torch.Size([22711, 3])
torch.Size([22711, 3, 10])
torch.Size([22711, 30])
torch.Size([30, 300])
torch.Size([300, 27])
torch.Size([22711, 27])
torch.Size([22711, 27])


In [136]:
# Sampling from neural network model
g = torch.Generator().manual_seed(2147483647)

for i in range(10):
    context = [0] * block_size
    out = []
    while True:
        
        # Get current context
        emb = C[torch.tensor([context])] # (1,block_size,d)
        h = torch.tanh(emb.view(1, -1) @ W1 + b1)
        logits = h @ W2 + b2
        probs = F.softmax(logits, dim=1)

        # Sample
        ix = torch.multinomial(probs, num_samples=1, replacement=True, generator=g).item()
        out.append(itos[ix])
        if ix == 0:
            break

        # Update context
        context = context[1:] + [ix]

    print("".join(out))

cer.
mariah.
makila.
kayde.
malima.
tain.
lucan.
katha.
samiyau.
javhrigotam.
