In [40]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [41]:
# Load the training corpus: one entry per line of names.txt.
# The context manager closes the file handle as soon as the read finishes,
# instead of leaving it open until the garbage collector gets to it.
with open('names.txt', 'r') as f:
    words = f.read().splitlines()

In [42]:
# Character vocabulary plus lookup tables in both directions.
# Real characters are numbered from 1 in sorted order; index 0 is reserved
# for the '.' marker.
chars = sorted(set(''.join(words)))
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
stoi['.'] = 0
itos = {idx: ch for ch, idx in stoi.items()}

In [43]:
block_size = 3  # context length: number of preceding characters used to predict the next one
X, Y = [], []   # X collects the input contexts, Y the label (next character) for each context

for w in words:
    # Left-pad with block_size zeros (the '.' index) so the first character of
    # the word is predicted from an all-'.' context, then slide a window of
    # block_size indices across the sequence; the index right after the window
    # is that window's label.
    padded = [0] * block_size + [stoi[ch] for ch in w + '.']
    for start in range(len(padded) - block_size):
        X.append(padded[start:start + block_size])
        Y.append(padded[start + block_size])

# X becomes an integer tensor of shape (num examples, block_size);
# Y becomes an integer tensor of shape (num examples,).
X = torch.tensor(X)
Y = torch.tensor(Y)

In [44]:
# Embedding lookup table. The reference paper places 17,000 words in a
# 30-dimensional space; with only 27 characters in our alphabet, let's start
# with a 2-dimensional space: one row per character, two columns per row.
C = torch.randn(27, 2)
C

tensor([[ 0.3369, -1.9743],
        [ 0.1685,  0.7811],
        [ 1.4413, -0.3017],
        [ 0.7968,  0.8246],
        [-0.1297, -0.4782],
        [-0.8366,  0.3041],
        [ 0.8747,  1.4674],
        [-0.1641, -2.1239],
        [-1.3326,  0.3777],
        [-0.6232,  0.1958],
        [ 1.2664, -0.0695],
        [-0.5747, -0.5108],
        [ 1.1087,  0.4455],
        [ 0.2797, -2.3744],
        [-0.0053,  2.1753],
        [ 0.0453,  1.8127],
        [ 0.3229,  0.5464],
        [-2.2139, -0.0745],
        [-1.3804,  1.5063],
        [ 0.0250, -0.4150],
        [ 0.0396,  0.6327],
        [ 0.5478,  0.2683],
        [ 0.9536, -0.2736],
        [ 1.5937,  0.6173],
        [-0.5793, -0.6395],
        [ 0.1158, -1.4203],
        [ 1.0105,  1.4419]])

In [45]:
# Display the input tensor: each row is one context of block_size character indices.
X

tensor([[ 0,  0,  0],
        [ 0,  0,  5],
        [ 0,  5, 13],
        ...,
        [26, 26, 25],
        [26, 25, 26],
        [25, 26, 24]])

In [46]:
# Look up the embedding row of every character index in X at once.
# Indexing C with the integer tensor X yields X's shape with C's row width
# appended: (num examples, 3, 2) for block_size = 3 and the (27, 2) table above.
emb = C[X]

In [47]:
# Hidden layer parameters.
# Each example contributes 3 context characters x 2 embedding values = 6 inputs,
# which feed 100 hidden units (one bias per unit).
W1 = torch.randn(6, 100)
b1 = torch.randn((100,))

In [48]:
# Hidden layer activations.
# view(-1, 6) flattens each example's embeddings into a single row of 6 values;
# the -1 lets PyTorch infer the number of rows.
h = (emb.view(-1, 6) @ W1 + b1).tanh()

In [49]:
# Output layer parameters: 100 hidden units -> 27 character scores.
W2 = torch.randn((100, 27))
b2 = torch.randn(27)

# Outputs of the neural network
logits = h @ W2 + b2

# Softmax written out by hand. Subtracting each row's maximum before
# exponentiating leaves the resulting probabilities mathematically unchanged
# (the common factor cancels in the division) but keeps exp() from overflowing
# to inf -- and prob from becoming nan -- when a logit is large.
counts = (logits - logits.max(1, keepdim=True).values).exp()
prob = counts / counts.sum(1, keepdim=True)  # each row sums to 1

In [51]:
# Probabilities that the network, with its current weights, assigns to the
# correct next character of every example. The row index is derived from Y
# rather than a hard-coded example count, so this keeps working if the dataset
# or block_size changes.
prob[torch.arange(Y.shape[0]), Y]

tensor([9.6890e-10, 1.5445e-04, 3.0590e-10,  ..., 4.5482e-05, 4.9534e-06,
        7.6416e-15])

In [53]:
# Average negative log-likelihood of the correct characters, computed by hand.
# The row index comes from Y instead of a hard-coded example count.
loss = -prob[torch.arange(Y.shape[0]), Y].log().mean()
# Because this is so common, PyTorch has a built-in way of doing this directly
# from the logits:
loss = F.cross_entropy(logits, Y)
print(loss)

tensor(16.4376)


In [54]:
# Let's set up our neural net!
# All parameters are drawn, in a fixed order, from one seeded generator so
# the initial weights are reproducible.
g = torch.Generator()
g.manual_seed(2147483647)
C, W1, b1, W2, b2 = (
    torch.randn(shape, generator=g)
    for shape in [
        (27, 10),   # C:  one 10-value embedding row per character
        (30, 200),  # W1: 30 inputs -> 200 hidden units
        (200,),     # b1
        (200, 27),  # W2: 200 hidden units -> 27 character scores
        (27,),      # b2
    ]
)
parameters = [C, W1, b1, W2, b2]

In [56]:
# Train the network with minibatch stochastic gradient descent.
for p in parameters:
  p.requires_grad = True

max_steps = 200000  # total number of gradient steps
batch_size = 32     # examples per minibatch

for i in range(max_steps):
  # minibatch construct: draw row indices from the seeded generator so the
  # sequence of batches is reproducible
  ix = torch.randint(0, X.shape[0], (batch_size,), generator=g)

  # forward pass
  emb = C[X[ix]] # (batch_size, 3, 10)
  # flatten each example's embeddings into one row; the width is inferred
  # instead of being hard-coded
  h = torch.tanh(emb.view(emb.shape[0], -1) @ W1 + b1) # (batch_size, 200)
  logits = h @ W2 + b2 # (batch_size, 27)
  loss = F.cross_entropy(logits, Y[ix])

  # backward pass
  for p in parameters:
    p.grad = None
  loss.backward()

  # update: larger steps for the first 100000 iterations, then 10x smaller
  lr = 0.1 if i < 100000 else 0.01
  # update in place under no_grad so the step itself is not recorded by
  # autograd (preferred over writing through p.data)
  with torch.no_grad():
    for p in parameters:
      p -= lr * p.grad

# loss of the final minibatch only (a noisy estimate, not the full-dataset loss)
print(loss.item())

2.3620858192443848
