In [3]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt # for making figures
%matplotlib inline

In [4]:
# Read every name (one per line) from names.txt. The `with` block closes the file
# handle as soon as the contents are read instead of leaving it open.
with open('names.txt', 'r') as f:
  words = f.read().splitlines()
words[:8]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [5]:
# total number of names read from names.txt
len(words)

32033

In [6]:
# Character vocabulary: every distinct character in the names gets an index starting
# at 1 (in sorted order); index 0 is reserved for the '.' marker. itos is the inverse map.
chars = sorted(set(''.join(words)))
stoi = dict(zip(chars, range(1, len(chars) + 1)))
stoi['.'] = 0
itos = {ix: ch for ch, ix in stoi.items()}
print(itos)

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}


In [7]:
# build the dataset

block_size = 3 # context length: how many characters do we take to predict the next one?
X, Y = [], []
for w in words[:5]:

  print(w)
  # indices of the word followed by its '.' terminator, left-padded with block_size zeros
  padded = [0] * block_size + [stoi[ch] for ch in w + '.']
  for start in range(len(w) + 1):
    window = padded[start:start + block_size] # the block_size indices preceding the target
    target = padded[start + block_size]       # index of the character to predict
    X.append(window)
    Y.append(target)
    print(''.join(itos[i] for i in window), '--->', itos[target])

X = torch.tensor(X)
Y = torch.tensor(Y)

emma
... ---> e
..e ---> m
.em ---> m
emm ---> a
mma ---> .
olivia
... ---> o
..o ---> l
.ol ---> i
oli ---> v
liv ---> i
ivi ---> a
via ---> .
ava
... ---> a
..a ---> v
.av ---> a
ava ---> .
isabella
... ---> i
..i ---> s
.is ---> a
isa ---> b
sab ---> e
abe ---> l
bel ---> l
ell ---> a
lla ---> .
sophia
... ---> s
..s ---> o
.so ---> p
sop ---> h
oph ---> i
phi ---> a
hia ---> .


In [8]:
# X: one row of block_size context indices per example; Y: the index of the character that follows each context
X.shape, Y.shape

(torch.Size([32, 3]), torch.Size([32]))

In [9]:
# C is the embedding lookup table: 27 rows (one per symbol index), each row a
# 2-dimensional embedding vector drawn from a standard normal distribution
C = torch.randn((27, 2))
C.dtype

torch.float32

In [10]:
# row 5 of C: the embedding of the symbol with index 5 ('e' in itos)
C[5]

tensor([-0.7560,  0.6801])

In [11]:
# same vector as C[5]: a one-hot row vector times C selects a single row of C.
# one_hot returns an integer tensor, so it is cast to float to match C before the matmul
F.one_hot(torch.tensor(5), num_classes=27).float() @ C

tensor([-0.7560,  0.6801])

In [12]:
# embed the whole (32, 3) input at once: indexing C with the integer tensor X looks up
# one row of C per entry, which appends a trailing dimension of size 2
C[X].shape

torch.Size([32, 3, 2])

In [13]:
# row 1 of C: the embedding of the symbol with index 1 ('a' in itos)
C[1]

tensor([ 0.2064, -1.7771])

In [14]:
# symbol index stored at the last context position of example 13
X[13, 2]

tensor(1)

In [15]:
# the embedded input at [13, 2] is the row of C selected by X[13, 2]
C[X][13, 2]

tensor([ 0.2064, -1.7771])

In [16]:
# Embedding: emb[i, j] is the 2-dimensional vector for the j-th context character of example i
emb = C[X]
emb.shape

torch.Size([32, 3, 2])

In [17]:
# each context character has a 2-dimensional embedding and there are 3 of them, so the input
# to w1 is 6-dimensional; the hidden layer width of 100 neurons is an arbitrary choice
w1 = torch.randn((6, 100))
b1 = torch.randn((100, ))

# emb: [32, 3, 2], w1: [6, 100]
# emb has to become [32, 6] before it can be multiplied by w1; one way is to
# concatenate its three [32, 2] per-position slices
emb[:, 0, :].shape

torch.Size([32, 2])

In [18]:
# concatenate the three per-position slices along dim 1: one 6-dimensional row per example
torch.cat([emb[:, 0, :], emb[:, 1, :], emb[:, 2, :]], 1).shape

torch.Size([32, 6])

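Concatenating hand-indexed slices does not scale: the list of slices has to be rewritten whenever the context length (`block_size`) changes. `torch.unbind(emb, 1)` returns the slices along dimension 1 as a tuple, so the same `torch.cat` call works for any context length.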

In [19]:
# torch.unbind(emb, 1) yields the slices along dim 1 as a tuple, so no position is indexed by hand
torch.cat(torch.unbind(emb, 1), 1).shape

torch.Size([32, 6])

In [20]:
# view reinterprets the (32, 3, 2) tensor as (32, 6) without copying the data;
# each row holds the three 2-dimensional embeddings of one example side by side
emb.view(32, 6)

tensor([[-1.5031,  0.3500, -1.5031,  0.3500, -1.5031,  0.3500],
        [-1.5031,  0.3500, -1.5031,  0.3500, -0.7560,  0.6801],
        [-1.5031,  0.3500, -0.7560,  0.6801, -0.9531,  0.4204],
        [-0.7560,  0.6801, -0.9531,  0.4204, -0.9531,  0.4204],
        [-0.9531,  0.4204, -0.9531,  0.4204,  0.2064, -1.7771],
        [-1.5031,  0.3500, -1.5031,  0.3500, -1.5031,  0.3500],
        [-1.5031,  0.3500, -1.5031,  0.3500, -0.8420, -0.2216],
        [-1.5031,  0.3500, -0.8420, -0.2216, -0.8775, -0.5369],
        [-0.8420, -0.2216, -0.8775, -0.5369,  0.6727, -1.1821],
        [-0.8775, -0.5369,  0.6727, -1.1821, -2.0961,  0.2567],
        [ 0.6727, -1.1821, -2.0961,  0.2567,  0.6727, -1.1821],
        [-2.0961,  0.2567,  0.6727, -1.1821,  0.2064, -1.7771],
        [-1.5031,  0.3500, -1.5031,  0.3500, -1.5031,  0.3500],
        [-1.5031,  0.3500, -1.5031,  0.3500,  0.2064, -1.7771],
        [-1.5031,  0.3500,  0.2064, -1.7771, -2.0961,  0.2567],
        [ 0.2064, -1.7771, -2.0961,  0.2

In [21]:
# number of examples; can replace the hard-coded 32 in the view call
emb.shape[0]

32

In [22]:
# hidden layer: tanh of the affine map of the flattened embeddings; b1 is broadcast across the rows
h = torch.tanh(emb.view(-1, 6) @ w1 + b1) # -1 makes view infer that dimension; here it resolves to emb.shape[0]

In [23]:
# hidden activations, one row of 100 values per example; tanh keeps every entry within [-1, 1]
h

tensor([[-0.9999,  0.9648,  0.7827,  ...,  0.9994,  0.9998,  0.9891],
        [-0.9997,  0.9779,  0.4883,  ...,  0.9990,  0.9999,  0.9842],
        [-0.9992,  0.5573,  0.8894,  ...,  0.9997,  0.9998,  0.9941],
        ...,
        [-0.9959,  0.8495, -0.3119,  ...,  0.9974,  0.9965,  0.3274],
        [-0.9973, -0.9518, -0.2297,  ...,  0.9899,  0.8125,  0.1902],
        [-0.9367, -0.9988, -0.8933,  ..., -0.9684, -0.9785, -0.9010]])

In [24]:
# output layer: maps the 100 hidden activations to 27 logits, one per symbol
w2 = torch.randn(100, 27)
b2 = torch.randn((27,))

logits = torch.matmul(h, w2) + b2
logits.size()

torch.Size([32, 27])

In [25]:
# softmax by hand: exponentiate the logits into positive "counts", then normalise each row
counts = torch.exp(logits)
row_totals = counts.sum(dim=1, keepdim=True)
prob = counts / row_totals
prob[0].sum() # sanity check: a row of probabilities sums to 1

tensor(1.)

In [26]:
# probability the model assigns to the correct next character of every example;
# the row index is derived from Y so the cell keeps working when the dataset size changes
prob[torch.arange(Y.shape[0]), Y]

tensor([7.9524e-08, 1.6542e-09, 1.8354e-08, 6.7632e-05, 4.8770e-12, 1.9873e-08,
        1.3499e-11, 8.2946e-09, 1.5892e-07, 1.7344e-07, 3.5211e-06, 7.6723e-11,
        1.1308e-04, 1.3741e-06, 5.0086e-01, 4.3359e-11, 1.0816e-06, 2.4611e-09,
        9.9195e-01, 8.9437e-06, 4.8039e-08, 1.4496e-05, 6.2966e-12, 9.9932e-01,
        6.1804e-11, 6.1928e-08, 2.2616e-07, 6.4309e-07, 1.8849e-13, 5.0059e-08,
        5.5715e-01, 2.6736e-10])

In [27]:
# average negative log-likelihood of the correct characters; the row index is derived
# from Y rather than hard-coded so it follows the number of examples
loss = -prob[torch.arange(Y.shape[0]), Y].log().mean()
loss

tensor(15.7182)

In [28]:
# F.cross_entropy computes the same loss directly from the logits: it is more efficient than
# the hand-written version because the intermediate counts/prob tensors are not created in
# memory, and it is more numerically stable than exponentiating the logits by hand
l = F.cross_entropy(logits, Y)
l

tensor(15.7182)

In [30]:
# train, test and validation split up
def build_dataset(words, context_len=None, char_to_ix=None):
  """Turn a list of words into (context, next-character) training pairs.

  Each word is terminated with '.' and scanned left to right. For every
  character, the indices of the preceding characters (left-padded with
  index 0) form one input row and the character's own index is the target.

  Args:
    words: iterable of strings whose characters are all keys of the mapping.
    context_len: number of context indices per example. Defaults to the
      notebook-level `block_size`.
    char_to_ix: mapping from character to integer index; must contain '.'.
      Defaults to the notebook-level `stoi`.

  Returns:
    (X, Y): X is an int64 tensor of shape (num_examples, context_len) and
    Y is an int64 tensor of shape (num_examples,).

  Raises:
    ValueError: if the context length is smaller than 1.
    KeyError: if a character is missing from the mapping.
  """
  # fall back to the notebook globals only when no explicit override is given
  if context_len is None:
    context_len = block_size
  if char_to_ix is None:
    char_to_ix = stoi
  if context_len < 1:
    raise ValueError(f'context length must be >= 1, got {context_len}')

  X, Y = [], []

  for w in words:
    context = [0] * context_len
    for ch in w + '.':
      ix = char_to_ix[ch]
      X.append(context)
      Y.append(ix)
      context = context[1:] + [ix] # crop and append (builds a new list, so stored rows are never aliased)

  # explicit dtype and shape so that an empty word list still yields a (0, context_len) tensor
  num_examples = len(X)
  X = torch.tensor(X, dtype=torch.long).view(num_examples, context_len)
  Y = torch.tensor(Y, dtype=torch.long)
  print(X.shape, Y.shape)
  return X, Y

import random

# Shuffle a copy so that `words` itself is left untouched: shuffling it in place made this
# cell non-idempotent (re-running it reshuffled the already-shuffled list and produced
# different splits). With the same seed, the first run yields the same permutation as an
# in-place shuffle would.
random.seed(42)
words_shuffled = list(words)
random.shuffle(words_shuffled)
n1 = int(0.8*len(words_shuffled))
n2 = int(0.9*len(words_shuffled))

Xtr,  Ytr  = build_dataset(words_shuffled[:n1])     # 80%
Xdev, Ydev = build_dataset(words_shuffled[n1:n2])   # 10%
Xte,  Yte  = build_dataset(words_shuffled[n2:])     # 10%

torch.Size([182580, 3]) torch.Size([182580])
torch.Size([22767, 3]) torch.Size([22767])
torch.Size([22799, 3]) torch.Size([22799])


In [32]:
# Toy 4-class example for reasoning about why the starting loss is so high: the loss is
# the negative log-probability given to the correct class (class 2 here), so it is small
# when the logits favour that class.
logits = torch.tensor([0.0, 0.0, 5.0, 0.0])
probs = logits.softmax(dim=0)
loss = -torch.log(probs[2])

probs, loss

(tensor([0.0066, 0.0066, 0.9802, 0.0066]), tensor(0.0200))

In [33]:
vocab_size = len(itos)

# MLP revisited
n_embd = 10 # the dimensionality of the character embedding vectors
n_hidden = 200 # the number of neurons in the hidden layer of the MLP

# All parameters are drawn from one seeded generator, in this fixed order, for reproducibility
g = torch.Generator().manual_seed(2147483647)
C  = torch.randn(vocab_size, n_embd, generator=g)              # embedding table
W1 = torch.randn(n_embd * block_size, n_hidden, generator=g)   # hidden layer weights
b1 = torch.randn((n_hidden,), generator=g)                     # hidden layer bias
W2 = torch.randn(n_hidden, vocab_size, generator=g)            # output layer weights
b2 = torch.randn((vocab_size,), generator=g)                   # output layer bias

parameters = [C, W1, b1, W2, b2]
print(sum(t.numel() for t in parameters)) # total number of scalar parameters
for p in parameters:
  p.requires_grad_(True)

11897


In [34]:
# same optimization as last time
max_steps = 200000
batch_size = 32
lossi = [] # log10 of the minibatch loss, one entry per step

for i in range(max_steps):

  # minibatch construct: sample batch_size row indices from the training split
  ix = torch.randint(0, Xtr.shape[0], (batch_size,), generator=g)
  Xb, Yb = Xtr[ix], Ytr[ix] # batch X,Y

  # forward pass
  emb = C[Xb] # embed the characters into vectors
  embcat = emb.view(emb.shape[0], -1) # concatenate the vectors
  # Linear layer
  hpreact = embcat @ W1 + b1 # hidden layer pre-activation
  # Non-linearity
  h = torch.tanh(hpreact) # hidden layer
  logits = h @ W2 + b2 # output layer
  loss = F.cross_entropy(logits, Yb) # loss function

  # backward pass: clear stale gradients, then backpropagate the current loss
  for p in parameters:
    p.grad = None
  loss.backward()

  # update: plain SGD step, done in place under no_grad so autograd does not record it
  # (replaces the legacy `.data` access; the arithmetic is unchanged)
  lr = 0.1 if i < 100000 else 0.01 # step learning rate decay
  with torch.no_grad():
    for p in parameters:
      p -= lr * p.grad

  # track stats
  if i % 10000 == 0: # print every once in a while
    print(f'{i:7d}/{max_steps:7d}: {loss.item():.4f}')
  lossi.append(loss.log10().item())


      0/ 200000: 26.5404
  10000/ 200000: 2.8297
  20000/ 200000: 2.5702
  30000/ 200000: 2.2293
  40000/ 200000: 2.3720
  50000/ 200000: 2.7074
  60000/ 200000: 2.3786
  70000/ 200000: 2.3498
  80000/ 200000: 2.1334
  90000/ 200000: 2.0449
 100000/ 200000: 2.3394
 110000/ 200000: 2.3778
 120000/ 200000: 2.2828
 130000/ 200000: 2.4198
 140000/ 200000: 2.2789
 150000/ 200000: 2.2704
 160000/ 200000: 2.1293
 170000/ 200000: 2.1585
 180000/ 200000: 2.4126
 190000/ 200000: 2.0491
