In [82]:
import torch
import matplotlib.pyplot as plt
import torch.nn.functional as F
%matplotlib inline

In [83]:
words = open('names.txt', 'r').read().splitlines()
print(f"first 10 words{words[:10]}")
print(f"length of words: {len(words)}")
print(f"Min word length: {min(len(w) for w in words)}")
print(f"Max word length: {max(len(w) for w in words)}")

first 10 words['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia', 'harper', 'evelyn']
length of words: 32033
Min word length: 2
Max word length: 15


E01: Train a trigram language model

In [84]:
# Make a list of chars
chars = sorted(list(set(''.join(words))))
chars = ['.'] + chars

# Dictionnary char to index
stoi = {s:i for i,s in enumerate(chars)}
# Dictionnary index to char
itos = {i:s for s,i in stoi.items()}

1. Counting

In [85]:
N = torch.ones(27, 27, 27, dtype=torch.int32) # Ones for smoothing
N[0,0,0] = 0

for w in words:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        ix3 = stoi[ch3]
        N[ix1, ix2, ix3] += 1

P = N.float()
P /= P.sum(dim=2, keepdim=True)

In [86]:
def myloss(words):
  log_likelihood = 0.0
  n = 0

  for w in words:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
      ix1 = stoi[ch1]
      ix2 = stoi[ch2]
      ix3 = stoi[ch3]

      prob = P[ix1, ix2, ix3]
      logprob = torch.log(prob)
      log_likelihood += logprob
      n += 1

  print(f'{log_likelihood=}')
  nll = -log_likelihood
  print(f'{nll=}')
  print(f'{nll/n=}')

myloss(words)

log_likelihood=tensor(-410414.9688)
nll=tensor(410414.9688)
nll/n=tensor(2.0927)


In [87]:
# Sampling names
names = []
for i in range(10):
  out = []
  ix1, ix2 = 0, 0
  while True:
    p = P[ix1, ix2]
    ix1 = ix2
    ix2 = torch.multinomial(p, num_samples=1, replacement=True).item()
    if ix2 == 0:
      break
    out.append(itos[ix2])
  names.append("".join(out))

print(names)
myloss(names)


['en', 'kelyfxmyasio', 'ni', 'ulacqrniseni', 'dachah', 'grakaidayantriasel', 'teonn', 'mwrne', 'ma', 'eyarya']
log_likelihood=tensor(-171.5460)
nll=tensor(171.5460)
nll/n=tensor(2.4507)


2. Neural Net

In [88]:
# Training set
xs, ys = [], []

for w in words:
  chs = ['.'] + list(w) + ['.']
  for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
    ix1 = stoi[ch1]
    ix2 = stoi[ch2]
    ix3 = stoi[ch3]

    xs.append([ix1, ix2])
    ys.append(ix3)

xs = torch.tensor(xs)
ys = torch.tensor(ys)

In [89]:
W = torch.randn((27*2, 27), requires_grad=True)

for i in range(300):
  # Forward pass
  xenc = F.one_hot(xs, num_classes=27).float()
  xenc = xenc.view(-1, 27*2) # turning each pair into a single flat input vector
  # Probs is softmax
  logits = xenc @ W # predict log-counts
  counts = logits.exp() # counts, equivalent to N
  probs = counts / counts.sum(dim=1, keepdim=True)
  # Loss (normalized nll)
  loss= -probs[torch.arange(len(xs)), ys].log().mean() + 0.15*(W**2).mean() # with regularization
  if i % 20 == 0: print(f"{i}: {loss.item():.4f}")
  # Backward pass
  W.grad = None
  loss.backward()
  # Update
  W.data += -50 * W.grad


0: 4.5354
20: 2.4490
40: 2.3816
60: 2.3606
80: 2.3519
100: 2.3478
120: 2.3457
140: 2.3446
160: 2.3440
180: 2.3436
200: 2.3434
220: 2.3433
240: 2.3432
260: 2.3431
280: 2.3431


In [90]:
# Sampling names
names = []
for i in range(10):
  out = []
  ix1, ix2 = 0, 0
  while True:
    xenc = F.one_hot(torch.tensor([ix1, ix2]), num_classes=27).float()
    xenc = xenc.view(-1, 27*2)
    logits = xenc @ W
    counts = logits.exp()
    p = counts / counts.sum(dim=1, keepdim=True)
    ix1 = ix2
    ix2 = torch.multinomial(p, num_samples=1, replacement=True).item()
    out.append(itos[ix2])
    if ix2 == 0:
      break
  names.append("".join(out))

print(names)
myloss(names)

['ra.', 'aitrettie.', 'zera.', 'xiven.', 'hzie.', 'ellen.', 'ya.', 'dylah.', 'masadacrad.', 'ia.']
log_likelihood=tensor(-142.7957)
nll=tensor(142.7957)
nll/n=tensor(2.4620)


E02: split up the dataset randomly into 80% train set, 10% dev set, 10% test set.

In [91]:
from sklearn.model_selection import train_test_split

words_train, words_test = train_test_split(words, test_size=0.2, random_state=42)
words_dev, words_test = train_test_split(words_test, test_size=0.5, random_state=42)

x_train, y_train, x_dev, y_dev, x_test, y_test = [], [], [], [], [], []
for wgroup in [words_train, words_dev, words_test]:
  xs, ys = [], []
  for w in wgroup:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
      ix1 = stoi[ch1]
      ix2 = stoi[ch2]
      ix3 = stoi[ch3]

      xs.append([ix1, ix2])
      ys.append(ix3)

  xs = torch.tensor(xs)
  ys = torch.tensor(ys)

  if wgroup == words_train:
    x_train, y_train = xs, ys
  elif wgroup == words_dev:
    x_dev, y_dev = xs, ys
  else:
    x_test, y_test = xs, ys


In [92]:
W = torch.randn((27*2, 27), requires_grad=True)

for i in range(300):
  # Forward pass
  xenc = F.one_hot(x_train, num_classes=27).float()
  xenc = xenc.view(-1, 27*2) # turning each pair into a single flat input vector
  # Probs is softmax
  logits = xenc @ W # predict log-counts
  counts = logits.exp() # counts, equivalent to N
  probs = counts / counts.sum(dim=1, keepdim=True)
  # Loss (normalized nll)
  loss= -probs[torch.arange(len(x_train)), y_train].log().mean() + 0.15*(W**2).mean() # with regularization
  if i % 20 == 0: print(f"{i}: {loss.item():.4f}")
  # Backward pass
  W.grad = None
  loss.backward()
  # Update
  W.data += -50 * W.grad

0: 4.4906
20: 2.4510
40: 2.3829
60: 2.3616
80: 2.3523
100: 2.3477
120: 2.3453
140: 2.3440
160: 2.3433
180: 2.3429
200: 2.3426
220: 2.3425
240: 2.3424
260: 2.3423
280: 2.3423


In [93]:
def NN_loss(x, y, W):
  xenc = F.one_hot(x, num_classes=27).float()
  xenc = xenc.view(-1, 27*2)
  # Probs is softmax
  logits = xenc @ W
  counts = logits.exp()
  p = counts / counts.sum(dim=1, keepdim=True)
  # Loss = normalized(-nll)
  loss= -p[torch.arange(len(x)), y].log().mean()
  return loss


In [94]:
print(f"Train Loss: {NN_loss(x_train, y_train, W):.4f}")
print(f"Dev Loss: {NN_loss(x_dev, y_dev, W):.4f}")
print(f"Test Loss: {NN_loss(x_test, y_test, W):.4f}")

Train Loss: 2.2755
Dev Loss: 2.2743
Test Loss: 2.2891


E03: Use the dev set to tune the strength of smoothing (or regularization) for the trigram model

In [95]:
W = torch.randn((27*2, 27), generator=g, requires_grad=True)

for i in range(300):
  # Forward pass
  xenc = F.one_hot(x_train, num_classes=27).float()
  xenc = xenc.view(-1, 27*2) # turning each pair into a single flat input vector
  # Probs is softmax
  logits = xenc @ W # predict log-counts
  counts = logits.exp() # counts, equivalent to N
  probs = counts / counts.sum(dim=1, keepdim=True)
  # Loss (normalized nll)
  loss= -probs[torch.arange(len(x_train)), y_train].log().mean()
  #loss+= 0.15*(W**2).mean() # with regularization
  if i % 20 == 0: print(f"{i}: Train Loss: {loss.item():.4f} | Dev Loss {NN_loss(x_dev, y_dev, W):.4f}")
  # Backward pass
  W.grad = None
  loss.backward()
  # Update
  W.data += -50 * W.grad

0: Train Loss: 4.2523 | Dev Loss 4.2462
20: Train Loss: 2.3865 | Dev Loss 2.3819
40: Train Loss: 2.3079 | Dev Loss 2.3053
60: Train Loss: 2.2824 | Dev Loss 2.2807
80: Train Loss: 2.2699 | Dev Loss 2.2686
100: Train Loss: 2.2624 | Dev Loss 2.2614
120: Train Loss: 2.2575 | Dev Loss 2.2567
140: Train Loss: 2.2540 | Dev Loss 2.2533
160: Train Loss: 2.2513 | Dev Loss 2.2509
180: Train Loss: 2.2493 | Dev Loss 2.2490
200: Train Loss: 2.2476 | Dev Loss 2.2475
220: Train Loss: 2.2463 | Dev Loss 2.2463
240: Train Loss: 2.2452 | Dev Loss 2.2453
260: Train Loss: 2.2442 | Dev Loss 2.2445
280: Train Loss: 2.2434 | Dev Loss 2.2438


Best results with no regularization.

E04: Delete our use of F.one_hot in favor of simply indexing into rows of W?

In [96]:
W = torch.randn((27*2, 27), generator=g, requires_grad=True)

for i in range(300):
  # Forward pass
  # Previous: with one-hot encoding
  #xenc = F.one_hot(x_train, num_classes=27).float()
  #xenc = xenc.view(-1, 27*2) # turning each pair into a single flat input vector
  # Probs is softmax
  #logits = xenc @ W # predict log-counts
  # Now:
  # W is still a weight matrix of shape [54, 27] (2 * 27 inputs → 27 outputs)
  # W[xs[:,0]]: selects the corresponding row from the first half of W (rows 0 to 26)
  # W[xs[:,1] + 27]: adds 27 to the index, so it selects the second half of W (rows 27 to 53)
  logits = W[xs[:,0]] + W[xs[:,1] + 27]
  counts = logits.exp() # counts, equivalent to N
  probs = counts / counts.sum(dim=1, keepdim=True)
  # Loss (normalized nll)
  loss= -probs[torch.arange(len(xs)), ys].log().mean()
  #loss+= 0.15*(W**2).mean() # with regularization
  if i % 20 == 0: print(f"{i}: Train Loss: {loss.item():.4f} | Dev Loss {NN_loss(x_dev, y_dev, W):.4f}")
  # Backward pass
  W.grad = None
  loss.backward()
  # Update
  W.data += -50 * W.grad

0: Train Loss: 4.2571 | Dev Loss 4.2720
20: Train Loss: 2.3702 | Dev Loss 2.3782
40: Train Loss: 2.2985 | Dev Loss 2.3149
60: Train Loss: 2.2727 | Dev Loss 2.2935
80: Train Loss: 2.2596 | Dev Loss 2.2831
100: Train Loss: 2.2516 | Dev Loss 2.2771
120: Train Loss: 2.2462 | Dev Loss 2.2733
140: Train Loss: 2.2423 | Dev Loss 2.2707
160: Train Loss: 2.2394 | Dev Loss 2.2689
180: Train Loss: 2.2371 | Dev Loss 2.2676
200: Train Loss: 2.2353 | Dev Loss 2.2666
220: Train Loss: 2.2338 | Dev Loss 2.2658
240: Train Loss: 2.2326 | Dev Loss 2.2652
260: Train Loss: 2.2315 | Dev Loss 2.2647
280: Train Loss: 2.2306 | Dev Loss 2.2644


E05: Use F.cross_entropy instead.

Compute the cross entropy loss between input logits and target. Better: highly stable, built-in, clean, highly optimized

In [97]:
W = torch.randn((27*2, 27), generator=g, requires_grad=True)

for i in range(300):
  # Forward pass
  xenc = F.one_hot(xs, num_classes=27).float()
  xenc = xenc.view(-1, 27*2) # turning each pair into a single flat input vector
  # Probs is softmax
  logits = xenc @ W # predict log-counts
  counts = logits.exp() # counts, equivalent to N
  probs = counts / counts.sum(dim=1, keepdim=True)
  # Loss (normalized nll)
  loss= torch.nn.functional.cross_entropy(logits, ys)
  loss+= 0.15*(W**2).mean() # with regularization
  if i % 20 == 0: print(f"{i}: Train Loss: {loss.item():.4f}")
  # Backward pass
  W.grad = None
  loss.backward()
  # Update
  W.data += -50 * W.grad

0: Train Loss: 4.2272
20: Train Loss: 2.4433
40: Train Loss: 2.3764
60: Train Loss: 2.3556
80: Train Loss: 2.3468
100: Train Loss: 2.3424
120: Train Loss: 2.3400
140: Train Loss: 2.3388
160: Train Loss: 2.3380
180: Train Loss: 2.3376
200: Train Loss: 2.3373
220: Train Loss: 2.3372
240: Train Loss: 2.3371
260: Train Loss: 2.3370
280: Train Loss: 2.3370
