In [22]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [23]:
# reading the data
words = open("names.txt", "r").read().splitlines()

# Exploring
print(f"first 10 words{words[:10]}")
print(f"length of words: {len(words)}")
print(f"min word length {min(len(w) for (w) in words)} and max word length {max(len(w) for (w) in words)}")

first 10 words['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia', 'harper', 'evelyn']
length of words: 32033
min word length 2 and max word length 15


In [24]:
words[:10]

['emma',
 'olivia',
 'ava',
 'isabella',
 'sophia',
 'charlotte',
 'mia',
 'amelia',
 'harper',
 'evelyn']

## E01: Train a Trigram Language Model
 Train a trigram language model, i.e. take two characters as an input to predict the 3rd one. Feel free to use either counting or a neural net. Evaluate the loss; Did it improve over a bigram model?

In [25]:
# make a list of characters (a -> z)
chars = sorted(list(set("".join(words))))
chars = ["."] + chars

# # make a dictionary of character to index
stoi = {ch: i for (i, ch) in enumerate(chars)}

# # make a dictionary of index to character
itos = {i: ch for (ch, i) in stoi.items()}

### 1 - Using Counting

In [26]:
N = torch.ones((27,27,27), dtype=torch.int32)

In [27]:
# 27 27 27
# 27 27 1

In [None]:
N[0,0,0] = 0
for w in words:
  chs = ['.'] + list(w) + ['.']
  for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
    ix1 = stoi[ch1]
    ix2 = stoi[ch2]
    ix3 = stoi[ch3]
    N[ix1,ix2,ix3] += 1

P = N.float()
P /= P.sum(2, keepdim=True)

In [None]:
P[10,4].sum().item()

1.0

#### Loss Function

In [None]:
log_likelihood = 0.0
n = 0

for w in words:
  chs = ['.'] + list(w) + ['.']
  for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
    ix1 = stoi[ch1]
    ix2 = stoi[ch2]
    ix3 = stoi[ch3]
    prob = P[ix1, ix2, ix3]
    logprob = torch.log(prob)
    log_likelihood += logprob
    n += 1
    # print(f'{ch1}{ch2}{ch3}: {logprob:.4f}')

print(f'{log_likelihood=}')
nll = -log_likelihood
print(f'{nll=}')
print(f'{nll/n}')

log_likelihood=tensor(-410414.9688)
nll=tensor(410414.9688)
2.092747449874878


#### Sampling

In [None]:
g = torch.Generator().manual_seed((2147483647))

names = []
for i in range(5):
  out = []
  ix1, ix2 = 0, 0
  while True:
    p = P[ix1, ix2]
    ix1 = ix2
    ix2 = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item() # this tell us what index will be next
    out.append(itos[ix2])
    if ix2 == 0:
      break
  names.append("".join(out))

names

['junide.', 'ilyasid.', 'prelay.', 'ocin.', 'fairritoper.']

### 2 - Using MLP

In [None]:
# Prepare the dataset
xs, ys = [], []
num = 0

for w in words:
  chs = ['.'] + list(w) + ['.']
  for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
    ix1 = stoi[ch1]
    ix2 = stoi[ch2]
    ix3 = stoi[ch3]

    xs.append([ix1, ix2])
    ys.append(ix3)

    num += 1

xs = torch.tensor(xs, dtype=torch.int64)
ys = torch.tensor(ys, dtype=torch.int64)

# initialize the network
g = torch.Generator().manual_seed(2147483647)
W1 = torch.randn((27*2, 27), generator=g, requires_grad=True)

In [None]:
# gradient descent
rep = 100

for k in range(rep):

  # forward pass
  xenc = F.one_hot(xs, num_classes=27).float()
  xenc = xenc.view(-1, 27*2) # this transforms the matrix from [N, 2, 27] -> [N, 27*2] | (N, 27*2) @ (27*2, 27) -> (N,   27)

  # softmax
  logits = xenc @ W1 # log-counts
  counts = logits.exp() # equivalent N
  probs = counts / counts.sum(1, keepdims=True)
  # probs.shape = [4, 27]. Isso significa que, para cada "trigram" (que são 4 no total), ele output probabilidades

  # loss
  loss = -probs[torch.arange(num), ys].log().mean()

  if k % 10 == 0:
      print(f"{k}: {loss.item():.4f}")


  # backward pass
  W1.grad = None # set to zero the gradient
  loss.backward()


  # update
  W1.data += -50 * W1.grad

  if k == rep-1:
      print(f"{k}: {loss.item():.4f}")


0: 4.1863
10: 2.5042
20: 2.3797
30: 2.3327
40: 2.3081
50: 2.2932
60: 2.2833
70: 2.2762
80: 2.2709
90: 2.2668
99: 2.2637


In [None]:
nlls = torch.zeros(4)
for i in range(4):
    # i-th trigram:
    x1 = xs[i][0].item()  # 1st input character index
    x2 = xs[i][1].item()  # 2nd input character index

    y = ys[i].item()  # label character index
    print('--------')
    print(f'trigram example {i+1}: {itos[x1]}{itos[x2]}{itos[y]} (indexes {x1},{x2},{y})')
    print('input to the neural net:', x1, x2)
    print('label (actual next character):', y)
    print('output probabilities from the neural net:', probs[i])
    p = probs[i, y]
    print('probability assigned by the net to the correct character:', p.item())
    logp = torch.log(p)
    print('log likelihood:', logp.item())
    nll = -logp
    print('negative log likelihood:', nll.item())
    nlls[i] = nll

print('========')
print('average negative log likelihood, i.e. loss =', nlls.mean().item())


--------
trigram example 1: .em (indexes 0,5,13)
input to the neural net: 0 5
label (actual next character): 13
output probabilities from the neural net: tensor([2.2573e-03, 7.4474e-02, 6.3476e-03, 3.8791e-03, 3.1316e-02, 6.3107e-02,
        1.3237e-03, 1.7223e-03, 1.0195e-02, 3.6011e-02, 1.5039e-03, 4.0430e-03,
        2.4968e-01, 6.8560e-02, 6.9603e-02, 2.6674e-02, 1.2084e-03, 2.1775e-04,
        1.6794e-01, 3.8515e-02, 1.6396e-02, 1.3999e-02, 4.5653e-02, 1.8738e-03,
        1.3072e-03, 4.6026e-02, 1.6177e-02], grad_fn=<SelectBackward0>)
probability assigned by the net to the correct character: 0.0685596615076065
log likelihood: -2.680050849914551
negative log likelihood: 2.680050849914551
--------
trigram example 2: emm (indexes 5,13,13)
input to the neural net: 5 13
label (actual next character): 13
output probabilities from the neural net: tensor([0.2328, 0.2555, 0.0166, 0.0109, 0.0055, 0.0825, 0.0025, 0.0069, 0.0056,
        0.1645, 0.0046, 0.0059, 0.0066, 0.0475, 0.0023, 0.0478,

#### Sampling

In [None]:
# finally, sample from the 'neural net' model
g = torch.Generator().manual_seed(2147483647)

for i in range(5):
    out = []
    ix1, ix2 = 0, 0
    while True:
        # ---------------
        # # BEFORE:
        # # p = P[ix]
        # ---------------
        # NOW:
        xenc = F.one_hot(torch.tensor([ix1,ix2]), num_classes=27).float()
        xenc = xenc.view(-1, 27*2)
        logits = xenc @ W1  # predict logCounts
        counts = logits.exp()  # counts, equivalent to N
        p = counts / counts.sum(1, keepdims=True)  # probabilities for next character
        # ---------------

        ix1 = ix2
        ix2 = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
        out.append(itos[ix2])
        if ix2 == 0:
            break
    print(''.join(out))

aunide.
aliasad.
ushfay.
ainn.
aui.


## E02: Split up the dataset randomly into 80% train set, 10% dev set, 10% test set.
Split up the dataset randomly into 80% train set, 10% dev set, 10% test set. Train the bigram and trigram models only on the training set. Evaluate them on dev and test splits. What can you see?

In [None]:
# build the dataset
def build_dataset(words):
    X, Y = [], []
    for w in words:
      chs = ['.'] + list(w) + ['.']
      for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
          ix1 = stoi[ch1]
          ix2 = stoi[ch2]
          ix3 = stoi[ch3]

          X.append([ix1, ix2])
          Y.append(ix3)

    X = torch.tensor(X)
    Y = torch.tensor(Y)
    print(X.shape, Y.shape)
    return X, Y

import random
random.seed(42)
random.shuffle(words)
n1 = int(0.8*len(words))
n2 = int(0.9*len(words))

Xtr, Ytr = build_dataset(words[:n1])
Xdev, Ydev = build_dataset(words[n1:n2])
Xte, Yte = build_dataset(words[n2:])

torch.Size([156999, 2]) torch.Size([156999])
torch.Size([19452, 2]) torch.Size([19452])
torch.Size([19662, 2]) torch.Size([19662])


In [None]:
# initialize the network
g = torch.Generator().manual_seed(2147483647)
W = torch.randn((27*2, 27), generator=g, requires_grad=True)

In [None]:
# gradient descent
rep = 100

for k in range(rep):

  # forward pass
  xenc = F.one_hot(Xtr, num_classes=27).float()
  xenc = xenc.view(-1, 27*2) # this transforms the matrix from [N, 2, 27] -> [N, 27*2] | (N, 27*2) @ (27*2, 27) -> (N,   27)

  # softmax
  logits = xenc @ W # log-counts
  counts = logits.exp() # equivalent N
  probs = counts / counts.sum(1, keepdims=True)
  # probs.shape = [4, 27]. Isso significa que, para cada "trigram" (que são 4 no total), ele output probabilidades

  # loss
  loss = -probs[torch.arange(len(Xtr)), Ytr].log().mean()

  if k % 10 == 0:
      print(f"{k}: {loss.item():.4f}")


  # backward pass
  W.grad = None # set to zero the gradient
  loss.backward()


  # update
  W.data += -50 * W.grad

  if k == rep-1:
      print(f"{k}: {loss.item():.4f}")

0: 4.1860
10: 2.5047
20: 2.3801
30: 2.3331
40: 2.3085
50: 2.2936
60: 2.2837
70: 2.2766
80: 2.2713
90: 2.2671
99: 2.2640


In [None]:
loss.item()

2.264030694961548

In [None]:
def MLP_loss(x,y,W):
  xenc = F.one_hot(x, num_classes=27).float()
  xenc = xenc.view(-1, 27*2)

  # softmax
  logits = xenc @ W
  counts = logits.exp()
  probs = counts / counts.sum(1, keepdim=True)

  # Regularization / Model Smoothing
  reg = (W**2).mean()
  reg_str = 0.01

  # loss
  loss = -probs[torch.arange(len(x)), y].log().mean() + reg_str * reg

  return loss

In [None]:
# Evaluating the loss

print(f'Previous loss: {MLP_loss(xs, ys, W1):.4f}')
print(f'Training loss: {MLP_loss(Xtr, Ytr, W):.4f}')
print(f'Dev loss: {MLP_loss(Xdev, Ydev, W):.4f}')
print(f'Testing loss: {MLP_loss(Xte, Yte, W):.4f}')

Previous loss: 2.2727
Training loss: 2.2730
Dev loss: 2.2704
Testing loss: 2.2763


## E03: Use the dev set
Use the dev set to tune the strength of smoothing (or regularization) for the trigram model - i.e. try many possibilities and see which one works best based on the dev set loss. What patterns can you see in the train and dev set loss as you tune this strength? Take the best setting of the smoothing and evaluate on the test set once and at the end. How good of a loss do you achieve?

In [None]:
# initialize the network
g = torch.Generator().manual_seed(2147483647)
W = torch.randn((27*2, 27), generator=g, requires_grad=True)

In [None]:
  # Regularization / Model Smoothing
  reg_str = -0.0

In [None]:
# gradient descent
rep = 200

for k in range(rep):

  # forward pass
  xenc = F.one_hot(Xtr, num_classes=27).float()
  xenc = xenc.view(-1, 27*2) # this transforms the matrix from [N, 2, 27] -> [N, 27*2] | (N, 27*2) @ (27*2, 27) -> (N,   27)

  # softmax
  logits = xenc @ W # log-counts
  counts = logits.exp() # equivalent N
  probs = counts / counts.sum(1, keepdims=True)
  # probs.shape = [4, 27]. Isso significa que, para cada "trigram" (que são 4 no total), ele output probabilidades

  # Regularization / Model Smoothing
  reg = (W**2).mean()

  # loss
  loss = -probs[torch.arange(len(Xtr)), Ytr].log().mean() + reg_str * reg

  if k % 10 == 0:
      print(f"{k}: Train Loss: {loss.item():.4f} | Dev Loss {MLP_loss(Xdev, Ydev, W):.4f}")

  # backward pass
  W.grad = None # set to zero the gradient
  loss.backward()


  # update
  W.data += -50 * W.grad

  if k == rep-1:
      print(f"{k}: Train Loss: {loss.item():.4f} | Dev Loss {MLP_loss(Xdev, Ydev, W):.4f}")

0: Train Loss: 4.1860 | Dev Loss 4.2004
10: Train Loss: 2.5047 | Dev Loss 2.5072
20: Train Loss: 2.3801 | Dev Loss 2.3840
30: Train Loss: 2.3331 | Dev Loss 2.3382
40: Train Loss: 2.3085 | Dev Loss 2.3141
50: Train Loss: 2.2936 | Dev Loss 2.2995
60: Train Loss: 2.2837 | Dev Loss 2.2897
70: Train Loss: 2.2766 | Dev Loss 2.2827
80: Train Loss: 2.2713 | Dev Loss 2.2776
90: Train Loss: 2.2671 | Dev Loss 2.2736
100: Train Loss: 2.2637 | Dev Loss 2.2704
110: Train Loss: 2.2610 | Dev Loss 2.2679
120: Train Loss: 2.2587 | Dev Loss 2.2659
130: Train Loss: 2.2567 | Dev Loss 2.2641
140: Train Loss: 2.2551 | Dev Loss 2.2627
150: Train Loss: 2.2536 | Dev Loss 2.2615
160: Train Loss: 2.2524 | Dev Loss 2.2605
170: Train Loss: 2.2513 | Dev Loss 2.2596
180: Train Loss: 2.2503 | Dev Loss 2.2589
190: Train Loss: 2.2494 | Dev Loss 2.2582
199: Train Loss: 2.2487 | Dev Loss 2.2577


In [None]:
def MLP_loss(x,y,W):
  xenc = F.one_hot(x, num_classes=27).float()
  xenc = xenc.view(-1, 27*2)

  # softmax
  logits = xenc @ W
  counts = logits.exp()
  probs = counts / counts.sum(1, keepdim=True)

  # Regularization / Model Smoothing
  reg = (W**2).mean()

  # loss
  loss = -probs[torch.arange(len(x)), y].log().mean() + reg_str * reg

  return loss

In [None]:
# Data Tracking:

# Why is a negative regularization strength reducing the loss always? A bit counterintuitive
# str = -0.10 | dev = 1.6252 | test = 1.6345
# str = -0.01 | dev = 2.2383 | test = 2.2447
# str = +0.00 | dev = 2.2511 | test = 2.2573
# str = +0.01 | dev = 2.2618
# str = +0.10 | dev = 2.3214
# str = +1.00 | dev = 2.5450

In [None]:
# Final evaluation of the loss

print(f'Testing loss: {MLP_loss(Xte, Yte, W):.4f}')

Testing loss: 2.2530


#### Sampling

In [None]:
# finally, sample from the 'neural net' model
g = torch.Generator().manual_seed(2147483647)

for i in range(5):
    out = []
    ix1, ix2 = 0, 0
    while True:
        # ---------------
        # # BEFORE:
        # # p = P[ix]
        # ---------------
        # NOW:
        xenc = F.one_hot(torch.tensor([ix1,ix2]), num_classes=27).float()
        xenc = xenc.view(-1, 27*2)
        logits = xenc @ W  # predict logCounts
        counts = logits.exp()  # counts, equivalent to N
        p = counts / counts.sum(1, keepdims=True)  # probabilities for next character
        # ---------------

        ix1 = ix2
        ix2 = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
        out.append(itos[ix2])
        if ix2 == 0:
            break
    print(''.join(out))

aunide.
aliasad.
ushfay.
ainn.
aui.


## E04: Rewrite the MLP model without creating one hot vectors
We saw that our 1-hot vectors merely select a row of W, so producing these vectors explicitly feels wasteful. Can you delete our use of F.one_hot in favor of simply indexing into rows of W?

In [None]:
# initialize the network
g = torch.Generator().manual_seed(2147483647)
W = torch.randn((27*2, 27), generator=g, requires_grad=True)

In [None]:
# gradient descent
rep = 200

for k in range(rep):

  # forward pass


    # ---------------
    # # BEFORE:
    # logits = xenc @ W # log-counts
    # xenc = F.one_hot(Xtr, num_classes=27).float()
    # xenc = xenc.view(-1, 27*2) # this transforms the matrix from [N, 2, 27] -> [N, 27*2] | (N, 27*2) @ (27*2, 27) -> (N,   27)
    # ---------------
    # NOW:

  # softmax
  logits = W[Xtr[:,0]] + W[Xtr[:,1] + 27]
  counts = logits.exp() # equivalent N
  probs = counts / counts.sum(1, keepdims=True)
  # probs.shape = [4, 27]. Isso significa que, para cada "trigram" (que são 4 no total), ele output probabilidades

  # Regularization / Model Smoothing
  reg = (W**2).mean()

  # loss
  loss = -probs[torch.arange(len(Xtr)), Ytr].log().mean() + reg_str * reg


  #if k % 10 == 0:
      #print(f"{k}: Train Loss: {loss.item():.4f} | Dev Loss {MLP_loss(Xdev, Ydev, W):.4f}")

  # backward pass
  W.grad = None # set to zero the gradient
  loss.backward()


  # update
  W.data += -50 * W.grad

  if k == rep-1:
      print(f"{k}: Train Loss: {loss.item():.4f} | Dev Loss {MLP_loss(Xdev, Ydev, W):.4f}")

199: Train Loss: 2.2487 | Dev Loss 2.2466


## E05: Look up and use F.cross_entropy instead
Look up and use F.cross_entropy instead. You should achieve the same result. Can you think of why we'd prefer to use F.cross_entropy instead?

In [20]:
# initialize the network
g = torch.Generator().manual_seed(2147483647)
W = torch.randn((27*2, 27), generator=g, requires_grad=True)

In [21]:
# gradient descent
rep = 200

for k in range(rep):

  # forward pass


    # ---------------
    # # BEFORE:
    # logits = xenc @ W # log-counts
    # xenc = F.one_hot(Xtr, num_classes=27).float()
    # xenc = xenc.view(-1, 27*2) # this transforms the matrix from [N, 2, 27] -> [N, 27*2] | (N, 27*2) @ (27*2, 27) -> (N,   27)
    # ---------------
    # NOW:

  # softmax
  logits = W[Xtr[:,0]] + W[Xtr[:,1] + 27]
  # counts = logits.exp() # equivalent N
  # probs = counts / counts.sum(1, keepdims=True)
  # # probs.shape = [4, 27]. Isso significa que, para cada "trigram" (que são 4 no total), ele output probabilidades

  # # Regularization / Model Smoothing
  # reg = (W**2).mean()

  # # loss
  # loss = -probs[torch.arange(len(Xtr)), Ytr].log().mean() + reg_str * reg

  loss = F.cross_entropy(logits, Ytr)


  #if k % 10 == 0:
      #print(f"{k}: Train Loss: {loss.item():.4f} | Dev Loss {MLP_loss(Xdev, Ydev, W):.4f}")

  # backward pass
  W.grad = None # set to zero the gradient
  loss.backward()


  # update
  W.data += -50 * W.grad

  if k == rep-1:
      print(f"{k}: Train Loss: {loss.item():.4f} | Dev Loss {MLP_loss(Xdev, Ydev, W):.4f}")

199: Train Loss: 2.2482 | Dev Loss 2.2487
