### makemore demo - YouTube follow-along

[Video - The spelled-out intro to language modeling: building makemore](https://www.youtube.com/watch?v=PaCmpygFfXo&list=PLAqhIrjkxbuWI23v9cThsA9GvCAUhRvKZ&index=2)

In [1]:
# Setup - common to both approaches 
import torch

# Data set: ~32k first names, one name per line.
# Read it inside a context manager so the file handle is closed as soon as
# the contents are loaded, rather than being left open until garbage
# collection.
with open('names.txt', 'r') as names_file:
  words = names_file.read().splitlines()

# Every distinct character that occurs in the names, in sorted order.
chars = sorted(set(''.join(words)))

# s to i lookup: `.` is given index 0 and every other character is shifted by +1.
# `.` is the special token used to mark both the start and the end of a word.
stoi = {s: i + 1 for i, s in enumerate(chars)}
stoi['.'] = 0

# i to s lookup (the inverse of `stoi`)
itos = {i: s for s, i in stoi.items()}

In [2]:
# Approach 1 (no neural network): count how often each bigram occurs.
#
# `N[a, b]` holds the number of times character `b` directly follows
# character `a`. The table starts out as a 27x27 matrix of zeros.
N = torch.zeros((27, 27), dtype=torch.int32)

# Tally every pair of adjacent characters in every word.
for name in words:
  # pad with `.` so that word starts and word ends are counted as bigrams too
  padded = ['.', *name, '.']
  for prev_ch, next_ch in zip(padded, padded[1:]):
    # row = index of the first character, column = index of the second
    # (both are `stoi` indices in the range 0..26)
    row, col = stoi[prev_ch], stoi[next_ch]
    N[row, col] += 1

# Turn the counts into probabilities: these are the parameters of the bigram
# language model.
# "Model smoothing": using `N + 1` instead of `N` keeps every entry of P above
# zero, so the loss never has to take log(0), which would be infinite.
P = (N + 1).float()

# Normalise each row of P so it sums to 1.
#   P                       is 27 x 27
#   P.sum(1, keepdim=True)  is 27 x 1
# The division is "broadcastable": the size-1 dimension of the row sums is
# stretched across all 27 columns.
# https://pytorch.org/docs/stable/notes/broadcasting.html?highlight=broadcasting
#
# The in-place `/=` avoids creating a new tensor for the result.
row_totals = P.sum(1, keepdim=True)
P /= row_totals

g = torch.Generator().manual_seed(2147483647)

In [3]:
# Sample 5 names from the count-based bigram model
for _ in range(5):
  out = []
  ix = 0  # every name starts from the `.` token (index 0)
  while True:
    # draw the index of the next character from row `ix` of P
    ix = torch.multinomial(P[ix], num_samples=1, replacement=True, generator=g).item()
    out.append(itos[ix])
    # drawing `.` (index 0) marks the end of the word
    if ix == 0:
      break

  print(''.join(out))


mor.
axx.
minaymoryles.
kondlaisah.
anchshizarie.


In [4]:
# Approach 2: neural network approach trained on bigrams

# provides `F.one_hot`, used for the one-hot encoding of the inputs
import torch.nn.functional as F

#
# Dataset: 228K bigrams from the 32K example names
#
# xs[i] is the index of a bigram's first character and ys[i] is the index of
# the character that follows it (the training target).
xs, ys = [], []
for name in words:
  # pad with `.` so word starts and word ends become training examples too
  padded = ['.', *name, '.']
  for prev_ch, next_ch in zip(padded, padded[1:]):
    xs.append(stoi[prev_ch])
    ys.append(stoi[next_ch])
xs, ys = torch.tensor(xs), torch.tensor(ys)
num = xs.nelement()
print('number of examples: ', num)

# Initialize the 'network': a single 27x27 weight matrix of random normal
# values, drawn from a seeded generator so runs are reproducible.
g = torch.Generator().manual_seed(2147483647)
W = torch.randn((27, 27), generator=g, requires_grad=True)

number of examples:  228146


In [7]:
# Gradient descent
#
# Input to the network: one-hot encoding of the first character of each bigram.
# It does not depend on W, so it is built once here instead of being
# re-encoded on every one of the 100 iterations.
xenc = F.one_hot(xs, num_classes=27).float()

for _ in range(100):

  # forward pass
  logits = xenc @ W  # predict log-counts
  counts = logits.exp()  # counts, equivalent to N
  # probabilities for next character: each row is normalised to sum to 1
  probs = counts / counts.sum(1, keepdims=True)
  # loss = average negative log likelihood of the correct next character
  #        + a regularization term.
  # The regularization loss `0.01*(W**2).mean()` tries to make all W's 0;
  # the larger the `0.01` factor, the more uniform the predicted
  # probabilities become.
  loss = -probs[torch.arange(num), ys].log().mean() + 0.01 * (W**2).mean()
  print(loss.item())

  # backward pass
  W.grad = None  # clear the old gradient so backward() starts fresh
  loss.backward()

  # update: step against the gradient with a learning rate of 50.
  # Done under `no_grad` so the in-place change to W is not recorded by
  # autograd (the supported replacement for mutating `W.data`).
  with torch.no_grad():
    W += -50 * W.grad

# Earlier we had 2.47 loss when we manually did the counts.
# So we'd like this neural network approach to become as "good", when measuring the loss.

2.4899590015411377
2.4897918701171875
2.489628314971924
2.489469528198242
2.489314079284668
2.4891624450683594
2.4890148639678955
2.48887038230896
2.488729476928711
2.4885923862457275
2.4884581565856934
2.4883270263671875
2.4881982803344727
2.4880735874176025
2.4879510402679443
2.4878318309783936
2.487715005874634
2.487600564956665
2.4874887466430664
2.487379550933838
2.4872729778289795
2.487168550491333
2.4870662689208984
2.486966133117676
2.4868686199188232
2.4867727756500244
2.4866793155670166
2.4865870475769043
2.486497640609741
2.4864094257354736
2.486323118209839
2.486238718032837
2.4861562252044678
2.4860751628875732
2.4859957695007324
2.4859180450439453
2.485841989517212
2.485767126083374
2.4856936931610107
2.485621929168701
2.4855518341064453
2.4854824542999268
2.485414743423462
2.4853484630584717
2.485283374786377
2.4852190017700195
2.485156536102295
2.4850947856903076
2.485034465789795
2.4849750995635986
2.484917163848877
2.4848599433898926
2.4848034381866455
2.4847488403320

In [8]:
# Sample from neural net model
g = torch.Generator().manual_seed(2147483647)

# Sampling only reads W and never calls backward(), so gradient tracking is
# switched off: no autograd graph is built for these forward passes.
with torch.no_grad():
  for _ in range(5):

    out = []
    ix = 0  # every name starts from the `.` token (index 0)
    while True:

      # forward pass for a single character: one-hot -> logits -> softmax
      xenc = F.one_hot(torch.tensor([ix]), num_classes=27).float()
      logits = xenc @ W  # predict log-counts
      counts = logits.exp()  # counts, equivalent to N
      p = counts / counts.sum(1, keepdims=True)  # probabilities for next character

      ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
      out.append(itos[ix])
      # drawing `.` (index 0) marks the end of the word
      if ix == 0:
        break
    print(''.join(out))

# With the same seed, the samples are expected to closely match those drawn
# from the count-based P matrix, because the trained W approximates the same
# bigram distribution. Small differences are possible since the two models
# are not identical.

mor.
axx.
minaymoryles.
kondmaisah.
anchshizarie.


In [None]:
mor.
axx.
minaymoryles.
kondlaisah.
anchshizarie.
