## Makemore part 1: solved exercises

#### Ex1 - Making it a Trigram Model

In [2]:
# Read the corpus, one entry per line. The context manager closes the file
# handle deterministically instead of leaving it to the garbage collector.
with open('names.txt', 'r') as f:
    words = f.read().splitlines()

# Vocabulary: every distinct character occurring in the corpus, sorted.
chars = sorted(set(''.join(words)))

# Character -> integer index. Real characters start at 1 because index 0 is
# assigned to '.', the marker used to pad the start and end of each word.
stoi = {s: i + 1 for i, s in enumerate(chars)}
stoi['.'] = 0

# Inverse mapping: integer index -> character.
itos = {i: s for s, i in stoi.items()}

In [3]:
def groupbytwo(str):
    """Return every pair of adjacent items of ``str``, in order.

    Window ``i`` is ``str[i:i+2]``, so a sequence of length ``L`` yields
    ``L - 1`` windows. Inputs with fewer than two items (including the empty
    string, which previously raised ``UnboundLocalError``) yield ``[]``.

    Args:
        str: Sliceable sequence to slide over, typically a padded word such
            as ``'..emma.'``. The parameter name shadows the builtin and is
            kept only so existing keyword callers keep working.

    Returns:
        list: The two-item windows, e.g. ``['..', '.a', 'ab', 'b.']`` for
        ``'..ab.'``.
    """
    # max(..., 0) makes the empty input produce zero windows instead of a
    # negative range bound.
    return [str[start:start + 2] for start in range(max(len(str) - 1, 0))]

# Demo: show each two-character context of a padded word next to the
# character the model should predict after it.
chs = '..' + 'exemplo' + '.'
contexts = groupbytwo(chs)
# The targets start after the two padding markers; there is one context more
# than there are targets, so indexing by target position is always in range.
for pos, ch2 in enumerate(chs[2:]):
    ch1 = contexts[pos]
    print(f'for {ch1}, target is {ch2}')

for .., target is e
for .e, target is x
for ex, target is e
for xe, target is m
for em, target is p
for mp, target is l
for pl, target is o
for lo, target is .


In [4]:
# Build the trigram training set: every example maps the indices of two
# consecutive characters to the index of the character that follows them.
import torch
import numpy as np

contexts, targets = [], []
for w in words:
  # Two leading markers give the first letter a full context; the trailing
  # marker is the end-of-word target.
  chs = '..' + w + '.'
  for ch1, ch2 in zip(groupbytwo(chs), chs[2:]):
    contexts.append([stoi[c] for c in ch1])
    targets.append(stoi[ch2])
xs = torch.tensor(contexts)
ys = torch.tensor(targets)
num = ys.nelement()
print('number of examples: ', num)

# Single linear layer: one block of 27 input rows per context position,
# 27 output logits. Seeded so the initial weights are reproducible.
g = torch.Generator().manual_seed(2147483647)
W = torch.randn((27*2, 27), generator=g, requires_grad=True)

number of examples:  228146


In [7]:
import torch.nn.functional as F

# Network input: the one-hot vectors of the two context characters laid side
# by side, giving one row of 27*2 values per example. It depends only on xs,
# so it is built once and vectorised instead of being rebuilt row by row
# (through NumPy) on every training step.
xenc = F.one_hot(xs, num_classes=27).float().view(num, 27*2)

for k in range(2):

  # forward pass
  logits = xenc @ W # predict log-counts
  counts = logits.exp() # counts, equivalent to N
  probs = counts / counts.sum(1, keepdims=True) # probabilities for next character
  # mean negative log-likelihood of the targets plus an L2 penalty on W
  loss = -probs[torch.arange(ys.shape[0]), ys].log().mean() + 0.01*(W**2).mean()
  print(loss.item())

  # backward pass
  W.grad = None # set to zero the gradient
  loss.backward()

  # update: plain gradient descent with learning rate 50
  W.data += -50* W.grad

4.242241382598877
3.512132167816162


Sampling some names from the trained trigram model:

In [8]:
g = torch.Generator().manual_seed(123456)

for i in range(5):

  out = []
  # Indices of the two context characters. Both start as 0, the index of
  # '.', because every word is preceded by the context '..'.
  prev_ix, ix = 0, 0
  while True:

    # Bigram version: p = P[ix]
    # Trigram version: feed [one-hot(previous), one-hot(current)] through
    # the trained layer, using the same layout as the training input.
    xenc = F.one_hot(torch.tensor([[prev_ix, ix]]), num_classes=27).float().view(1, 27*2)
    with torch.no_grad(): # sampling only; no autograd graph needed
      logits = xenc @ W # predict log-counts
      counts = logits.exp() # counts, equivalent to N
      p = counts / counts.sum(1, keepdims=True) # probabilities for next character

    # Slide the context: the current character becomes the previous one.
    prev_ix = ix
    ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
    out.append(itos[ix])
    if itos[ix] == '.': # sampled the end-of-word marker
      break

  print(''.join(out))

krklxqaltx.
bemkahodzik.
rkch.
ejaes.
rllonmcwx.


### Ex2 - Solve the previous exercise without hardcoding the context size

In [14]:
# Read the corpus, one entry per line. The context manager closes the file
# handle deterministically instead of leaving it to the garbage collector.
with open('names.txt', 'r') as f:
    words = f.read().splitlines()

# Vocabulary: every distinct character occurring in the corpus, sorted.
chars = sorted(set(''.join(words)))

# Character -> integer index. Real characters start at 1 because index 0 is
# assigned to '.', the marker used to pad the start and end of each word.
stoi = {s: i + 1 for i, s in enumerate(chars)}
stoi['.'] = 0

# Inverse mapping: integer index -> character.
itos = {i: s for s, i in stoi.items()}

In [15]:
def groupbyn(str, n):
    """Return the sliding windows of at most ``n`` items over ``str``.

    Window ``i`` is the first ``n`` items of ``str[i:]``. A sequence of
    length ``L`` always yields ``L - 1`` windows, whatever ``n`` is, so for
    ``n > 2`` the trailing windows are shorter than ``n``. Callers that zip
    the result with ``str[n:]`` never consume those short windows. Inputs
    with fewer than two items (including the empty string, which previously
    raised ``UnboundLocalError``) yield ``[]``.

    Args:
        str: Sliceable sequence to slide over, typically a padded word such
            as ``'...emma.'``. The parameter name shadows the builtin and is
            kept only so existing keyword callers keep working.
        n: Window (context) size.

    Returns:
        list: The windows in order, e.g. ``['...', '..a', '.ab', 'ab.', 'b.']``
        for ``groupbyn('...ab.', 3)``.
    """
    # str[start:][:n] mirrors the original "drop one item, take the first n"
    # steps exactly; max(..., 0) makes empty input produce zero windows.
    return [str[start:][:n] for start in range(max(len(str) - 1, 0))]

# Demo: show each n-character context of a padded word next to the character
# the model should predict after it.
n = 3
chs = '.' * n + 'exemplo' + '.'  # n start markers, the word, one end marker

contexts = groupbyn(chs, n)
# The targets start after the n padding markers; there are never fewer
# contexts than targets, so indexing by target position is always in range.
for pos, ch2 in enumerate(chs[n:]):
    ch1 = contexts[pos]
    print(f'for {ch1}, target is {ch2}')

for ..., target is e
for ..e, target is x
for .ex, target is e
for exe, target is m
for xem, target is p
for emp, target is l
for mpl, target is o
for plo, target is .


In [16]:
# Build the training set for a context of n characters: every example maps
# the indices of n consecutive characters to the index of the next one.
import torch
import numpy as np

n = 3 # context size

contexts, targets = [], []
for w in words:

  # n leading markers give the first letter a full context; the trailing
  # marker is the end-of-word target.
  chs = '.' * n + w + '.'

  for ch1, ch2 in zip(groupbyn(chs,n), chs[n:]):
    contexts.append([stoi[c] for c in ch1])
    targets.append(stoi[ch2])
xs = torch.tensor(contexts)
ys = torch.tensor(targets)
num = ys.nelement()
print('number of examples: ', num)

# Single linear layer: one block of 27 input rows per context position,
# 27 output logits. Seeded so the initial weights are reproducible.
g = torch.Generator().manual_seed(2147483647)
W = torch.randn((27*n, 27), generator=g, requires_grad=True)

number of examples:  228146


In [17]:
import torch.nn.functional as F

# Network input: the one-hot vectors of the n context characters laid side by
# side, giving one row of 27*n values per example. It depends only on xs, so
# it is built once and vectorised instead of being rebuilt row by row
# (through NumPy) on every training step.
xenc = F.one_hot(xs, num_classes=27).float().view(num, 27*n)

for k in range(10):

  # forward pass
  logits = xenc @ W # predict log-counts
  counts = logits.exp() # counts, equivalent to N
  probs = counts / counts.sum(1, keepdims=True) # probabilities for next character
  # mean negative log-likelihood of the targets plus an L2 penalty on W
  loss = -probs[torch.arange(ys.shape[0]), ys].log().mean() + 0.01*(W**2).mean()
  print(loss.item())

  # backward pass
  W.grad = None # set to zero the gradient
  loss.backward()

  # update: plain gradient descent with learning rate 50
  W.data += -50* W.grad

4.522070407867432
4.238542556762695


In [19]:
# Sample some examples

g = torch.Generator().manual_seed(1234)

for i in range(10):

  out = []
  # Indices of the last n characters. Every word starts from a context of n
  # '.' markers (index 0). Kept as plain ints so no helper loop has to reuse
  # the outer loop variable.
  context = [0] * n

  while True:
    # Same layout as the training input: n one-hot vectors side by side.
    xenc = F.one_hot(torch.tensor([context]), num_classes=27).float().view(1, 27*n)
    with torch.no_grad(): # sampling only; no autograd graph needed
      logits = xenc @ W # predict log-counts
      counts = logits.exp() # counts, equivalent to N
      p = counts / counts.sum(1, keepdims=True) # probabilities for next character

    ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
    # Slide the window: drop the oldest character, append the sampled one.
    context = context[1:] + [ix]
    out.append(itos[ix])
    if itos[ix] == '.': # sampled the end-of-word marker
      break

  print(''.join(out))

kelxlpzgdkdqhfjibihudqgayna.
eeein.
tre.
rryhovo.
ecschzxt.
eesrrewfryiadyckdckchucbch.
err.
eilgt.
irr.
eerkomnzcgmqza.
