# PART 1: Manual Allocation

### Reading and exploring the dataset

In [1]:
# Read the names dataset, one name per line. The context manager closes the
# file handle deterministically instead of leaving it to the garbage collector.
with open('../resources/names.txt', 'r', encoding='utf-8') as names_file:
    words = names_file.read().splitlines()
len(words)

32033

### Exploring and counting the bigrams in the dataset

In [2]:
# Tally how often each (previous, next) token pair occurs across all names.
# Every name is framed by the special '<S>' (start) and '<E>' (end) tokens.
b = {}
for name in words:
    tokens = ['<S>', *name, '<E>']
    for pair in zip(tokens, tokens[1:]):
        b[pair] = b.get(pair, 0) + 1

In [3]:
# bigrams ordered from most to least frequent
sorted(b.items(), key=lambda item: item[1], reverse=True)

[(('n', '<E>'), 6763),
 (('a', '<E>'), 6640),
 (('a', 'n'), 5438),
 (('<S>', 'a'), 4410),
 (('e', '<E>'), 3983),
 (('a', 'r'), 3264),
 (('e', 'l'), 3248),
 (('r', 'i'), 3033),
 (('n', 'a'), 2977),
 (('<S>', 'k'), 2963),
 (('l', 'e'), 2921),
 (('e', 'n'), 2675),
 (('l', 'a'), 2623),
 (('m', 'a'), 2590),
 (('<S>', 'm'), 2538),
 (('a', 'l'), 2528),
 (('i', '<E>'), 2489),
 (('l', 'i'), 2480),
 (('i', 'a'), 2445),
 (('<S>', 'j'), 2422),
 (('o', 'n'), 2411),
 (('h', '<E>'), 2409),
 (('r', 'a'), 2356),
 (('a', 'h'), 2332),
 (('h', 'a'), 2244),
 (('y', 'a'), 2143),
 (('i', 'n'), 2126),
 (('<S>', 's'), 2055),
 (('a', 'y'), 2050),
 (('y', '<E>'), 2007),
 (('e', 'r'), 1958),
 (('n', 'n'), 1906),
 (('y', 'n'), 1826),
 (('k', 'a'), 1731),
 (('n', 'i'), 1725),
 (('r', 'e'), 1697),
 (('<S>', 'd'), 1690),
 (('i', 'e'), 1653),
 (('a', 'i'), 1650),
 (('<S>', 'r'), 1639),
 (('a', 'm'), 1634),
 (('l', 'y'), 1588),
 (('<S>', 'l'), 1572),
 (('<S>', 'c'), 1542),
 (('<S>', 'e'), 1531),
 (('j', 'a'), 1473),
 (

In [4]:
# the alphabet: every distinct character that appears in the names, sorted
chars = sorted(set(''.join(words)))
# vocabulary size: the alphabet plus one extra slot
LANG_SIZE = len(chars) + 1
LANG_SIZE

27

### Counting bigrams in a 2D torch tensor ("training the model")

In [5]:
import torch

# square integer table with one row and one column per vocabulary entry,
# initialised to zero
N = torch.zeros(LANG_SIZE, LANG_SIZE, dtype=torch.int32)

In [6]:
# string -> index: letters take 1..len(chars); index 0 is reserved for '.'
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
stoi['.'] = 0
# index -> string: the inverse lookup
itos = {idx: ch for ch, idx in stoi.items()}

In [7]:
# Fill the count table: N[i, j] = number of times token j follows token i,
# with every name wrapped in the '.' boundary token on both sides.
# Reset the table first so that re-running this cell does not double-count.
N.zero_()
for w in words:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2 in zip(chs, chs[1:]):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        N[ix1, ix2] += 1

### Visualizing the bigram tensor

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Heat-map of the bigram count table. Every cell is annotated with the bigram
# it stands for (upper text) and how many times it was counted (lower text).
plt.figure(figsize=(16,16))
plt.imshow(N, cmap='Blues')
for row in range(LANG_SIZE):
    for col in range(LANG_SIZE):
        label = itos[row] + itos[col]
        count = N[row, col].item()
        plt.text(col, row, label, ha="center", va="bottom", color='gray')
        plt.text(col, row, count, ha="center", va="top", color='gray')
plt.axis('off');

### Efficiency! vectorized normalization of the rows, tensor broadcasting

In [8]:
# row 0 of the count table, converted to float and normalised to sum to 1
first_row = N[0].float()
p = first_row / first_row.sum()
p

tensor([0.0000, 0.1377, 0.0408, 0.0481, 0.0528, 0.0478, 0.0130, 0.0209, 0.0273,
        0.0184, 0.0756, 0.0925, 0.0491, 0.0792, 0.0358, 0.0123, 0.0161, 0.0029,
        0.0512, 0.0642, 0.0408, 0.0024, 0.0117, 0.0096, 0.0042, 0.0167, 0.0290])

In [9]:
# draw a single index from p, using a seeded generator so the draw is reproducible
g = torch.Generator().manual_seed(2147483647)
sample = torch.multinomial(p, num_samples=1, replacement=True, generator=g)
ix = sample.item()

itos[ix]

'j'

In [10]:
# Andrej edition

# add-one smoothing: N + 1 so that no bigram ends up with a zero count
smoothed = (N + 1).float()
# keepdim=True keeps the row sums as a column, so the division broadcasts
# across each row and every row of P sums to 1
P = smoothed / smoothed.sum(dim=1, keepdim=True)

In [11]:
# shapes that take part in the broadcast: the matrix and its column of row sums
row_sums = P.sum(1, keepdim=True)
print(P.shape)         # 27, 27
print(row_sums.shape)  # 27, 1

torch.Size([27, 27])
torch.Size([27, 1])


In [12]:
# sanity check: sum of the first row of P
P[0].sum()

tensor(1.)

### Sampling from the model

In [13]:
# Generate 10 names from the count-based model: start at index 0 ('.') and keep
# sampling the next index from the current index's row of P until 0 is drawn.
g = torch.Generator().manual_seed(2147483647)
for sample_no in range(10):
    ix = 0
    out = []
    finished = False
    while not finished:
        p = P[ix]
        ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
        out.append(itos[ix])
        finished = (ix == 0)
    print(''.join(out))

junide.
janasah.
p.
cony.
a.
nn.
kohin.
tolian.
juee.
ksahnaauranilevias.


### Loss function (the negative log likelihood of the data under our model)

In [14]:
# Accumulate the log-probability that P assigns to every bigram in the data.
log_likelihood = 0
n = 0  # number of bigrams visited

for w in words:
    chs = ['.', *w, '.']
    for prev_ch, next_ch in zip(chs, chs[1:]):
        prob = P[stoi[prev_ch], stoi[next_ch]]
        logprob = prob.log()
        log_likelihood += logprob
        n += 1
        # print(f'{prev_ch}{next_ch}: {prob:.4f}-> {logprob:.4f}')

# negative log likelihood, then its per-bigram average
nll = -log_likelihood
print(f'{nll=}')
print(f'{nll/n}')

nll=tensor(559951.5625)
2.4543561935424805


# PART 2: ANN

### Creating the bigram dataset for the neural net

In [15]:
# create the training set of bigrams

# inputs (xs) and labels (ys): for each bigram, the index of its first and of
# its second character; only the first word is used here
xs, ys = [], []

for w in words[:1]:
    idxs = [stoi[ch] for ch in ['.'] + list(w) + ['.']]
    xs.extend(idxs[:-1])
    ys.extend(idxs[1:])

xs = torch.tensor(xs)
ys = torch.tensor(ys)


In [16]:
# inputs: index of the first character of each bigram
xs

tensor([ 0,  5, 13, 13,  1])

In [17]:
# labels: index of the second character of each bigram
ys

tensor([ 5, 13, 13,  1,  0])

### Feeding integers into neural nets? one-hot encodings

In [18]:
import torch.nn.functional as F

# one-hot encode the input indices (one row per example, LANG_SIZE columns)
# and cast the integer result to float
xenc = F.one_hot(xs, num_classes=LANG_SIZE).to(torch.float32)
xenc

tensor([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [None]:
import matplotlib.pyplot as plt
# render the one-hot matrix as an image
plt.imshow(xenc)

<matplotlib.image.AxesImage at 0x1e38244bf90>

### The "neural net": one linear layer of neurons implemented with matrix multiplication

In [19]:
# random LANG_SIZE x LANG_SIZE weight matrix drawn from a standard normal
W = torch.randn(LANG_SIZE, LANG_SIZE)
# (5, 27) @ (27, 27) -> (5, 27); look at row 3, column 13 of the product
(xenc @ W)[3, 13]

tensor(-0.3582)

In [20]:
# the same entry computed by hand: element-wise product of row 3 of xenc with
# column 13 of W, summed
(xenc[3] * W[:,13]).sum()

tensor(-0.3582)

In [21]:
# the full matrix product: one row of outputs per example
xenc @ W

tensor([[-0.1320,  0.7659,  0.7271, -0.6136,  1.6699, -1.1053, -0.5580,  0.0158,
         -0.6062, -0.3438, -1.1515, -0.9943, -0.0936, -0.8486, -1.5511, -0.0958,
         -1.4812,  0.2592, -0.9420,  1.2620,  1.5844,  0.5304, -1.8027, -0.8347,
         -0.0798, -0.9612, -2.6211],
        [-0.8520, -1.3428, -0.7346,  1.8748,  0.5972,  0.2344,  0.4577,  1.4824,
          0.7135,  0.8814, -0.6732,  1.2794,  0.3847, -0.8054, -1.4857, -0.1758,
         -0.2293,  0.9962,  0.3420, -0.6015, -0.6228, -0.2651,  0.3583,  0.2010,
          0.8959,  0.2023,  0.2552],
        [ 0.0692, -0.7265, -1.7623, -2.0510,  1.4543,  0.6123, -1.9616,  0.4388,
         -0.2646,  0.7914, -0.9430,  1.6699, -0.3310, -0.3582,  0.7033, -0.0885,
         -0.9250, -0.7624,  0.2985, -1.8251, -1.1501,  1.1395, -1.2706,  0.1079,
          0.9409, -1.4204, -1.3623],
        [ 0.0692, -0.7265, -1.7623, -2.0510,  1.4543,  0.6123, -1.9616,  0.4388,
         -0.2646,  0.7914, -0.9430,  1.6699, -0.3310, -0.3582,  0.7033, -0.0885

### transforming neural net outputs into probabilities: the softmax

In [22]:
# softmax, spelled out step by step
logits = xenc @ W                             # interpreted as log-counts
counts = torch.exp(logits)                    # positive "counts", the analogue of N
row_totals = counts.sum(dim=1, keepdim=True)  # one total per example
prob = counts / row_totals                    # each row is normalised by its total
prob

tensor([[0.0280, 0.0688, 0.0662, 0.0173, 0.1699, 0.0106, 0.0183, 0.0325, 0.0174,
         0.0227, 0.0101, 0.0118, 0.0291, 0.0137, 0.0068, 0.0291, 0.0073, 0.0414,
         0.0125, 0.1130, 0.1560, 0.0544, 0.0053, 0.0139, 0.0295, 0.0122, 0.0023],
        [0.0100, 0.0061, 0.0112, 0.1528, 0.0426, 0.0296, 0.0370, 0.1032, 0.0478,
         0.0566, 0.0120, 0.0842, 0.0344, 0.0105, 0.0053, 0.0197, 0.0186, 0.0635,
         0.0330, 0.0128, 0.0126, 0.0180, 0.0335, 0.0287, 0.0574, 0.0287, 0.0302],
        [0.0325, 0.0147, 0.0052, 0.0039, 0.1299, 0.0560, 0.0043, 0.0470, 0.0233,
         0.0669, 0.0118, 0.1611, 0.0218, 0.0212, 0.0613, 0.0278, 0.0120, 0.0142,
         0.0409, 0.0049, 0.0096, 0.0948, 0.0085, 0.0338, 0.0777, 0.0073, 0.0078],
        [0.0325, 0.0147, 0.0052, 0.0039, 0.1299, 0.0560, 0.0043, 0.0470, 0.0233,
         0.0669, 0.0118, 0.1611, 0.0218, 0.0212, 0.0613, 0.0278, 0.0120, 0.0142,
         0.0409, 0.0049, 0.0096, 0.0948, 0.0085, 0.0338, 0.0777, 0.0073, 0.0078],
        [0.0362, 0.0176,

In [23]:
# first row of the probability matrix, i.e. the prediction for the first example
prob[0] # -> what should come after '.'

tensor([0.0280, 0.0688, 0.0662, 0.0173, 0.1699, 0.0106, 0.0183, 0.0325, 0.0174,
        0.0227, 0.0101, 0.0118, 0.0291, 0.0137, 0.0068, 0.0291, 0.0073, 0.0414,
        0.0125, 0.1130, 0.1560, 0.0544, 0.0053, 0.0139, 0.0295, 0.0122, 0.0023])

### Summary, Preview to next steps, reference to Micrograd

In [24]:
# inputs: index of the first character of each bigram
xs

tensor([ 0,  5, 13, 13,  1])

In [25]:
# labels: index of the second character of each bigram
ys

tensor([ 5, 13, 13,  1,  0])

In [26]:
# LANG_SIZE neurons with LANG_SIZE inputs each, drawn from a standard normal;
# the seeded generator makes the initialisation reproducible
g = torch.Generator().manual_seed(2147483647)
W = torch.randn(LANG_SIZE, LANG_SIZE, generator=g)

In [27]:
# forward pass
xenc = F.one_hot(xs, num_classes=LANG_SIZE).float()  # one-hot network input
logits = xenc @ W                                    # predicted log-counts
counts = torch.exp(logits)                           # (fake) counts, the analogue of N
probs = counts / counts.sum(dim=1, keepdim=True)     # next-character probabilities
# note: exponentiating and then normalising is what is called a 'softmax'

In [28]:

# Walk through every training bigram and show how its loss term is computed.
# The number of examples is taken from xs rather than hard-coded, so the cell
# keeps working if the training set above is built from a different word.
num_examples = len(xs)
nlls = torch.zeros(num_examples)
for i in range(num_examples):
  # i-th bigram:
  x = xs[i].item() # input character index
  y = ys[i].item() # label character index
  print('--------')
  print(f'bigram example {i+1}: {itos[x]}{itos[y]} (indexes {x},{y})')
  print('input to the neural net:', x)
  # print('output probabilities from the neural net:', probs[i])
  print('label (actual next character):', y)
  p = probs[i, y] # probability the net gives to the true label
  print('probability assigned by the net to the the correct character:', p.item())
  logp = torch.log(p)
  print('log likelihood:', logp.item())
  nll = -logp
  print('negative log likelihood:', nll.item())
  nlls[i] = nll

print('==================')
# the loss is the mean of the per-example negative log likelihoods
print('average negative log likelihood, i.e. loss =', nlls.mean().item())

--------
bigram example 1: .e (indexes 0,5)
input to the neural net: 0
label (actual next character): 5
probability assigned by the net to the the correct character: 0.01228625513613224
log likelihood: -4.399273872375488
negative log likelihood: 4.399273872375488
--------
bigram example 2: em (indexes 5,13)
input to the neural net: 5
label (actual next character): 13
probability assigned by the net to the the correct character: 0.018050700426101685
log likelihood: -4.014570713043213
negative log likelihood: 4.014570713043213
--------
bigram example 3: mm (indexes 13,13)
input to the neural net: 13
label (actual next character): 13
probability assigned by the net to the the correct character: 0.026691533625125885
log likelihood: -3.623408794403076
negative log likelihood: 3.623408794403076
--------
bigram example 4: ma (indexes 13,1)
input to the neural net: 13
label (actual next character): 1
probability assigned by the net to the the correct character: 0.07367686182260513
log likeliho

### Vectorized loss

In [29]:
# LANG_SIZE neurons with LANG_SIZE inputs each, drawn from a seeded standard
# normal; requires_grad=True makes autograd track operations on W
g = torch.Generator().manual_seed(2147483647)
W = torch.randn(LANG_SIZE, LANG_SIZE, generator=g, requires_grad=True)

In [30]:
# forward pass
xenc = F.one_hot(xs, num_classes=LANG_SIZE).float()  # one-hot network input
logits = xenc @ W                                    # predicted log-counts
counts = torch.exp(logits)                           # (fake) counts, the analogue of N
probs = counts / counts.sum(dim=1, keepdim=True)     # next-character probabilities


In [31]:
# mean negative log-likelihood of the labels:
# probs[i, ys[i]] is the probability assigned to the i-th label
rows = torch.arange(len(ys))
loss = -(probs[rows, ys].log().mean())
loss.item()

3.7693049907684326

### Backward and update, in PyTorch

In [32]:
W.grad = None # clear any previously stored gradient before backpropagating
loss.backward()

In [33]:
# compare the shape of the weights with the shape of their gradient
print(f'{W.shape=}')
print(f'{W.grad.shape=}')

W.shape=torch.Size([27, 27])
W.grad.shape=torch.Size([27, 27])


In [34]:
# one gradient-descent step with learning rate 0.1; performed under no_grad so
# the in-place update is not recorded by autograd (preferred over mutating
# W.data, which bypasses autograd's bookkeeping)
with torch.no_grad():
    W += -0.1 * W.grad

# Putting Everything Together

In [35]:
# create the dataset: every bigram of every word as an (input, label) index pair
xs, ys = [], []
for w in words:
  idxs = [stoi[ch] for ch in ['.'] + list(w) + ['.']]
  xs += idxs[:-1]
  ys += idxs[1:]
xs = torch.tensor(xs)
ys = torch.tensor(ys)
num = xs.numel()
print('number of examples: ', num)

# initialize the 'network'
g = torch.Generator().manual_seed(2147483647)
W = torch.randn(LANG_SIZE, LANG_SIZE, generator=g, requires_grad=True)

number of examples:  228146


In [36]:
# gradient descent

# The one-hot inputs do not depend on W, so encode them once up front instead
# of rebuilding the same matrix on every iteration.
xenc = F.one_hot(xs, num_classes=LANG_SIZE).float() # input to the network: one-hot encoding

for k in range(100):

  # forward pass
  logits = xenc @ W # predict log-counts
  counts = logits.exp() # counts, equivalent to N
  probs = counts / counts.sum(1, keepdims=True) # probabilities for next character
  # mean negative log likelihood of the labels plus a small penalty on the
  # squared weights
  loss = -probs[torch.arange(num), ys].log().mean() + 0.01*(W**2).mean()
  if k%10==0:
    print(loss.item())

  # backward pass
  W.grad = None # clear the previous gradient
  loss.backward()

  # update (learning rate 50); no_grad keeps the in-place step out of autograd
  # and avoids mutating W.data directly
  with torch.no_grad():
    W += -50 * W.grad

3.768618583679199
2.696505308151245
2.5822560787200928
2.5413522720336914
2.52126407623291
2.509855031967163
2.5027060508728027
2.4978787899017334
2.494438886642456
2.4918932914733887


# Sampling from the Neural Net

In [37]:
# finally, sample from the 'neural net' model
g = torch.Generator().manual_seed(2147483647)

# Sampling is pure inference: W has requires_grad=True, so without no_grad
# every generated character would build an autograd graph that is never used.
with torch.no_grad():
  for i in range(10):

    out = []
    ix = 0 # start from index 0, the '.' token
    while True:

      # ----------
      # BEFORE:
      #p = P[ix]
      # ----------
      # NOW:
      xenc = F.one_hot(torch.tensor([ix]), num_classes=LANG_SIZE).float()
      logits = xenc @ W # predict log-counts
      counts = logits.exp() # counts, equivalent to N
      p = counts / counts.sum(1, keepdims=True) # probabilities for next character
      # ----------

      ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
      out.append(itos[ix])
      if ix == 0: # drawing '.' again ends the name
        break
    print(''.join(out))

junide.
janasah.
p.
cfay.
a.
nn.
kohin.
tolian.
juwe.
kalanaauranilevias.
