# PART 1: Manual Allocation

### Reading and exploring the dataset

In [1]:
# Load the dataset as a list of strings, one entry per line of names.txt.
# The context manager closes the file handle as soon as the read finishes
# (the bare open(...).read() left that to the garbage collector), and the
# explicit encoding keeps the read independent of the platform default.
with open('names.txt', 'r', encoding='utf-8') as f:
    words = f.read().splitlines()
len(words)

FileNotFoundError: ignored

### Exploring and counting the bigrams in the dataset

In [None]:
# Count how often each adjacent pair of tokens occurs, with '<S>' and '<E>'
# marking the start and the end of every word.
b = {}
for word in words:
    tokens = ['<S>', *word, '<E>']
    for pair in zip(tokens, tokens[1:]):
        b[pair] = b.get(pair, 0) + 1

In [None]:
# Bigrams ordered from most to least frequent (ties keep their insertion order).
sorted(b.items(), key=lambda kv: kv[1], reverse=True)

### Counting bigrams in a 2D torch tensor ("training the model")

In [None]:
import torch
# 27 x 27 integer table of bigram counts, initially all zeros.
N = torch.zeros(27, 27, dtype=torch.int32)

In [None]:
# Vocabulary: every distinct character that appears in the dataset, sorted.
chars = sorted(set(''.join(words)))

# character -> index, numbered from 1; index 0 is reserved for the '.' marker
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
stoi['.'] = 0
# index -> character (the inverse of stoi)
itos = {idx: ch for ch, idx in stoi.items()}

In [None]:
# Tally every adjacent character pair into the count table:
# row = index of the first character, column = index of the second.
# Each word is wrapped in '.' on both sides so starts and ends are counted too.
for word in words:
    tokens = ['.', *word, '.']
    for first, second in zip(tokens, tokens[1:]):
        N[stoi[first], stoi[second]] += 1

### Visualizing the bigram tensor

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Draw the count table as a heat map and annotate every cell with the bigram
# it stands for and how many times that bigram was counted.
plt.figure(figsize=(16, 16))
plt.imshow(N, cmap='Blues')
for row in range(27):
    for col in range(27):
        label = itos[row] + itos[col]
        count = N[row, col].item()
        # Both texts share the same anchor point: the label is aligned by its
        # bottom edge and the count by its top edge, so they do not overlap.
        plt.text(col, row, label, ha="center", va="bottom", color='gray')
        plt.text(col, row, count, ha="center", va="top", color='gray')
plt.axis('off');

### Efficiency! vectorized normalization of the rows, tensor broadcasting

In [None]:
# First row of the count table (the row for '.', i.e. what starts a word),
# converted to floats and scaled so that it sums to 1.
p = N[0].float()
p /= p.sum()
p

In [None]:
# Draw one character index from the distribution p. The generator is seeded
# so the draw is reproducible from run to run.
g = torch.Generator().manual_seed(2147483647)
ix = torch.multinomial(
    p,
    num_samples=1,
    replacement=True,
    generator=g,
).item()

itos[ix]

In [None]:
# Andrej edition: normalise all rows of the count table at once.

# Adding 1 to every count is model smoothing: no bigram is left with a
# probability of exactly zero.
P = (N + 1).float()
# keepdim=True keeps the row sums as a column, so the division broadcasts
# along each row and every row ends up summing to 1.
P = P / P.sum(dim=1, keepdim=True)

In [None]:
# Shapes involved in the row normalisation: the table itself, then its
# row sums with the summed dimension kept.
print(P.shape, P.sum(dim=1, keepdim=True).shape, sep='\n')
# 27, 27
# 27, 1

In [None]:
# Sanity check: after row normalisation, row 0 should sum to 1
# (up to floating-point rounding).
P[0].sum()

### Sampling from the model

In [None]:
# Generate 10 names by repeatedly sampling the next character from the row of
# P that belongs to the current character.
g = torch.Generator().manual_seed(2147483647)
for i in range(10):
    ix = 0  # index 0 is '.', so every name starts from the boundary marker
    out = []
    finished = False
    while not finished:
        p = P[ix]
        ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
        out.append(itos[ix])
        # sampling '.' again ends the name (the '.' itself is kept in out)
        finished = ix == 0
    print(''.join(out))

### Loss function (the negative log likelihood of the data under our model)

In [None]:
# Sum the log-probability that P assigns to every bigram in the dataset,
# counting the bigrams as we go.
log_likelihood = 0
n = 0

for word in words:
    tokens = ['.', *word, '.']
    for prev_ch, next_ch in zip(tokens, tokens[1:]):
        prob = P[stoi[prev_ch], stoi[next_ch]]
        logprob = torch.log(prob)

        log_likelihood += logprob
        n += 1
        # print(f'{prev_ch}{next_ch}: {prob:.4f}-> {logprob:.4f}')

# Negate to get a loss (lower is better), then report the total followed by
# the per-bigram average.
nll = -log_likelihood
print(f'{nll=}')
print(f'{nll/n}')

# PART 2: ANN

### Creating the bigram dataset for the neural net

In [None]:
# create the training set of bigrams

# xs collects the index of each character, ys the index of the character
# that follows it. Only the first word is used here.
xs, ys = [], []

for word in words[:1]:
    tokens = ['.', *word, '.']
    for cur, nxt in zip(tokens, tokens[1:]):
        xs.append(stoi[cur])
        ys.append(stoi[nxt])

# convert the Python lists into integer tensors
xs = torch.tensor(xs)
ys = torch.tensor(ys)


In [None]:
# Display the input-character indices of the training bigrams.
xs

In [None]:
# Display the label (next-character) indices of the training bigrams.
ys

### Feeding integers into neural nets? one-hot encodings

In [None]:
import torch.nn.functional as F

# One-hot encode the input indices (27 classes) and convert the integer
# result to float32 so it can be multiplied with float weights.
xenc = F.one_hot(xs, num_classes=27).to(torch.float32)
xenc

In [None]:
# Visualise the one-hot matrix: one row per example, with a single non-zero
# cell at the column of the encoded character index.
plt.imshow(xenc)

### The "neural net": one linear layer of neurons implemented with matrix multiplication

In [None]:
# Random 27 x 27 weight matrix; look at one entry of the product with the
# one-hot inputs (row 3, column 13).
W = torch.randn(27, 27)
torch.matmul(xenc, W)[3, 13]
# (5,27) @ (27 ,27) -> (5, 27)

In [None]:
# The same number as (xenc @ W)[3, 13], written out by hand: the dot product
# of row 3 of xenc with column 13 of W.
(xenc[3] * W[:,13]).sum()

In [None]:
# Full matrix product: one row of outputs per example, one column per
# column of W.
xenc @ W

### Transforming neural net outputs into probabilities: the softmax

In [None]:
# Interpret the layer outputs as log-counts, exponentiate them into positive
# "counts" (the analogue of N), then normalise each row into probabilities.
logits = torch.matmul(xenc, W)
counts = logits.exp()
prob = counts / counts.sum(dim=1, keepdim=True)
prob

In [None]:
# Row 0 of prob: the distribution over the next character for the first
# training input, which is the '.' start marker.
prob[0] # -> what should come after '.'

### Summary, Preview to next steps, reference to Micrograd

In [None]:
# Reminder of the inputs: character indices fed to the network.
xs

In [None]:
# Reminder of the labels: index of the character that actually comes next.
ys

In [None]:
# Weights for 27 neurons with 27 inputs each, drawn from a standard normal.
# The generator is seeded so the initialisation is reproducible.
g = torch.Generator().manual_seed(2147483647)
W = torch.randn(27, 27, generator=g)

In [None]:
# Forward pass of the single-layer network.
# input to the network: one-hot encoding of the character indices
xenc = F.one_hot(xs, num_classes=27).to(torch.float32)
# predicted log-counts
logits = torch.matmul(xenc, W)
# (fake) counts, equivalent to N
counts = logits.exp()
# probabilities for the next character; exponentiating and then normalising
# each row is what is called a 'softmax'
probs = counts / counts.sum(dim=1, keepdim=True)

In [None]:

# Walk through the five bigram examples one at a time, printing what the
# network was given, what it should have predicted, and the resulting loss.
nlls = torch.zeros(5)
for i in range(5):
    # i-th bigram: input character index and label character index
    x, y = xs[i].item(), ys[i].item()
    # probability the network gives to the correct label, and the loss
    # terms derived from it
    p = probs[i, y]
    logp = torch.log(p)
    nll = -logp

    print('--------')
    print(f'bigram example {i+1}: {itos[x]}{itos[y]} (indexes {x},{y})')
    print('input to the neural net:', x)
    # print('output probabilities from the neural net:', probs[i])
    print('label (actual next character):', y)
    print('probability assigned by the net to the the correct character:', p.item())
    print('log likelihood:', logp.item())
    print('negative log likelihood:', nll.item())
    nlls[i] = nll

# the loss is the mean of the per-example negative log likelihoods
print('==================')
print('average negative log likelihood, i.e. loss =', nlls.mean().item())