# PART 1: Manual Allocation

### Reading and exploring the dataset

In [1]:
# Read the name list, one name per line. The `with` block closes the file
# handle as soon as the contents are read instead of leaving it open.
with open('../resources/Persian Names.txt', 'r', encoding='utf-8') as names_file:
    words = names_file.read().splitlines()
len(words)

4055

### Exploring and counting the bigrams in the dataset

In [2]:
# Count every adjacent character pair, with '<S>' / '<E>' marking the start
# and end of each name. Keys are (first, second) tuples, values are counts.
b = {}
for w in words:
    padded = ['<S>', *w, '<E>']
    for bigram in zip(padded, padded[1:]):
        b[bigram] = b.get(bigram, 0) + 1

In [3]:
# Bigrams ordered from most to least frequent.
sorted(b.items(), key=lambda pair: pair[1], reverse=True)

[(('<S>', 'س'), 792),
 (('ي', 'د'), 729),
 (('ه', '<E>'), 723),
 (('س', 'ي'), 699),
 (('د', 'ا'), 617),
 (('<S>', 'م'), 609),
 (('ا', 'ل'), 605),
 (('ا', 'د'), 540),
 (('ه', ' '), 539),
 (('س', 'ا'), 500),
 (('ا', 'ن'), 496),
 (('ن', '<E>'), 484),
 (('ت', '<E>'), 474),
 (('ا', '<E>'), 464),
 (('ا', 'ت'), 422),
 (('ي', '<E>'), 389),
 (('ل', 'ي'), 337),
 (('ر', 'ا'), 331),
 (('<S>', 'ا'), 327),
 (('ي', 'ن'), 321),
 (('د', 'ه'), 320),
 ((' ', 'ا'), 317),
 (('ح', 'م'), 305),
 (('م', 'ح'), 292),
 (('ر', '<E>'), 281),
 (('ا', 'م'), 277),
 (('م', 'ي'), 273),
 (('م', 'د'), 268),
 (('د', '<E>'), 260),
 (('<S>', 'ع'), 257),
 (('م', '<E>'), 256),
 (('م', 'ه'), 245),
 (('<S>', 'ن'), 235),
 (('ل', 'ه'), 233),
 (('ن', 'ا'), 231),
 (('ع', 'ل'), 225),
 (('ي', 'ر'), 223),
 (('ب', 'ي'), 220),
 (('م', 'ا'), 219),
 (('ي', ' '), 212),
 (('ر', 'ي'), 211),
 (('د', 'ي'), 205),
 (('ا', 'ر'), 202),
 (('ي', 'ا'), 199),
 (('<S>', 'ف'), 185),
 ((' ', 'س'), 179),
 (('ي', 'ه'), 177),
 (('ل', 'س'), 168),
 (('<S>', 'ب

In [4]:
# Sorted list of every distinct character that occurs in the names.
chars = sorted(set(''.join(words)))
# One extra slot is reserved for the '.' start/end marker.
LANG_SIZE = 1 + len(chars)

### Counting bigrams in a 2D torch tensor ("training the model")

In [5]:
import torch

# Bigram count table: N[i, j] will hold how often character j follows character i.
N = torch.zeros(LANG_SIZE, LANG_SIZE, dtype=torch.int32)

In [6]:
# Character <-> integer id lookup tables. Id 0 is reserved for the '.' marker
# that stands for both start and end of a name; real characters get ids 1..len(chars).
# If '.' were itself one of the dataset characters, assigning stoi['.'] = 0 would
# overwrite its id and leave a hole in itos, so fail loudly instead.
if '.' in chars:
    raise ValueError("'.' occurs in the dataset and cannot be used as the start/end marker")
stoi = {s: i + 1 for i, s in enumerate(chars)}
stoi['.'] = 0
itos = {i: s for s, i in stoi.items()}

In [7]:
# Accumulate bigram counts into N, using '.' as both the start and end marker.
# Note: N is allocated in an earlier cell, so re-running only this cell adds
# the counts a second time.
for name in words:
    ids = [stoi[c] for c in '.' + name + '.']
    for ix1, ix2 in zip(ids, ids[1:]):
        N[ix1, ix2] += 1

### Visualizing the bigram tensor

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Heat map of the count table. Each cell is labelled with its bigram (upper
# text) and its count (lower text).
fig, ax = plt.subplots(figsize=(16, 16))
ax.imshow(N, cmap='Blues')
for row in range(LANG_SIZE):
    for col in range(LANG_SIZE):
        ax.text(col, row, itos[row] + itos[col], ha="center", va="bottom", color='gray')
        ax.text(col, row, N[row, col].item(), ha="center", va="top", color='gray')
ax.axis('off');

### Efficiency! vectorized normalization of the rows, tensor broadcasting

In [8]:
# Row 0 holds the counts of characters that follow '.', i.e. name-initial
# characters. Dividing by the row total turns the counts into probabilities.
first_row = N[0].float()
p = first_row / first_row.sum()
p

tensor([0.0000, 0.0000, 0.0000, 0.0237, 0.0000, 0.0000, 0.0000, 0.0806, 0.0407,
        0.0000, 0.0128, 0.0037, 0.0123, 0.0350, 0.0190, 0.0131, 0.0030, 0.0350,
        0.0168, 0.1953, 0.0338, 0.0195, 0.0017, 0.0116, 0.0005, 0.0634, 0.0081,
        0.0456, 0.0094, 0.0212, 0.0076, 0.1502, 0.0580, 0.0170, 0.0096, 0.0136,
        0.0170, 0.0017, 0.0010, 0.0185])

In [9]:
# Draw one character id from p with a fixed seed so the result is repeatable.
g = torch.Generator().manual_seed(2147483647)
sample = torch.multinomial(p, 1, replacement=True, generator=g)
ix = sample.item()

itos[ix]

'ن'

In [10]:
# Andrej edition: normalize all rows at once.
# Adding 1 to every count is add-one smoothing: no bigram ends up with
# probability zero. Each row is then divided by its own sum via broadcasting.
P = (N + 1).float()
P = P / P.sum(dim=1, keepdim=True)

In [11]:
# Shapes used by the broadcasted division: P is (LANG_SIZE, LANG_SIZE) and,
# because keepdim=True keeps the summed axis, the row sums are (LANG_SIZE, 1).
row_sums = P.sum(1, keepdim=True)
print(P.shape)
print(row_sums.shape)

torch.Size([40, 40])
torch.Size([40, 1])


In [12]:
# Sanity check: after normalization a row of P should sum to 1.
P[0].sum()

tensor(1.)

### Sampling from the model

In [13]:
# Generate 10 names from the count-based model. Start at '.' (id 0), keep
# sampling the next character from the current character's row of P, and stop
# once '.' is drawn again.
g = torch.Generator().manual_seed(2147483647)
for _ in range(10):
    ix, out = 0, []
    while True:
        p = P[ix]
        drawn = torch.multinomial(p, num_samples=1, replacement=True, generator=g)
        ix = drawn.item()
        out.append(itos[ix])
        if ix == 0:
            break
    print(''.join(out))

ن.
طان.
ن.
م.
آراد.
صاداره.
دميه.
ه بيز.
شده بمه.
امي.


### Loss function (the negative log likelihood of the data under our model)

In [14]:
# Negative log likelihood of the whole dataset under P.
# Log-probabilities are summed bigram by bigram; n counts the bigrams so the
# second print shows the average per bigram.
log_likelihood = 0
n = 0

for name in words:
    ids = [stoi[c] for c in '.' + name + '.']
    for ix1, ix2 in zip(ids, ids[1:]):
        log_likelihood += torch.log(P[ix1, ix2])
        n += 1

nll = -log_likelihood
print(f'{nll=}')
print(f'{nll/n}')

nll=tensor(80917.7734)
2.491694450378418


# PART 2: ANN

### Creating the bigram dataset for the neural net

In [15]:
 #crate the training set of bigrams

# Only the first name is used here (words[:1]) so the tensors stay small
# enough to inspect by hand. xs holds each input character id and ys the id
# of the character that follows it.
xs, ys = [], []

for w in words[:1]:
    ids = [stoi[c] for c in '.' + w + '.']
    xs.extend(ids[:-1])
    ys.extend(ids[1:])

xs = torch.tensor(xs)
ys = torch.tensor(ys)


In [16]:
# Input character ids, one per bigram.
xs

tensor([ 0, 27,  7, 23, 31, 33])

In [17]:
# Target character ids: ys[i] is the character that follows xs[i].
ys

tensor([27,  7, 23, 31, 33,  0])

### Feeding integers into neural nets? one-hot encodings

In [18]:
import torch.nn.functional as F

# One-hot encode the inputs: one row per example with a single 1 at the
# character's id. Cast to float32 so the rows can be multiplied by float weights.
xenc = F.one_hot(xs, num_classes=LANG_SIZE).to(torch.float32)
xenc

tensor([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
         0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,

In [None]:
import matplotlib.pyplot as plt
# Render the one-hot matrix as an image: one row per input example.
plt.imshow(xenc)

<matplotlib.image.AxesImage at 0x1e38244bf90>

### The "neural net": one linear layer of neurons implemented with matrix multiplication

In [20]:
# Random weight matrix with one column per neuron.
# Shapes: (len(xs), LANG_SIZE) @ (LANG_SIZE, LANG_SIZE) -> (len(xs), LANG_SIZE).
W = torch.randn(LANG_SIZE, LANG_SIZE)
# Output of neuron 13 for input example 3.
(xenc @ W)[3, 13]

tensor(0.3814)

In [21]:
# The same entry computed by hand: row 3 of xenc dotted with column 13 of W.
(xenc[3] * W[:,13]).sum()

tensor(0.3814)

In [22]:
# All neuron outputs at once: one row per input example, one column per neuron.
xenc @ W

tensor([[ 9.1863e-01, -2.6280e-01,  1.9280e-01,  1.9076e+00, -1.8484e-01,
          1.8351e+00,  2.8744e-01, -3.0345e-01,  1.1859e+00,  1.4396e-01,
         -1.9351e-01,  1.9250e+00,  5.1559e-01, -1.1639e+00,  7.8952e-02,
          4.2444e-01,  2.5433e-03, -1.0926e+00,  9.6743e-01,  2.1447e-01,
          5.7677e-01, -1.7519e+00, -1.0232e+00,  1.8821e+00,  1.1104e+00,
          2.0274e+00,  1.1875e+00,  9.7035e-02,  2.4402e-01,  1.2993e+00,
         -1.3326e+00,  1.8801e-01, -9.1189e-01,  4.4453e-01, -7.3791e-01,
         -2.4837e-01,  4.0277e-01,  6.3645e-01,  1.5825e+00, -2.6408e+00],
        [ 6.8283e-01, -3.1976e+00,  2.2090e+00,  1.3580e-01, -1.1706e-01,
          7.0321e-01,  2.8715e-01, -1.1209e-01, -1.5281e-01,  2.5665e-01,
         -1.4520e-01,  1.5066e+00, -1.0189e+00,  6.3589e-01, -8.7110e-01,
         -3.8150e-01,  9.1894e-01,  1.6130e-01,  1.7747e+00, -1.0212e+00,
          1.6274e+00,  5.9338e-01,  5.2452e-01,  2.0195e+00,  1.0760e+00,
         -3.3299e-01,  1.0949e-01, -4

### Transforming neural net outputs into probabilities: the softmax

In [23]:
# Softmax by hand: treat the outputs as log-counts, exponentiate them into
# positive "counts" (the analogue of N), then normalize each row to sum to 1.
logits = xenc @ W
counts = torch.exp(logits)
prob = counts / counts.sum(dim=1, keepdim=True)
prob

tensor([[0.0298, 0.0091, 0.0144, 0.0801, 0.0099, 0.0745, 0.0158, 0.0088, 0.0389,
         0.0137, 0.0098, 0.0815, 0.0199, 0.0037, 0.0129, 0.0182, 0.0119, 0.0040,
         0.0313, 0.0147, 0.0212, 0.0021, 0.0043, 0.0780, 0.0361, 0.0902, 0.0390,
         0.0131, 0.0152, 0.0436, 0.0031, 0.0143, 0.0048, 0.0185, 0.0057, 0.0093,
         0.0178, 0.0225, 0.0578, 0.0008],
        [0.0270, 0.0006, 0.1244, 0.0156, 0.0121, 0.0276, 0.0182, 0.0122, 0.0117,
         0.0177, 0.0118, 0.0616, 0.0049, 0.0258, 0.0057, 0.0093, 0.0342, 0.0160,
         0.0806, 0.0049, 0.0695, 0.0247, 0.0231, 0.1029, 0.0401, 0.0098, 0.0152,
         0.0085, 0.0152, 0.0209, 0.0118, 0.0215, 0.0222, 0.0075, 0.0050, 0.0090,
         0.0384, 0.0156, 0.0133, 0.0035],
        [0.0007, 0.0166, 0.0066, 0.0031, 0.0206, 0.0461, 0.0009, 0.0129, 0.0056,
         0.0803, 0.0683, 0.0472, 0.0093, 0.0151, 0.0040, 0.0483, 0.0125, 0.0284,
         0.0234, 0.0249, 0.0242, 0.0044, 0.0004, 0.0495, 0.0320, 0.0066, 0.0362,
         0.0037, 0.1811, 

In [24]:
# Row 0 belongs to the first input, xs[0] == 0, i.e. '.': the predicted
# distribution over the first character of a name.
prob[0] # -> what should come after '.'

tensor([0.0298, 0.0091, 0.0144, 0.0801, 0.0099, 0.0745, 0.0158, 0.0088, 0.0389,
        0.0137, 0.0098, 0.0815, 0.0199, 0.0037, 0.0129, 0.0182, 0.0119, 0.0040,
        0.0313, 0.0147, 0.0212, 0.0021, 0.0043, 0.0780, 0.0361, 0.0902, 0.0390,
        0.0131, 0.0152, 0.0436, 0.0031, 0.0143, 0.0048, 0.0185, 0.0057, 0.0093,
        0.0178, 0.0225, 0.0578, 0.0008])

### Summary, Preview to next steps, reference to Micrograd

In [25]:
# Inputs of the example word again, for reference in the walkthrough below.
xs

tensor([ 0, 27,  7, 23, 31, 33])

In [26]:
# Matching targets (the next character for each input).
ys

tensor([27,  7, 23, 31, 33,  0])

In [30]:
# Randomly initialize LANG_SIZE neurons, each receiving LANG_SIZE inputs.
# The generator is seeded so the weights are reproducible.
g = torch.Generator()
g.manual_seed(2147483647)
W = torch.randn(LANG_SIZE, LANG_SIZE, generator=g)

In [31]:
# Forward pass of the one-layer network.
xenc = F.one_hot(xs, num_classes=LANG_SIZE).float()  # one-hot inputs
logits = xenc @ W                                    # predicted log-counts
# Softmax: exponentiate into (fake) counts, the analogue of N, then normalize
# each row into next-character probabilities.
counts = torch.exp(logits)
probs = counts / counts.sum(dim=1, keepdim=True)

In [32]:

# Walk through every bigram of the example word and print its loss term.
# The number of bigrams is taken from the data (len(xs)) rather than a
# hard-coded 5, so the final bigram (last character -> '.') is included and
# the printed average covers the whole word.
num_bigrams = len(xs)
nlls = torch.zeros(num_bigrams)
for i in range(num_bigrams):
  # i-th bigram:
  x = xs[i].item() # input character index
  y = ys[i].item() # label character index
  print('--------')
  print(f'bigram example {i+1}: {itos[x]}{itos[y]} (indexes {x},{y})')
  print('input to the neural net:', x)
  # print('output probabilities from the neural net:', probs[i])
  print('label (actual next character):', y)
  p = probs[i, y] # probability the net assigns to the true next character
  print('probability assigned by the net to the the correct character:', p.item())
  logp = torch.log(p)
  print('log likelihood:', logp.item())
  nll = -logp
  print('negative log likelihood:', nll.item())
  nlls[i] = nll

print('==================')
print('average negative log likelihood, i.e. loss =', nlls.mean().item())

--------
bigram example 1: .ف (indexes 0,27)
input to the neural net: 0
label (actual next character): 27
probability assigned by the net to the the correct character: 0.005299513228237629
log likelihood: -5.240140438079834
negative log likelihood: 5.240140438079834
--------
bigram example 2: فا (indexes 27,7)
input to the neural net: 27
label (actual next character): 7
probability assigned by the net to the the correct character: 0.004350846167653799
log likelihood: -5.437385082244873
negative log likelihood: 5.437385082244873
--------
bigram example 3: اط (indexes 7,23)
input to the neural net: 7
label (actual next character): 23
probability assigned by the net to the the correct character: 0.040493037551641464
log likelihood: -3.20662522315979
negative log likelihood: 3.20662522315979
--------
bigram example 4: طم (indexes 23,31)
input to the neural net: 23
label (actual next character): 31
probability assigned by the net to the the correct character: 0.026163851842284203
log likeli

### Vectorized loss

In [33]:
# Randomly initialize LANG_SIZE neurons, each receiving LANG_SIZE inputs.
# requires_grad=True asks autograd to track W so loss.backward() can fill W.grad.
g = torch.Generator()
g.manual_seed(2147483647)
W = torch.randn(LANG_SIZE, LANG_SIZE, generator=g, requires_grad=True)

In [34]:
# Forward pass: one-hot inputs -> log-counts -> softmax probabilities.
xenc = F.one_hot(xs, num_classes=LANG_SIZE).float()
logits = xenc @ W
counts = torch.exp(logits)                        # (fake) counts, the analogue of N
probs = counts / counts.sum(dim=1, keepdim=True)  # next-character probabilities


In [39]:
# Pick, for every example i, the probability assigned to its true next
# character ys[i]; the loss is the mean negative log of those probabilities.
correct_probs = probs[torch.arange(len(ys)), ys]
loss = -correct_probs.log().mean()
loss.item()

4.29724645614624

### Backward and update, in PyTorch

In [40]:
# Clear any previously stored gradient (None stands in for zero), then
# backpropagate the loss to populate W.grad.
W.grad = None
loss.backward()

In [41]:
# The gradient has the same shape as W: one entry per weight.
print(f'{W.shape=}')
print(f'{W.grad.shape=}')

W.shape=torch.Size([40, 40])
W.grad.shape=torch.Size([40, 40])


In [42]:
# One gradient-descent step with step size 0.1. The in-place update runs
# under torch.no_grad() so autograd does not record it; this is the supported
# replacement for mutating W.data directly.
with torch.no_grad():
    W -= 0.1 * W.grad

# Putting Everything Together

In [46]:
# Build the full training set: for every name, each character id (including
# the leading '.') is an input and the id that follows it is the target.
xs, ys = [], []
for w in words:
    ids = [stoi[c] for c in '.' + w + '.']
    xs += ids[:-1]
    ys += ids[1:]
xs = torch.tensor(xs)
ys = torch.tensor(ys)
num = xs.nelement()
print('number of examples: ', num)

# Initialize the 'network': a single seeded weight matrix tracked by autograd.
g = torch.Generator().manual_seed(2147483647)
W = torch.randn(LANG_SIZE, LANG_SIZE, generator=g, requires_grad=True)

number of examples:  32475


In [50]:
# gradient descent
# The one-hot inputs do not depend on W, so they are encoded once up front
# instead of being rebuilt on every iteration.
xenc = F.one_hot(xs, num_classes=LANG_SIZE).float() # input to the network: one-hot encoding
for k in range(100):

  # forward pass
  logits = xenc @ W # predict log-counts
  counts = logits.exp() # counts, equivalent to N
  probs = counts / counts.sum(1, keepdims=True) # probabilities for next character
  # data loss (mean negative log likelihood) plus a small penalty on W**2
  loss = -probs[torch.arange(num), ys].log().mean() + 0.01*(W**2).mean()
  if k%10==0:
    print(loss.item())

  # backward pass
  W.grad = None # reset the gradient before accumulating a new one
  loss.backward()

  # update: in-place step under no_grad so autograd does not record it
  with torch.no_grad():
    W -= 50 * W.grad

2.514431953430176
2.514009714126587
2.5136187076568604
2.5132553577423096
2.5129175186157227
2.5126023292541504
2.512308120727539
2.5120327472686768
2.511775016784668
2.51153302192688


# Sampling from the Neural Net

In [51]:
# Finally, sample 10 names from the 'neural net' model.
# Same procedure as sampling from P, except that the next-character
# distribution comes from a forward pass through W instead of a row lookup
# (previously: p = P[ix]).
g = torch.Generator().manual_seed(2147483647)

for _ in range(10):
    out = []
    ix = 0
    while True:
        # forward pass for the single current character
        xenc = F.one_hot(torch.tensor([ix]), num_classes=LANG_SIZE).float()
        counts = torch.exp(xenc @ W)                  # log-counts -> counts
        p = counts / counts.sum(dim=1, keepdim=True)  # next-character probabilities

        ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
        out.append(itos[ix])
        if ix == 0:
            break
    print(''.join(out))

ن.
طان.
ن.
م.
آراد.
صاداره.
دميه.
ه بيز.
شره بمه.
امي.
