<a href="https://colab.research.google.com/github/AIGeekProgrammer/MiscellaneousAI/blob/main/NLP/Bigram_using_NN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Notebook: Bigram using NN<br>
Author: Szymon Manduk<br>
Date: Oct 31, 2022<br>
Description: implementing a bigram (character-level) language model using a single linear layer - based on the idea presented by A. Karpathy: https://youtu.be/PaCmpygFfXo<br>

In [None]:
import torch

In [None]:
# Mount Google Drive at /gdrive (Colab only) so that files stored there,
# such as the names file read later in this notebook, can be opened by path.
from google.colab import drive
drive.mount('/gdrive')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [None]:
# Read the names file into a list, one entry per non-empty line, and wrap every
# name in '.' markers that denote the beginning and the end of a word.
words = []
with open('/gdrive/My Drive/Test/names.txt', 'r') as f:
  for line in f:  # iterate over the file directly; no need to materialise readlines()
    name = line.strip()  # drops the newline (also '\r' and stray surrounding spaces)
    if not name:
      # A blank line would otherwise become '..' and add a bogus '.' -> '.' bigram.
      continue
    words.append('.' + name + '.')
words[:5]

['.emma.', '.olivia.', '.ava.', '.isabella.', '.sophia.']

In [None]:
# Collect every distinct character that occurs in the words
# and keep them in sorted order ...
s = {ch for word in words for ch in word}
letters = sorted(s)
letters[:15]

['.', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n']

In [None]:
# ... so that we can build lookup tables in both directions:
# index -> letter first, then the inverse letter -> index mapping.
idx2char = dict(enumerate(letters))
char2idx = {ch: i for i, ch in idx2char.items()}
print(f'Ex: Index for f is {char2idx["f"]}')
print(f'Ex: Character for index 14 is {idx2char[14]}')
print(f'Ex: Character for index 0 is {idx2char[0]}')

Ex: Index for f is 6
Ex: Character for index 14 is n
Ex: Character for index 0 is .


In [None]:
# Now we're ready to build the data and label tensors. Let's start with the first word only:
# every pair of adjacent characters yields one (input, label) example.
x, y = [], []
for word in words[:1]:
  for ch1, ch2 in zip(word, word[1:]):
    x.append(char2idx[ch1])
    y.append(char2idx[ch2])
X = torch.tensor(x, dtype=torch.int64) # int64 is required by the one_hot call below
Y = torch.tensor(y, dtype=torch.int64) # labels stay as integer class indices (they are not one-hot encoded)
print(f'Length of the dataset: {len(X)}')
print(f'Shape of the dataset: {X.shape}')
print(f'Data X: {X}')
print(f'Labels Y: {Y}')

# Raw integer indices are not a meaningful input for the network, so turn them into one-hot vectors.
# The vector width is taken from the alphabet built above instead of a hard-coded 27,
# so the cell keeps working if the names file contains a different set of characters.
from torch.nn.functional import one_hot
X = one_hot(X, num_classes=len(letters)).float()
print(X.shape)
print(X)
print(X.dtype)
print(Y.shape)
print(Y)
print(Y.dtype)

Length of the dataset: 5
Shape of the dataset: torch.Size([5])
Data X: tensor([ 0,  5, 13, 13,  1])
Labels Y: tensor([ 5, 13, 13,  1,  0])
torch.Size([5, 27])
tensor([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0.]])
torch.float32
torch.Size([5])
tensor([ 5, 13, 13,  1,  0])
torch.int64


In [None]:
# Looks ok, so build the same (input, label) pairs from the full dataset.
x, y = [], []
for word in words:
  for ch1, ch2 in zip(word, word[1:]):
    x.append(char2idx[ch1])
    y.append(char2idx[ch2])
X = torch.tensor(x, dtype=torch.int64)
Y = torch.tensor(y, dtype=torch.int64)
print(f'Length of the dataset: {len(X)}')
print(f'Shape of the dataset: {X.shape}')
print(f'Data X: {X}')
print(f'Labels Y: {Y}')

# One-hot encode the inputs; the width follows the alphabet size rather than a hard-coded 27.
X = one_hot(X, num_classes=len(letters)).float()
print(X.shape, X.dtype)
print(Y.shape, Y.dtype)

Length of the dataset: 228146
Shape of the dataset: torch.Size([228146])
Data X: tensor([ 0,  5, 13,  ..., 25, 26, 24])
Labels Y: tensor([ 5, 13, 13,  ..., 26, 24,  0])
torch.Size([228146, 27]) torch.float32
torch.Size([228146]) torch.int64


In [None]:
# Initialize the weight matrix W: it is square, with both dimensions equal to the
# width of the one-hot encoded input (X.shape[1]).
# A dedicated, seeded generator makes the starting weights reproducible, so
# re-running the notebook from a fresh kernel starts training from the same point.
g = torch.Generator().manual_seed(2147483647)
n_chars = X.shape[1]
W = torch.randn((n_chars, n_chars), generator=g, requires_grad=True)
print(W.shape)
print(W[0:2])

torch.Size([27, 27])
tensor([[-0.4908,  1.0303, -0.2841, -0.8224, -0.3459, -1.2313, -1.2028,  0.2898,
          0.3738,  0.5329, -0.6490, -0.0796,  0.3531, -0.8806,  0.2581,  0.4152,
         -0.0859, -0.7320, -0.0279, -1.3311, -1.8962, -0.3533, -1.2447,  1.8507,
          1.2245,  0.3065, -1.2184],
        [ 1.1710, -0.2254, -1.7892,  0.2823,  0.6517, -0.3705, -0.5825, -0.1892,
          1.0506, -2.7771, -0.4508, -1.1691, -1.2294,  1.5214,  0.9521, -0.9905,
         -1.2825, -0.9770,  0.2860,  0.6390, -0.7895,  0.5103,  1.2528, -1.5229,
         -0.1985,  0.7597,  0.6804]], grad_fn=<SliceBackward0>)


In [None]:
# First, let's look at the raw outputs of the layer (logits), before any normalization.
y_hat = torch.matmul(X, W)
print(y_hat.shape)
print(y_hat[:3])
print(y_hat[1].sum()) # a row of raw logits is not a probability distribution, so it generally does not sum to one

torch.Size([228146, 27])
tensor([[-0.4908,  1.0303, -0.2841, -0.8224, -0.3459, -1.2313, -1.2028,  0.2898,
          0.3738,  0.5329, -0.6490, -0.0796,  0.3531, -0.8806,  0.2581,  0.4152,
         -0.0859, -0.7320, -0.0279, -1.3311, -1.8962, -0.3533, -1.2447,  1.8507,
          1.2245,  0.3065, -1.2184],
        [ 0.5376, -1.1097,  0.3294, -0.4814,  0.7639,  1.2436,  0.1327,  0.4370,
          0.3091,  0.9484, -0.6145,  0.1337, -1.6202, -0.8083, -0.1803, -0.6315,
         -3.6680,  0.7417, -0.5151,  0.2471,  0.3862, -0.1556,  1.2671,  0.5975,
         -0.2727, -0.1975,  0.1396],
        [ 1.4480,  0.4760,  0.7686, -1.6357,  1.1872, -0.3883, -0.2518,  1.8967,
          1.1371, -0.7463, -0.3349,  1.4677, -0.2780,  0.8616,  1.0619, -1.8788,
         -1.8582, -0.5489, -0.2618,  0.0728,  0.6515, -1.4452,  0.2392, -1.7503,
          1.8052,  0.9609, -1.3850]], grad_fn=<SliceBackward0>)
tensor(-2.0399, grad_fn=<SumBackward0>)


In [None]:
# Then let's look at the softmax'ed (normalized) probabilities.
from torch.nn.functional import softmax
y_hat_norm = softmax(torch.matmul(X, W), dim=-1)  # last dim == dim 1 for this 2-D tensor
print(y_hat_norm.shape)
print(y_hat_norm[:3])
print(y_hat_norm[1].sum()) # each row sums to one because softmax normalizes it into a distribution

torch.Size([228146, 27])
tensor([[0.0193, 0.0884, 0.0238, 0.0139, 0.0223, 0.0092, 0.0095, 0.0422, 0.0459,
         0.0538, 0.0165, 0.0291, 0.0449, 0.0131, 0.0409, 0.0478, 0.0290, 0.0152,
         0.0307, 0.0083, 0.0047, 0.0222, 0.0091, 0.2008, 0.1074, 0.0429, 0.0093],
        [0.0497, 0.0096, 0.0404, 0.0180, 0.0624, 0.1008, 0.0332, 0.0450, 0.0396,
         0.0750, 0.0157, 0.0332, 0.0057, 0.0129, 0.0243, 0.0154, 0.0007, 0.0610,
         0.0174, 0.0372, 0.0427, 0.0249, 0.1031, 0.0528, 0.0221, 0.0238, 0.0334],
        [0.0859, 0.0325, 0.0435, 0.0039, 0.0662, 0.0137, 0.0157, 0.1345, 0.0629,
         0.0096, 0.0144, 0.0876, 0.0153, 0.0478, 0.0584, 0.0031, 0.0031, 0.0117,
         0.0155, 0.0217, 0.0387, 0.0048, 0.0256, 0.0035, 0.1227, 0.0528, 0.0051]],
       grad_fn=<SliceBackward0>)
tensor(1.0000, grad_fn=<SumBackward0>)


In [None]:
# Forward pass written out by hand: exponentiate the logits, normalize every row
# into probabilities, then average the negative log-probability assigned to the
# true next character of each example.
logits = X @ W
counts = torch.exp(logits)
probs = counts / counts.sum(dim=1, keepdim=True)
example_idx = torch.arange(X.shape[0])
loss = -torch.log(probs[example_idx, Y]).mean()
print(f'Loss: {loss.item()}')

# Backward pass; the gradient is cleared first so that it does not accumulate.
W.grad = None
loss.backward()
W.grad[:1]

Loss: 3.607822895050049


tensor([[ 0.0027, -0.0069, -0.0024, -0.0048, -0.0043, -0.0054, -0.0005,  0.0030,
          0.0026,  0.0050, -0.0083, -0.0089, -0.0006, -0.0093,  0.0007,  0.0050,
          0.0018,  0.0017, -0.0029, -0.0078, -0.0051,  0.0028, -0.0004,  0.0269,
          0.0145,  0.0037, -0.0028]])

In [None]:
  # Forward pass using CrossEntropyLoss, which is applied directly
  # to the raw, unnormalized outputs (logits) of the layer.
  from torch.nn import CrossEntropyLoss
  criterion = CrossEntropyLoss()

  y_hat = torch.matmul(X, W)

  # Compute and report the loss.
  loss = criterion(y_hat, Y)
  print(f'Loss: {loss.item()}')

  # Backward pass; the gradient is cleared first so that it does not accumulate.
  W.grad = None
  loss.backward()
  W.grad[:1]

Loss: 3.6078226566314697


tensor([[ 0.0027, -0.0069, -0.0024, -0.0048, -0.0043, -0.0054, -0.0005,  0.0030,
          0.0026,  0.0050, -0.0083, -0.0089, -0.0006, -0.0093,  0.0007,  0.0050,
          0.0018,  0.0017, -0.0029, -0.0078, -0.0051,  0.0028, -0.0004,  0.0269,
          0.0145,  0.0037, -0.0028]])

In [None]:
  # Forward pass (let's use NLLLoss).
  # NLLLoss expects LOG-probabilities as input, so the logits go through log_softmax.
  # Feeding plain softmax output would produce a meaningless negative "loss"
  # (just minus the mean probability of the target) and the wrong gradients.
  from torch.nn import NLLLoss
  from torch.nn.functional import log_softmax
  criterion = NLLLoss()
  y_hat = log_softmax(X @ W, dim=1)

  # Calculate and print loss; with log-probabilities this equals the cross-entropy of the logits.
  loss = criterion(y_hat, Y)
  print(f'Loss: {loss.item()}')

  # Backward pass, previously zeroing gradients.
  W.grad = None
  loss.backward()
  W.grad[0:1]

Loss: -0.040998440235853195


tensor([[ 9.2852e-05, -1.2840e-03, -2.1797e-05, -2.7065e-05, -5.8073e-05,
         -1.7537e-05,  2.8234e-05,  7.9035e-05,  4.4753e-05,  1.1917e-04,
         -9.5801e-05, -2.3839e-04, -9.3591e-05, -8.2644e-05, -8.8378e-06,
          1.4721e-04,  7.3826e-05,  6.6834e-05, -7.2955e-05, -3.5020e-05,
         -4.3893e-06,  9.8962e-05,  2.8710e-05,  6.9510e-04,  4.5305e-04,
          1.0554e-04,  6.8570e-06]])

In [None]:
# This allows us to optimize this simple model with plain full-batch gradient descent:
# compute the forward pass by matrix multiplication (if we wanted NLLLoss we would
# additionally apply log-softmax), calculate the loss, do the backward pass
# AND finally update the parameters using the calculated gradient.

criterion = CrossEntropyLoss()

n_steps = 200          # number of gradient-descent iterations
learning_rate = 20.0   # step size of each parameter update
log_every = 10         # print the loss every this many iterations

for i in range(n_steps):
  # forward pass
  y_hat = X @ W

  # calculate and print loss
  loss = criterion(y_hat, Y)
  if (i+1) % log_every == 0:
    print(f'Loss at {i+1} iteration: {loss.item()}')

  # backward pass, previously zeroing gradients
  W.grad = None
  loss.backward()

  # parameters update; done under no_grad so autograd does not record the in-place
  # change (the supported replacement for mutating W.data directly)
  with torch.no_grad():
    W -= learning_rate * W.grad



Loss at 10 iteration: 2.94762921333313
Loss at 20 iteration: 2.727717161178589
Loss at 30 iteration: 2.642289638519287
Loss at 40 iteration: 2.5973644256591797
Loss at 50 iteration: 2.5690245628356934
Loss at 60 iteration: 2.549506902694702
Loss at 70 iteration: 2.5353474617004395
Loss at 80 iteration: 2.5246477127075195
Loss at 90 iteration: 2.516289710998535
Loss at 100 iteration: 2.5095908641815186
Loss at 110 iteration: 2.5041098594665527
Loss at 120 iteration: 2.4995474815368652
Loss at 130 iteration: 2.4956905841827393
Loss at 140 iteration: 2.4923861026763916
Loss at 150 iteration: 2.4895222187042236
Loss at 160 iteration: 2.487015724182129
Loss at 170 iteration: 2.484804153442383
Loss at 180 iteration: 2.4828379154205322
Loss at 190 iteration: 2.4810805320739746
Loss at 200 iteration: 2.4795010089874268


In [None]:
# Now we are ready to use our model to generate a few words.
# Sampling needs no gradients, so it runs under no_grad to avoid building autograd graphs.
with torch.no_grad():
  for _ in range(15):
    pred = torch.tensor([char2idx['.']])  # we start with the initial character
    pred_word = []  # but we do not add the initial character to the result
    while True:
      # one-hot encode the previous character; the width must match W's number of rows
      prev_char = one_hot(pred, num_classes=W.shape[0]).float()
      p = softmax(prev_char @ W, dim=1)  # probability of each possible next character
      # sample the next character index; multinomial returns an extra dimension, hence squeeze
      pred = torch.multinomial(p, num_samples=1, replacement=True).squeeze(1)
      pred_ch = idx2char[pred.item()]  # convert the index back to a character
      if pred_ch == '.':  # end-of-word marker -> stop generating this word
        break
      pred_word.append(pred_ch)  # add character to the list
    print(''.join(pred_word))

myleirapeiosaenie
loa
enige
gavtavise
jan
aho
luramrison
atlea
jena
ton
aanieanaa
an
le
syada
n
