<a href="https://colab.research.google.com/github/Bipin2005-dev/Architecting-LLMs-WiDS/blob/main/week2/week2_assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Exercise: Making an interpolated bigram-trigram model. Here, the probability of a letter occuring would be lambda * P_tri + (1 - lambda) * P_bi. We tune lambda on the dev set, and then test on the test set


Training set(80%), dev set(10%), test set(10%)

In [39]:
from google.colab import files

files.upload()

{}

In [40]:
import random

names = open('names.txt', 'r').read().splitlines()
random.shuffle(names)
total_entries = len(names)
training = names[:int(0.8 * total_entries)]
dev = names[int(0.8 * total_entries): int(0.9 * total_entries)]
test = names[int(0.9 * total_entries):]

In [41]:
stoi = {ch:i for i,ch in enumerate(".abcdefghijklmnopqrstuvwxyz")}
itos = {i:ch for ch,i in stoi.items()}

In [42]:
# Make the bigram and trigram language tables
import torch

bigram_table = torch.zeros(27,27)
trigram_table = torch.zeros(27,27,27)
bigram_table += 0.01
trigram_table += 0.01 # Smoothing

for word in training:
  big_w = ['.'] + list(word) + ['.']
  tri_w = ['.', '.'] + list(word) + ['.', '.']

  for ch1, ch2 in zip(big_w, big_w[1:]):
    bigram_table[stoi[ch1], stoi[ch2]] += 1
  for ch1, ch2, ch3 in zip(tri_w, tri_w[1:], tri_w[2:]):
    trigram_table[stoi[ch1], stoi[ch2], stoi[ch3]] += 1

In [43]:
# Normalization
bigram_table /= bigram_table.sum(1, True)
trigram_table /= trigram_table.sum(2, True)

In [50]:
# Tuning of lambda on the dev set
best_l = 0.5
best_l_nll = 20.0
for l in torch.linspace(0.0, 1.0, 101):
  l += 0.01
  nll = 0.0
  n = 0
  for word in dev:
    ix1, ix2 = 0, 0
    for char in list(word) + ['.']:
      n += 1
      ix3 = stoi[char]
      p_big = bigram_table[ix2, ix3]
      p_tri = trigram_table[ix1,ix2,ix3]
      p = l * p_tri + (1 - l) * p_big
      nll -= torch.log(p)
      ix1, ix2 = ix2, ix3
  nll /= n
  if nll < best_l_nll:
    best_l = l
    best_l_nll = nll
  print(nll.item(), l.item())

print(best_l)

2.4478237628936768 tensor(0.0100)
2.4424374103546143 tensor(0.0200)
2.437211513519287 tensor(0.0300)
2.43217396736145 tensor(0.0400)
2.4272263050079346 tensor(0.0500)
2.4224157333374023 tensor(0.0600)
2.417741537094116 tensor(0.0700)
2.4131577014923096 tensor(0.0800)
2.4087038040161133 tensor(0.0900)
2.4042701721191406 tensor(0.1000)
2.3999950885772705 tensor(0.1100)
2.395796537399292 tensor(0.1200)
2.3916685581207275 tensor(0.1300)
2.387639045715332 tensor(0.1400)
2.383652687072754 tensor(0.1500)
2.3797523975372314 tensor(0.1600)
2.3759350776672363 tensor(0.1700)
2.372166395187378 tensor(0.1800)
2.3685107231140137 tensor(0.1900)
2.364885091781616 tensor(0.2000)
2.3613507747650146 tensor(0.2100)
2.357858419418335 tensor(0.2200)
2.354438304901123 tensor(0.2300)
2.3510665893554688 tensor(0.2400)
2.3477323055267334 tensor(0.2500)
2.344433307647705 tensor(0.2600)
2.341256856918335 tensor(0.2700)
2.338122606277466 tensor(0.2800)
2.334991693496704 tensor(0.2900)
2.3319785594940186 tensor(0.3

In [53]:
# Now use this tuned lambda to evaluate the loss on the test set
l = best_l.item()

nll = 0
n = 0
for word in test:
  ix1 = 0
  ix2 = 0
  for char in list(word) + ['.']:
    ix3 = stoi[char]
    p_big = bigram_table[ix2, ix3]
    p_tri = trigram_table[ix1,ix2,ix3]
    p = l * p_tri + (1 - l) * p_big
    nll -= torch.log(p)
    n += 1

nll /= n
print(f'{nll.item():.4f}')


4.7853
