In [1]:
import pandas as pd
import torch
import matplotlib.pyplot as plt
import itertools
import tqdm
import random

%matplotlib inline

In [16]:
companies_df = pd.read_csv(
  "../data/cleansed_layer/companies_usa_size_over_10.csv", usecols=["name"]
)

companies = companies_df.name.to_list()

In [17]:
E = torch.zeros((27, 27), dtype=torch.int32)

In [18]:
alphabet = sorted(list(set(''.join(companies))))
strtoint = {s:i+1 for i,s in enumerate(alphabet)}
strtoint['.'] = 0
inttostr = {i:s for s,i in strtoint.items()}

In [19]:
# splitting data into train, dev and test sets
random.seed(10110609)
random.shuffle(companies)
print("5 Example after shuffling: ", companies[:5])

n1 = int(len(companies) * 0.8)
n2 = int(len(companies) * 0.9)

X_train = companies[:n1]
X_dev = companies[n1:n2]
X_test = companies[n2:]

len(X_train), len(_), len(X_test)

5 Example after shuffling:  ['managedoffice', 'idmatrixindia', 'sightmd', 'popshelf', 'jaroop']


(72858, 3, 9108)

In [20]:
for word in X_train:
  chs = ['.'] + list(word) + ['.']
  for ch1, ch2 in zip(chs, chs[1:]):
    ix1 = strtoint[ch1]
    ix2 = strtoint[ch2]
    E[ix1, ix2] += 1    

In [21]:
P = (E+1).float()
P /= P.sum(1, keepdims=True)

In [29]:
def evaluate(dataset):
  n = 0
  log_likelihood = 0

  for word in dataset:
    chs = ['.'] + list(word) + ['.']
    for ch1, ch2 in zip(chs, chs[1:]):
      ix1 = strtoint[ch1]
      ix2 = strtoint[ch2]
      prob = P[ix1, ix2]
      logprob = torch.log(prob)
      log_likelihood += logprob
      n += 1

  print(f"{log_likelihood=}")
  neg_logl = -log_likelihood
  print(f"{neg_logl=}")
  loss = neg_logl/n
  print(f"{loss=}")
  
  return loss.item()

In [30]:
# compute the loss on the train and test set
print("Evaluation on the train set: ")
loss_train = evaluate(X_train)

print("\nEvaluation on the dev set: ")
loss_dev = evaluate(X_dev)

print("\nEvaluation on the test set: ")
loss_test = evaluate(X_test)

Evaluation on the train set: 
log_likelihood=tensor(-1819176.3750)
neg_logl=tensor(1819176.3750)
loss=tensor(2.7226)

Evaluation on the dev set: 
log_likelihood=tensor(-226748.8594)
neg_logl=tensor(226748.8594)
loss=tensor(2.7253)

Evaluation on the test set: 
log_likelihood=tensor(-226677.6562)
neg_logl=tensor(226677.6562)
loss=tensor(2.7218)


In [31]:
# Sample from the model
g = torch.Generator().manual_seed(10110609)
samples = []

for i in range(20):
  word = ''
  ix = 0
  while True:
    p = P[ix]
    ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
    letter = inttostr[ix]

    if letter == '.':
      break
  
    word += letter
  samples.append(word)
  print(word)

tietenus
malm
artap
ayera
mebali
uswe
halloum
paruis
joa
ftrtx
ts
b
brces
s
mpmerictre
tlvil
rdelerquatepa
dicystedicherdr
mino
ec
