In [16]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/wordzz/companies_cleaned.csv


In [17]:
import pandas as pd
import torch
import matplotlib.pyplot as plt

%matplotlib inline

In [18]:
# Only the "name" column is read from the CSV; the names are then kept as a plain Python list.
companies_df = pd.read_csv("../data/companies_cleaned.csv", usecols=["name"])
companies = companies_df["name"].tolist()

In [19]:
len(companies)

126021

In [20]:
companies[:5]

['ibm', 'walmart', 'microsoft', 'pwc', 'deloitte']

In [21]:
# length of the shortest company name
min(map(len, companies))

2

In [22]:
# length of the longest company name
max(map(len, companies))

20

In [23]:
# average company-name length in characters
sum(map(len, companies)) / len(companies)

8.524261829377643

In [24]:
# Symbol table: '.' goes first (index 0), followed by every distinct character
# that occurs in the company names, in sorted order.
alphabet = ['.'] + sorted({ch for name in companies for ch in name})
num_letters = len(alphabet)
num_letters

27

In [25]:
import itertools
# Every ordered pair of symbols from the alphabet, joined into a two-character string.
combinations = [first + second for first, second in itertools.product(alphabet, repeat=2)]
len(combinations)

729

In [26]:
# Index <-> symbol lookups for single characters.
inttostr = dict(enumerate(alphabet))
strtoint = {symbol: index for index, symbol in inttostr.items()}

# The same pair of lookups for two-character strings.
inttostr_bi = dict(enumerate(combinations))
strtoint_bi = {pair: index for index, pair in inttostr_bi.items()}

# Trigram Model

In [48]:
import tqdm

In [51]:
# Count matrix: one row per two-character string, one column per single character.
E = torch.zeros(len(combinations), len(alphabet), dtype=torch.int32)

In [52]:
# Count, for every two-character context, how often each character follows it.
for word in tqdm.tqdm(companies):
  # Two leading '.' give the first real character a full two-character context;
  # the trailing '.' lets the end of the word be counted as a "next character".
  word = ['.', '.'] + list(word) + ['.']
  for ch1, ch2, ch3 in zip(word, word[1:], word[2:]):
    # row = index of the context pair, column = index of the following character
    E[strtoint_bi[ch1+ch2], strtoint[ch3]] += 1

100%|██████████| 126021/126021 [00:21<00:00, 5803.91it/s]


In [54]:
# Add-one smoothing on the counts, then scale every row so that it sums to 1.
P = (E + 1).to(torch.float32)
P = P / P.sum(dim=1, keepdim=True)

In [57]:
P.shape

torch.Size([729, 27])

In [59]:
# sanity check: after the row-wise normalisation a row of P should sum to 1
P[0].sum()

tensor(1.)

In [60]:
# Average negative log-likelihood that the table P assigns to the data,
# over the entire dataset (or a single string: swap in the commented loop header).

n = 0
log_likelihood = 0

# for name in ["openai"]:
for name in companies:
  padded = ['.', '.'] + list(name) + ['.']
  for pos in range(len(padded) - 2):
    context_ix = strtoint_bi[padded[pos] + padded[pos + 1]]
    target_ix = strtoint[padded[pos + 2]]
    log_likelihood += torch.log(P[context_ix, target_ix])
    n += 1

print(f"{log_likelihood=}")
neg_logl = -log_likelihood
print(f"{neg_logl=}")
loss = neg_logl/n
print(f"{loss=}")

log_likelihood=tensor(-2968869.2500)
neg_logl=tensor(2968869.2500)
loss=tensor(2.4735)


In [62]:
# Draw 20 names from P, one character at a time, with a fixed-seed generator.
g = torch.Generator().manual_seed(10110609)

for _ in range(20):
    context = '..'
    chars = []

    while True:
        # distribution over the next character given the last two
        row = P[strtoint_bi[context]]
        sampled = torch.multinomial(row, num_samples=1, replacement=True, generator=g)
        ch = inttostr[sampled.item()]

        # '.' ends the name
        if ch == '.':
            break

        chars.append(ch)
        # slide the two-character window forward by one
        context = context[1] + ch

    print(''.join(chars))

news
maibervemacer
ron
tegoft
cointsouseals
tv
xtioudisonvismadvienb
clestenwedon
lorly
chrce
derfamealm
perntrappyressegroosayrogink
smitic
clivareptoskel
synanue
myduce
waysitythinne
frionessaw
aumeeducar
ica


# Neural Net

In [27]:
def build_dataset(words):
  """Turn words into (context, target) index tensors.

  Each word is padded with two leading '.' and one trailing '.'. Every run of
  three consecutive characters then gives one example: the first two characters,
  concatenated, are looked up in `strtoint_bi`; the third is looked up in
  `strtoint`.

  Args:
    words: iterable of strings.

  Returns:
    (xs, ys): two 1-D int64 tensors of equal length holding the context indices
    and the target indices. Both are empty (but still int64) when `words`
    yields no examples.

  Raises:
    KeyError: if a character (or character pair) is missing from the lookups.
  """
  xs, ys = [], []
  for word in words:
    chars = ['.', '.'] + list(word) + ['.']
    for ch1, ch2, ch3 in zip(chars, chars[1:], chars[2:]):
      xs.append(strtoint_bi[ch1 + ch2])
      ys.append(strtoint[ch3])

  # Pin the dtype: torch.tensor([]) would otherwise default to float32, which
  # cannot be used as indices or class targets.
  xs = torch.tensor(xs, dtype=torch.long)
  ys = torch.tensor(ys, dtype=torch.long)

  return xs, ys

In [28]:
# splitting data into train, dev and test sets
import random

# Shuffle a copy with a dedicated, seeded RNG: the split is then identical on
# every run, re-running this cell gives the same result, and `companies` keeps
# its original order for the other cells that read it.
shuffled_companies = list(companies)
random.Random(10110609).shuffle(shuffled_companies)

n1 = int(len(shuffled_companies) * 0.8)  # first 80% of the names -> train
n2 = int(len(shuffled_companies) * 0.9)  # next 10% -> dev, remainder -> test

X_train, y_train = build_dataset(shuffled_companies[:n1])
X_dev, y_dev = build_dataset(shuffled_companies[n1:n2])
X_test, y_test = build_dataset(shuffled_companies[n2:])

# number of training examples (character positions, not names)
train_size = X_train.nelement()

len(X_train), len(X_dev), len(X_test)

(959832, 120091, 120334)

In [29]:
# Fixed-seed generator; it must be passed to randn, otherwise the global RNG is
# used and the initial weights differ from run to run.
seed = torch.Generator().manual_seed(10110609)
# Weight matrix of shape (num_letters * num_letters, num_letters).
W = torch.randn((num_letters*num_letters, num_letters), generator=seed, requires_grad=True)

In [30]:
W.shape

torch.Size([729, 27])

In [31]:
import torch.nn.functional as F

In [28]:
print(torch.cuda.is_available())

False


In [None]:
# xenc @ W
# (ts, 729) @ (729, 27) => (ts, 27)

In [21]:
# forward propagation
# Multiplying a one-hot encoding of X_train by W only selects rows of W, so
# index W directly: same logits, without materialising a float matrix of shape
# (train_size, num_letters * num_letters).
logits = W[X_train]
# cross_entropy applies log-softmax to the logits and averages the negative
# log-probability of the targets, without an explicit exp() that could overflow.
loss = F.cross_entropy(logits, y_train)

In [63]:
loss.item()

3.5262298583984375

In [64]:
# backward propagation
# clear any gradient left from a previous pass so backward() does not add to it
W.grad = None
loss.backward()

In [65]:
# update the parameters
# One plain gradient-descent step. The in-place update runs under no_grad so
# autograd does not record it; this replaces mutating `W.data` directly.
learning_rate = 150
with torch.no_grad():
    W -= learning_rate * W.grad

In [66]:
# 3.755063533782959
# 3.754981517791748
# 3.754899740219116
# 3.7548177242279053
# 3.715470552444458

In [None]:
# Loss of the count-based trigram model (with +1 smoothing), measured on the full dataset: 2.4735
# The neural net should approach this value by the end of training.