In [17]:
import itertools
import random
import pickle
import tqdm

import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn.functional as F

%matplotlib inline

In [2]:
companies_df = pd.read_csv(
  "../data/cleansed_layer/companies_usa_size_over_10.csv", usecols=["name"]
)

companies = companies_df.name.to_list()

In [3]:
print("5 examples: ", companies[:5])
print(f"Length: {len(companies)}")
print(f"Max length: {max([len(c) for c in companies])}")
print(f"Min length: {min([len(c) for c in companies])}")
avg_len = sum([len(c) for c in companies]) / len(companies)
print(f"Avg length: {avg_len}")

5 examples:  ['equinoxys', 'biassync', 'taggpay', 'touchpointe', 'rxfit']
Length: 91073
Max length: 16
Min length: 3
Avg length: 8.164823822647767


In [4]:
alphabet = sorted(set("".join(companies)))
alphabet.insert(0, '.')
len_alphabet = len(alphabet)
print(f"{len_alphabet=}")

combinations = list(itertools.product(alphabet, repeat=2))
combinations = [''.join(comb) for comb in combinations]
print(f"{len(combinations)=}")

strtoint = {j: i for i, j in enumerate(alphabet)}
inttostr = {i: j for i, j in enumerate(alphabet)}

strtoint_bi = {j: i for i, j in enumerate(combinations)}
inttostr_bi = {i: j for i, j in enumerate(combinations)}

len_alphabet=27
len(combinations)=729


In [5]:
print(torch.cuda.is_available())

True


In [6]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda


In [1]:
# CUDA was enable on the kaggle kernel

In [7]:
def build_dataset(words):
  xs, ys = [], []
  for word in words:
    word = ['.', '.'] + list(word) + ['.']
    for ch1, ch2, ch3 in zip(word, word[1:], word[2:]):
      ix1 = strtoint_bi[ch1+ch2]
      ix2 = strtoint[ch3]
      xs.append(ix1)
      ys.append(ix2)
      
  xs = torch.tensor(xs, device=device)
  ys = torch.tensor(ys, device=device)

  return xs, ys

In [8]:
# splitting data into train, dev and test sets
random.seed(10110609)
random.shuffle(companies)
print("5 Example after shuffling: ", companies[:5])

n1 = int(len(companies) * 0.8)
n2 = int(len(companies) * 0.9)

X_train, y_train = build_dataset(companies[:n1])
X_dev, y_dev = build_dataset(companies[n1:n2])
X_test, y_test = build_dataset(companies[n2:])

train_size = X_train.nelement()
test_size = X_test.nelement()
dev_size = X_dev.nelement()

len(X_train), len(X_dev), len(X_test)

5 Example after shuffling:  ['managedoffice', 'idmatrixindia', 'sightmd', 'popshelf', 'jaroop']


(668185, 83202, 83281)

In [9]:
torch.cuda.device(0)

<torch.cuda.device at 0x7dee148e7d30>

In [10]:
torch.cuda.empty_cache()

In [11]:
seed = torch.Generator().manual_seed(10110609)
W = torch.randn((len_alphabet*len_alphabet, len_alphabet), requires_grad=True, device=device)

In [None]:
W.shape

In [None]:
# xenc @ W
# (ts, 729) @ (729, 27) => (ts, 27)

In [None]:
# forward propogation
xenc = F.one_hot(X_train, num_classes=len_alphabet*len_alphabet).float().to(device)
logits = xenc @ W
counts = logits.exp()
probs = counts / counts.sum(1, keepdims=True)
loss = -probs[torch.arange(train_size, device=device), y_train].log().mean()

In [None]:
loss.item()

In [None]:
# backward propogation
W.grad = None
loss.backward()

In [None]:
# update the parameters
W.data += -150 * W.grad

In [None]:
# 3.755063533782959
# 3.754981517791748
# 3.754899740219116
# 3.7548177242279053
# 3.715470552444458

In [None]:
# loss=tensor(2.4969) for trigram model
# I expect to see this number by the end of neural net training

In [12]:
def evaluate(X, y, size):
    xenc = F.one_hot(X, num_classes=len_alphabet*len_alphabet).float()
    logits = xenc @ W
    counts = logits.exp()
    probs = counts / counts.sum(1, keepdims=True)
    loss = -probs[torch.arange(size), y].log().mean() + 0.01*(W**2).mean() # last part is regularization
    return loss

In [13]:
seed = torch.Generator().manual_seed(10110609)
W = torch.randn((len_alphabet*len_alphabet, len_alphabet), requires_grad=True, device=device)

In [14]:
step = 0
lossi, idxs = [], []

In [15]:
xenc.shape

NameError: name 'xenc' is not defined

In [16]:
for i in range(1000):
    
    # forward propogation
    xenc = F.one_hot(X_train, num_classes=len_alphabet*len_alphabet).float()
    logits = xenc @ W
    counts = logits.exp()
    probs = counts / counts.sum(1, keepdims=True)
    loss = -probs[torch.arange(train_size), y_train].log().mean() + 0.01*(W**2).mean() # last part is regularization
    
    # evalute loss on the dev set
    loss_dev = evaluate(X_dev, y_dev, dev_size)
    
    if i % 100 == 0:
        print(f"{i} / train: {loss.item()} / dev: {loss_dev.item()}")
    
    # backward propogation
    W.grad = None
    loss.backward()

    # update the parameters
    W.data += -10 * W.grad
    
    # track params
    lossi.append(loss.item())
    idxs.append(step)
    step += 1

0 / train: 3.784008026123047 / dev: 3.7823433876037598
100 / train: 3.3226256370544434 / dev: 3.3217177391052246
200 / train: 3.1018824577331543 / dev: 3.1018524169921875
300 / train: 2.9679830074310303 / dev: 2.9687745571136475
400 / train: 2.8798508644104004 / dev: 2.881415367126465
500 / train: 2.8184566497802734 / dev: 2.820681095123291
600 / train: 2.7737903594970703 / dev: 2.7765755653381348
700 / train: 2.7400474548339844 / dev: 2.7433245182037354
800 / train: 2.7136728763580322 / dev: 2.717388391494751
900 / train: 2.6924641132354736 / dev: 2.696577310562134


In [None]:
# 6000 with -50
# 2000 with -10 lr decay

In [None]:
# Trigram probability based model metrics:

# Evaluation on the train set: 
# log_likelihood=tensor(-1656800.5000)
# neg_logl=tensor(1656800.5000)
# loss=tensor(2.4796)

# Evaluation on the dev set: 
# log_likelihood=tensor(-207735.1562)
# neg_logl=tensor(207735.1562)
# loss=tensor(2.4968)

# Evaluation on the test set: 
# log_likelihood=tensor(-208261.5000)
# neg_logl=tensor(208261.5000)
# loss=tensor(2.5007)

In [None]:
plt.plot(idxs, lossi);

## Model parameter tuning (regularization)

In [12]:
losses = []
Wreg = []
regi = [0, 0.0001, 0.001, 0.01, 0.1, 1.0, 10.0]

In [13]:
for reg in regi:
    
    step = 0
    lossi, idxs = [], []
    
    # Initialize the network
    seed = torch.Generator().manual_seed(10110609)
    W = torch.randn((len_alphabet*len_alphabet, len_alphabet), requires_grad=True, device=device)
    print(f"\nInitialized the network, {reg=}")
    
    for i in range(7000):

        # forward propogation
        xenc = F.one_hot(X_train, num_classes=len_alphabet*len_alphabet).float()
        logits = xenc @ W
        counts = logits.exp()
        probs = counts / counts.sum(1, keepdims=True)
        loss = -probs[torch.arange(train_size), y_train].log().mean() + reg*(W**2).mean() # last part is regularization

        # evalute loss on the dev set
        loss_dev = evaluate(X_dev, y_dev, dev_size)

        if i % 200 == 0:
            print(f"{i} / train: {loss.item()} / dev: {loss_dev.item()}")

        # backward propogation
        W.grad = None
        loss.backward()

        # update the parameters
        lr = 50 if i < 5000 else 10 # learning rate decay
        W.data += -lr * W.grad

        # track params
        lossi.append(loss.item())
        idxs.append(step)
        step += 1
        
    losses.append([idxs, lossi])
    Wreg.append(W)


Initialized the network, reg=0
0 / train: 3.816410541534424 / dev: 3.826584577560425
200 / train: 2.6678175926208496 / dev: 2.682584762573242
400 / train: 2.5806243419647217 / dev: 2.5983898639678955
600 / train: 2.547790765762329 / dev: 2.567223310470581
800 / train: 2.530346632003784 / dev: 2.5510334968566895
1000 / train: 2.519455671310425 / dev: 2.541217803955078
1200 / train: 2.512000322341919 / dev: 2.5347230434417725
1400 / train: 2.5065770149230957 / dev: 2.5301685333251953
1600 / train: 2.50246000289917 / dev: 2.5268375873565674
1800 / train: 2.499232530593872 / dev: 2.524322032928467
2000 / train: 2.4966375827789307 / dev: 2.5223758220672607
2200 / train: 2.494507074356079 / dev: 2.5208399295806885
2400 / train: 2.492727279663086 / dev: 2.5196094512939453
2600 / train: 2.491218328475952 / dev: 2.518611192703247
2800 / train: 2.4899227619171143 / dev: 2.517793893814087
3000 / train: 2.4887993335723877 / dev: 2.517120122909546
3200 / train: 2.487816095352173 / dev: 2.516562461

## Export the Models

In [None]:
# export the loss metrics
with open("losses.pkl", "wb") as file:
    pickle.dump(losses, file)

In [None]:
with open("Wreg.pkl", "wb") as file:
    pickle.dump(Wreg, file)

## Import the model and metrics

In [None]:
with open("losses.pkl", "rb") as file:
  losses = pickle.load(file)
  
with open("Wreg.pkl", "rb") as file:
  Wreg = pickle.load(file)