In [20]:
# This implementation follows "WaveNet: A Generative Model for Raw Audio" (van den Oord et al., 2016).

In [19]:
import torch
import torch.nn.functional as F
import pandas as pd
import matplotlib.pyplot as plt

import pickle
import random
# Seed the stdlib RNG so that the random.shuffle of the company list is repeatable across runs.
random.seed(10110609)

%matplotlib inline

In [3]:
# Read only the "name" column of the cleansed company table.
companies_df = pd.read_csv(
  "../data/cleansed_layer/companies_usa_size_over_10.csv",
  usecols=["name"],
)

# Plain Python list of the names, used by the character-level steps that follow.
companies = companies_df["name"].to_list()

In [4]:
# Character vocabulary: every distinct character seen in the names, sorted,
# with '.' reserved at index 0 (build_dataset pads contexts with index 0 and
# appends '.' to every name).
# '.' is removed from the observed set before being prepended so it can never
# appear twice; a duplicate would make strtoint['.'] point at a non-zero index
# and leave strtoint and inttostr out of sync.
alphabet = ['.'] + sorted(set("".join(companies)) - {'.'})
len_alphabet = len(alphabet)
print(f"{len_alphabet=}")

# Lookup tables in both directions: character -> index and index -> character.
strtoint = {ch: ix for ix, ch in enumerate(alphabet)}
inttostr = dict(enumerate(alphabet))

len_alphabet=27


In [5]:
# Quick sanity checks on the name list: a sample, the count, and length statistics.
print("5 examples: ", companies[:5])
print(f"Length: {len(companies)}")
print(f"Max length: {max(map(len, companies))}")
print(f"Min length: {min(map(len, companies))}")
avg_len = sum(map(len, companies)) / len(companies)
print(f"Avg length: {avg_len}")

5 examples:  ['equinoxys', 'biassync', 'taggpay', 'touchpointe', 'rxfit']
Length: 91073
Max length: 16
Min length: 3
Avg length: 8.164823822647767


In [6]:
# In-place shuffle (uses the stdlib RNG seeded in the imports cell) so that the
# contiguous train/dev/test slices taken later are not tied to the CSV row order.
random.shuffle(companies)

In [7]:
# build the dataset
block_size = 8 # context length: how many characters do we take to predict the next one?

def build_dataset(words, context_len=None, char_to_ix=None):
  """Turn a list of words into (context, next-character) training pairs.

  Each word is extended with the '.' terminator and scanned left to right.
  For every character, the indices of the `context_len` characters before it
  (left-padded with index 0 at the start of a word) form one row of X, and
  the character's own index is the matching entry of Y.

  Args:
    words: iterable of strings; every character must be a key of the mapping.
    context_len: number of preceding characters per example. Defaults to the
      notebook-level `block_size`.
    char_to_ix: character -> integer index mapping that includes '.'.
      Defaults to the notebook-level `strtoint`.

  Returns:
    (X, Y): int64 tensors of shape (N, context_len) and (N,), where N is the
    total number of characters over all words, counting one terminator each.

  Raises:
    ValueError: if the context length is smaller than 1.
    KeyError: if a character is missing from the mapping.
  """
  # Fall back to the notebook globals only when no explicit value is passed,
  # so existing calls build_dataset(words) keep working unchanged.
  n_ctx = block_size if context_len is None else context_len
  stoi = strtoint if char_to_ix is None else char_to_ix
  if n_ctx < 1:
    raise ValueError(f"context length must be >= 1, got {n_ctx}")

  contexts, targets = [], []
  for w in words:
    context = [0] * n_ctx
    for ch in w + '.':
      ix = stoi[ch]
      contexts.append(context)
      targets.append(ix)
      # crop and append; this builds a new list, so rows already stored are not mutated
      context = context[1:] + [ix]

  # Explicit dtype and shape: for empty input torch.tensor([]) alone would
  # produce a float tensor of shape (0,) instead of an int64 (0, n_ctx) one.
  X = torch.tensor(contexts, dtype=torch.long).reshape(len(targets), n_ctx)
  Y = torch.tensor(targets, dtype=torch.long)
  print(X.shape, Y.shape)
  return X, Y

# Split points for an 80% / 10% / 10% train / dev / test partition of the name list.
n1, n2 = (int(frac * len(companies)) for frac in (0.8, 0.9))

# Build the three datasets in train, dev, test order (each call prints its tensor shapes).
Xtr, Ytr = build_dataset(companies[:n1])
Xdev, Ydev = build_dataset(companies[n1:n2])
Xte, Yte = build_dataset(companies[n2:])

torch.Size([668185, 8]) torch.Size([668185])
torch.Size([83202, 8]) torch.Size([83202])
torch.Size([83281, 8]) torch.Size([83281])
