In [None]:
# We are going to make a new name generator =) We will start with a list of names and create an autoregressive character-level language model that generates names that sound like
# the names we are given.

# Start by importing PyTorch and matplotlib, and place the names.txt file in the same folder as this notebook.
import torch
import matplotlib.pyplot as plt
# Load the dataset: names.txt is read in full and split into one entry per line.
# The `with` block closes the file handle as soon as the read finishes instead of
# leaving it open until garbage collection.
with open('names.txt', 'r') as names_file:
    words = names_file.read().splitlines()

In [None]:
# Peek at the first ten entries of `words` as a quick sanity check of the load.
words[:10]

In [None]:
# Total number of entries in `words` (one per line of names.txt).
len(words)

In [None]:
# Length of the shortest name in `words`.
min(map(len, words))

In [None]:
# Length of the longest name in `words`.
max(map(len, words))

In [None]:
# Gather bigram statistics for the whole dataset in a plain dictionary `b`.
# Each name is wrapped in a start token '<S>' and an end token '<E>'; every pair of
# adjacent tokens becomes a key, and its value counts how many times that pair occurs.
b = dict()
for w in words:
    chs = ['<S>', *w, '<E>']
    # Pair each token with its successor: (chs[0], chs[1]), (chs[1], chs[2]), ...
    for bigram in zip(chs[:-1], chs[1:]):
        b[bigram] = 1 + b.get(bigram, 0)

In [None]:
# Bigrams ranked from most to least frequent. With the author's names.txt, an 'n' followed by
# the end of the name is the most common pair, and an 'a' followed by an 'n' is the third most common.
sorted(b.items(), key=lambda item: item[1], reverse=True)

In [None]:
# PyTorch works with tensors rather than dictionaries, so the bigram counts are kept in a
# 2-D integer tensor that starts out as all zeros.
# NOTE(review): 27 is presumably 26 letters plus one boundary token; it must match the vocabulary size.
N = torch.zeros(27, 27, dtype=torch.int32)

In [None]:
# Build the vocabulary: join every name into one long string, collapse it to its unique
# characters with set(), and sort them. The result is the alphabet, stored in `chars`.
#
# stoi maps each character to an integer index (the characters of `chars` start at 1).
# itos is the inverse of stoi, mapping each index back to its character.
# Instead of separate start and end tokens, the single character '.' (index 0) marks both
# the beginning and the end of a name.

chars = sorted(set(''.join(words)))
stoi = dict(zip(chars, range(1, len(chars) + 1)))
stoi['.'] = 0
itos = {idx: ch for ch, idx in stoi.items()}

In [None]:
# Count every bigram of every name into N, using '.' as both the start and the end marker.
# N is cleared first so that re-running this cell does not add the same counts a second time.
N.zero_()
for w in words:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2 in zip(chs, chs[1:]):
        # Row index = first character of the pair, column index = the character that follows it.
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        N[ix1, ix2] += 1

In [None]:
# Draw the bigram count matrix N as a heatmap and annotate every cell with its
# two-character bigram and its count. Both labels share the same anchor point and are
# kept apart by their vertical alignment (va="bottom" vs. va="top").
plt.figure(figsize=(16,16))
plt.imshow(N, cmap='Blues')
# Take the grid size from N itself rather than repeating the literal 27.
n_rows, n_cols = N.shape
for i in range(n_rows):
    for j in range(n_cols):
        chstr = itos[i] + itos[j]
        plt.text(j, i, chstr, ha="center", va="bottom", color='gray')
        plt.text(j, i, N[i,j].item(), ha="center", va="top", color='gray')
# The trailing semicolon keeps the notebook from echoing the return value of plt.axis().
plt.axis('off');