In [None]:
import numpy
import pandas as pd
import torch
import itertools
import random

In [None]:
names = open('Indian_Names.csv', 'r').read().split()

In [None]:
len(names)
min(len(i) for i in names)
max(len(i) for i in names)

In [None]:
# Let's try to do this bigram stuff by ourselves first --> Only look at letter pairs and do a lookup.
# Step 1: In the names dataset, get a frequency ordered list of all possible 2 letter combos
substring_dict = {}

for name in names:
    for i in range(len(name)-1):
        name_substring = name[i]+name[i+1]
        if name_substring not in substring_dict:
            substring_dict[name_substring]  = 1
        else:
            substring_dict[name_substring] += 1

print(substring_dict)


#Step 2: Let's try doing this w/o a neural net. We'll take a seed letter, a determined word length and 
# keep adding letters based on weighted probability

min_name_length = min(len(i) for i in names)
max_name_length = max(len(i) for i in names)
seedstring = 'abcdefghijklmnopqrstuvwxyz'

#Randomised inputs
seed_letter = seedstring[random.randint(0,len(seedstring)-1)]
word_length = random.randint(min_name_length, max_name_length)


#Rather than a simple substring and frequency count, it would make sense to break the dictionary into further pieces like : 
# columns: first letter | next letter | frequency | normalized probability 
optimized_substring_dict = {}
for bigram_str, count in substring_dict.items():
    first_letter = bigram_str[0]
    next_letter = bigram_str[1]

    if first_letter not in optimized_substring_dict:
        optimized_substring_dict[first_letter] = []
    
    optimized_substring_dict[first_letter].append((next_letter, count))

for first_letter in optimized_substring_dict:
    total_count = sum(count for _, count in optimized_substring_dict[first_letter])

    optimized_substring_dict[first_letter] = [
        (next_letter, count, count/total_count)
        for next_letter, count in optimized_substring_dict[first_letter]
    ]

print(optimized_substring_dict['a'])

#Function to generate a name given a seed letter, a word length and a target dictionary
def generate_name(seed_letter, word_length, target_dictionary):
    name = seed_letter
    current_letter = seed_letter

    for i in range (word_length-1):
        if current_letter not in target_dictionary:
            break

        next_options = target_dictionary[current_letter]

        letters = [option[0] for option in next_options]
        probabilities = [option[2] for option in next_options]

        next_letter = random.choices(letters, weights = probabilities)[0]

        name += next_letter
        current_letter = next_letter
    
    return name

generate_name(seed_letter, word_length, optimized_substring_dict)

        
for _ in range(10):
    seed_letter = seedstring[random.randint(0,len(seedstring)-1)]
    word_length = random.randint(min_name_length, max_name_length)
    name = generate_name(seed_letter, word_length, optimized_substring_dict)
    print(name)

In [None]:
#Lmao that didn't work too great. So let's now look at how Andrej does it. 
# Difference 1: for bigrams, he's added a start and end token for each name.

bigram_dictionary = {}

for name in names:
    #Add in start and end characters for each name
    full_charlist = ['<S>'] + list(name) + ['<E>']
    for char1, char2 in zip(full_charlist, full_charlist[1:]):
        bigram = (char1, char2)
        bigram_dictionary[bigram] = bigram_dictionary.get(bigram, 0) + 1
        print (char1, char2)

In [None]:
# sorted(bigram_dictionary.items(), key = lambda kv: -kv[1])

In [None]:
#Better to handle this in pytorch than dictionaries
#There are 28x28 potential bigrams (26 letters + 2 special start/end chars)


#Let's get the list of all chars in the bigram_dictionary
import torch
names_array = torch.zeros((27,27), dtype=torch.int32)

character_list = sorted(list(set(''.join(names))))
string_to_int = []
string_to_int = {s:i+1 for i, s in enumerate(character_list)}
string_to_int['.']=0

print(string_to_int)

for name in names:
    chars = ['.']+list(name)+['.']
    for char1, char2 in zip(chars, chars[1:]):
        index1 = string_to_int[char1]
        index2 = string_to_int[char2]
        names_array[index1, index2] += 1

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

plt.imshow(names_array)
plt.show()

In [None]:
#Want a better visualization? Don't really need it but let's go I guess lol

#string_to_int becomes int_to_string here

int_to_string = {i:s for s,i in string_to_int.items()}
int_to_string

%matplotlib inline
plt.figure(figsize = (16,16))
plt.imshow(names_array, cmap = 'Reds')
for i in range(27):
    for j in range(27):
        chstr = int_to_string[i] + int_to_string[j]
        plt.text(j, i, chstr, ha = 'center', va='bottom', color='gray')
        plt.text(j,i, names_array[i,j].item(), ha = 'center', va='top', color='gray')
plt.axis('off')



In [None]:
names_array[0]

In [280]:
generated_name = []
ix = 0
generation = torch.Generator().manual_seed(random.randint(0,991934359))

#Now we have it normalized. In our method, we used random.choice() using probabilities. Pytorch has multinomial
#rather than recalculating and renormalizing the arrays everytime, let's create a full array holding all this at once. (check the commented lines)

Final_prob_array = (names_array+1).float()
Final_prob_array /= Final_prob_array.sum(1, keepdim=True)


for i in range(10):
    while True:
        probability_array = Final_prob_array[ix]    
        ix = torch.multinomial(probability_array, num_samples=1, replacement=True, generator=generation).item()
        generated_name.append(int_to_string[ix])
        if ix == 0:
            break

    print(''.join(generated_name[:-1]))
    generated_name = []

sh
jelohatoegirelapa
pha
ellele
den
m
desen
ie
mawisopeeeni
rllude


In [None]:
# Let's now try to evaluate the quality of this "model". How can we take our data and define a loss function to optimize on?

#Actually could be fun to take the 27x27 as our entire neural network and sort of try to make it overfit our data. 
# Say we start with these as our biases. 
# But what about the weights?


# Measuring quality

# If we picked totally at random, the chance of any bigram pair should be 1/27 ~ 0.037
# But as we can see, the model is assigning higher than 0.037 probability to many pairs in the training set, but not all.
# Ideally, in a perfect world, the model would assign 1 to every bigram pair shown in the training dataset.
# If we were to take one number to measure the quality of this model - from statistical theory - it's called likelihood
# Likelihood is the product of all these probabilities. It should be as high as possible. But these numbers make it unwieldy
# So we use log likelihood. But log likelihood here is negative, and the closer to zero it is the better for us. 
# So we'll invert it, and for a loss function we use the average of the negative log likelihood

log_likelihood = 0.0
n=0
for name in names:
    chars = ['.']+list(name)+['.']
    for char1, char2 in zip(chars, chars[1:]):
        index1 = string_to_int[char1]
        index2 = string_to_int[char2]
        prob = Final_prob_array[index1, index2]
        log_prob = torch.log(prob)
        log_likelihood += log_prob
        n += 1
        print(f'{char1}{char2}: {prob:.3f} {log_prob:.3f}')

print(f'{log_likelihood=}')
negative_log_likelihood = -log_likelihood
print(f'{negative_log_likelihood = }')
avg_nll = negative_log_likelihood/n
print(f'{avg_nll:.3f}')

#you can just enter a name here and check. my name was around 2.3, so not too bad. within distribution types
#If you enter nonsensical stuff like 'ahkqxsd', the loss becomes infinity since qx isn't in the distribution
# To clean this, people do model smoothing, viz. add some small number like 1 to all the occurrences across the bigram frequency distribution
# The more you add, the "smoother" your model gets, obviously.


# Doing this with neural nets 

In [263]:
# We now want to do this with neural nets. We know the training data - bigrams from our examples.
# Let's create the training data (x,y)

xs, ys = [], []

for name in names:
    chars = ['.'] + list(name) + ['.']
    for char1, char2 in zip(chars, chars[1:]):
        index1 = string_to_int[char1]
        index2 = string_to_int[char2]
        #print(char1, char2)
        xs.append(index1)
        ys.append(index2)

xs = torch.tensor(xs)
ys = torch.tensor(ys)
num = xs.nelement()
print('number of examples', num)
#xs is our preceding letter in the bigram and ys is the succeeding letter. So our NN should know that following 1: 1,2,14 should have high probabilities
#Feeding in integer values into an NN is done via one hot encoding

import torch.nn.functional as F
x_enc = F.one_hot(xs, num_classes=27).float()
#print(x_enc)

#Remember to cast one hot encoding of xs to float32

number of examples 253317


In [None]:
#Let's initialize randomized weights
W = torch.randn((27,27))
x_enc @ W

#x_enc @ W is the matrix multiplication in pytorch
print(f'W.shape is {W.shape}, x_enc.shape is {x_enc.shape}')

# x_enc @ W is 6, 27 * 27,27 = 6,27 matrix

tensor([[-2.1249e-01,  1.4354e-01, -7.2369e-01,  ..., -2.0935e-01,
          6.0515e-01, -8.6556e-01],
        [ 3.2015e-01,  3.0683e-02, -9.9661e-01,  ..., -4.3654e-02,
         -1.1543e+00, -1.4847e+00],
        [ 3.2015e-01,  3.0683e-02, -9.9661e-01,  ..., -4.3654e-02,
         -1.1543e+00, -1.4847e+00],
        ...,
        [ 3.6177e-01, -9.7197e-01,  1.0948e-01,  ..., -1.7869e-03,
         -7.5519e-01,  1.1843e+00],
        [-3.0267e-01,  1.6177e+00, -1.0565e+00,  ...,  9.3357e-01,
         -1.1384e+00,  1.4911e+00],
        [ 7.4620e-01, -6.9121e-01, -1.9286e+00,  ..., -8.5729e-01,
         -3.8481e-01, -8.2572e-02]])

In [264]:
nlls = torch.zeros(6)

for i in range(6):
    x = xs[i].item()
    y = ys[i].item()
    print('---------------------------------')
    print(f'Bigram example {i+1}: {int_to_string[x]}{int_to_string[y]} (indexes {x}, {y})')
    print(f'input to the neural net: {x}')
    print(f'Output probabilities from the neural net: {probs[i]}')
    print(f'Label: Actual next character: {y}')
    p = probs[i,y]
    print(f'Probability assigned by the neural net to the actual next character: {p.item()}')
    logp = torch.log(p)
    print(f'Log likelihood: {logp.item()}')
    nll = -logp
    print(f'Negative log likelihood: {nll.item()}')
    nlls[i] = nll

    print('====================')
    print('average negative log likelihood, i.e  loss = ',nlls.mean().item())

---------------------------------
Bigram example 1: .a (indexes 0, 1)
input to the neural net: 0
Output probabilities from the neural net: tensor([0.0221, 0.0315, 0.0132, 0.0483, 0.1499, 0.0171, 0.1027, 0.0530, 0.0614,
        0.0098, 0.0191, 0.0056, 0.0421, 0.0454, 0.0647, 0.0091, 0.0100, 0.0113,
        0.0322, 0.0191, 0.0483, 0.0257, 0.0515, 0.0233, 0.0221, 0.0500, 0.0115])
Label: Actual next character: 1
Probability assigned by the neural net to the actual next character: 0.03151710703969002
Log likelihood: -3.4572248458862305
Negative log likelihood: 3.4572248458862305
average negative log likelihood, i.e  loss =  0.5762041211128235
---------------------------------
Bigram example 2: aa (indexes 1, 1)
input to the neural net: 1
Output probabilities from the neural net: tensor([0.0381, 0.0285, 0.0102, 0.1508, 0.0055, 0.0577, 0.0089, 0.0484, 0.0197,
        0.0173, 0.1361, 0.0250, 0.0114, 0.0644, 0.0453, 0.0256, 0.0350, 0.0720,
        0.0289, 0.0104, 0.0041, 0.0395, 0.0037, 0.0719,

In [271]:
#randomly initialize  27 neurons weights. each neuron receives 27 inputs
generation = torch.Generator().manual_seed(214782235)
W = torch.randn((27, 27), generator = generation, requires_grad=True)

for k in range(400):
    # forward pass
    xenc = F.one_hot(xs, num_classes=27).float()
    logits = xenc @ W
    counts = logits.exp()
    probs = counts/counts.sum(1, keepdims=True)
    #Loss
    loss_funk = -probs[torch.arange(num), ys].log().mean()
    print(loss_funk.item())

    #Backward pass
    W.grad = None
    loss_funk.backward()

    #Parameter update
    W.data += -50*W.grad


3.902346134185791
3.4767067432403564
3.25004506111145
3.10648250579834
3.001192092895508
2.9214439392089844
2.8598499298095703
2.811532974243164
2.7730560302734375
2.741954803466797
2.716430187225342
2.6951582431793213
2.6771771907806396
2.661797523498535
2.6485068798065186
2.6369099617004395
2.6266942024230957
2.6176159381866455
2.609485626220703
2.602159023284912
2.595524549484253
2.589493989944458
2.583996057510376
2.5789716243743896
2.574369430541992
2.5701465606689453
2.5662639141082764
2.562687397003174
2.55938720703125
2.5563361644744873
2.5535106658935547
2.550889015197754
2.5484516620635986
2.5461816787719727
2.5440633296966553
2.5420825481414795
2.5402274131774902
2.5384867191314697
2.5368499755859375
2.535308837890625
2.5338544845581055
2.5324807167053223
2.5311803817749023
2.5299479961395264
2.528777837753296
2.527665615081787
2.5266077518463135
2.525599241256714
2.524637460708618
2.5237185955047607
2.5228402614593506
2.5219995975494385
2.5211946964263916
2.5204226970672607

In [None]:
#Final check. Let's sample the neural net model as well.

for i in range(10):
    out = []
    index = 0
    while True:
        xenc = F.one_hot(torch.tensor([index]), num_classes = 27).float()
        logits = xenc @ W
        counts = logits.exp()

        p = counts/counts.sum(1, keepdim=True)

        index = torch.multinomial(p, num_samples=1, replacement=True, generator=generation).item()
        out.append(int_to_string[index])

        if index == 0:
            break
    print(''.join(out))

    #Seems good to go. Equally nonsensical name outputs.

araida.
stranomizacharajhanamano.
jire.
diesqud.
ndryi.
rlaniriavquiyaya.
deniraliaba.
z.
juanai.
jrlaitviana.
