In [110]:
words = open('names.txt', 'r').read().splitlines()

## Bigram probability approach

Creating bigrams - two letter objects capturing what letter is likely to follow another
adding a start and an end characters to capture what letter to start with and with which letter the word is likely to end the word

In [111]:
b={}
for w in words[:1]:
    chs = ['<S>'] + list(w) + ['<E>']
    for ch1, ch2 in zip(chs,chs[:1]):
        bigram = (ch1, ch2)
        b[bigram] = b.get(bigram, 0) +1 # counting the number that a bigram occurs


storing the bigram information in a 2D array (pytorch tensors) where the rows are the first character and the columns are the second character

In [112]:
import torch

In [113]:
N = torch.zeros((27,27), dtype=torch.int32)

chars = sorted(list(set(''.join(words ))))
stoi = {s:i+1 for i,s in enumerate(chars)} #mapping of a character to an index
stoi['.'] = 0 # replacing special end and start characters with a single . representing the space between words
itos = {i:s for s,i in stoi.items()}
for w in words:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2 in zip(chs,chs[1:]):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2] # index of the character - defining position in the array
        N[ix1,ix2] += 1

In [114]:
P = (N+1).float() #convert to float cause we want to devide
#(N+1) is model smoothing give all possible bigrams at least 1 count removing infinities in log_likelyhood
P /= P.sum(1, keepdim=True) # normalize occurences , in-place operation has potential to be faster - doesnt create more memory under-the-hood


In [115]:
g = torch.Generator().manual_seed(2147484647)
for i in range(20):
    ix = 0
    out=[]
    while True:
        p=P[ix]
        ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item() #obtianing normal distribuition from probabilities 
        out.append(itos[ix])
        if ix ==0:
            break
    print(''.join(out))

nieran.
cele.
brolyah.
ananei.
egin.
baijeriesetengh.
bebynnojarzakia.
h.
n.
er.
xlah.
assaiavees.
be.
drikyly.
en.
d.
an.
ja.
ma.
am.


when printing the probabilities of bigrams in real words we want the total probability according to a well trained model to be close to 1 (maximize the likelyhood). we can use log(likelyhood) for convenience - total prob = product of all probabilities which will be a tiny number for many words - log_likelyhood is a more natural number. 
negative log likelyhood - good loss function minimum is 0. use the average -.

In [116]:
log_likelyhood = 0.0
n=0
for w in words:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2 in zip(chs,chs[1:]):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2] 
        prob = P[ix1,ix2]
        log_likelyhood += torch.log(prob)
        n+=1
nll = -log_likelyhood
print(f'{nll/n}')


2.4543561935424805


## Neural network approach

creating training set

In [117]:
import torch
N = torch.zeros((27,27), dtype=torch.int32)

chars = sorted(list(set(''.join(words ))))
stoi = {s:i+1 for i,s in enumerate(chars)} #mapping of a character to an index
stoi['.'] = 0 # replacing special end and start characters with a single . representing the space between words
itos = {i:s for s,i in stoi.items()}

In [118]:
xs, ys = [],[]
for w in words:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2 in zip(chs,chs[1:]):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2] # index of the character - defining position in the array
        xs.append(ix1)
        ys.append(ix2)

xs = torch.tensor(xs) # input
ys = torch.tensor(ys) # output - second letter
num = xs.nelement()


right now, xs and ys are tensors of integer index values 0-27. these cant be used as input for NN - use one-hot encoding

In [119]:
import torch.nn.functional as F
W = torch.randn((27,27), requires_grad=True) #random weights - makes 27 neurons, after multiplying with xsoh
W.shape

torch.Size([27, 27])

In [120]:
xsoh = F.one_hot(xs, num_classes=27).float()
xsoh.shape

torch.Size([228146, 27])

gradient descent

In [122]:
for n in range(10):
#forward pass    
    xsoh = F.one_hot(xs, num_classes=27).float()
    logits = xsoh @ W #matrix multiplication - activation of the weights at the positions of xs, equivalent to log-counts
    count = logits.exp() # exponantiate the tensor element-wise giving an positive value for all, eqiuvalent to count
    prob = count / count.sum(1, keepdim=True) # normalize - SOFTMAX
    #prob[0] will give a vector of probabilities representing what is likely to follow the first letter
    #all operations are easily differentaible so also backpropagable 
    loss = -prob[torch.arange(num), ys].log().mean() #average nll used as loss, taking the probabilities of the next letter at the correct indecies for which we are training. When we input a word which we want to use as training data, the probabilities of interest are of characters within the word - these train the weights associated with the word
    print(loss.item())

#backward pass
    W.grad = None
    loss.backward() 

#update

    W.data += -10 * W.grad


3.8248064517974854
3.7358288764953613
3.6533102989196777
3.576688051223755
3.5056183338165283
3.43988299369812


KeyboardInterrupt: 

E01: train a trigram language model, i.e. take two characters as an input to predict the 3rd one. Feel free to use either counting or a neural net. Evaluate the loss; Did it improve over a bigram model?
E02: split up the dataset randomly into 80% train set, 10% dev set, 10% test set. Train the bigram and trigram models only on the training set. Evaluate them on dev and test splits. What can you see?
E03: use the dev set to tune the strength of smoothing (or regularization) for the trigram model - i.e. try many possibilities and see which one works best based on the dev set loss. What patterns can you see in the train and dev set loss as you tune this strength? Take the best setting of the smoothing and evaluate on the test set once and at the end. How good of a loss do you achieve?
E04: we saw that our 1-hot vectors merely select a row of W, so producing these vectors explicitly feels wasteful. Can you delete our use of F.one_hot in favor of simply indexing into rows of W?
E05: look up and use F.cross_entropy instead. You should achieve the same result. Can you think of why we'd prefer to use F.cross_entropy instead?
E06: meta-exercise! Think of a fun/interesting exercise and complete it.

## Trigram model


In [124]:
xs1, xs2, ys = [],[], []
for w in words:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2, ch3 in zip(chs,chs[1:],chs[2:]):
        ix1 = stoi[ch1] # index of the character - defining position in the array
        ix2 = stoi[ch2] 
        ix3 = stoi[ch3]
        # Do I need to stop here if ix3 == 0? Does zip remove rows where entries are empty?
        xs1.append(ix1)
        xs2.append(ix2)
        ys.append(ix3)


xs1 = torch.tensor(xs1) # input
xs2 = torch.tensor(xs2) # input
xs = torch.stack((xs1, xs2), dim=1)
ys = torch.tensor(ys) # output - third letter
num = xs.nelement()
num = int(num/2)

In [125]:
import torch.nn.functional as F
W = torch.randn((27,27,27), requires_grad=True) #random weights - makes 27 neurons, after multiplying with xsoh

# Convert (row, col) to a single index in a flattened 27x27 matrix
flat_index = xs[:, 0] * 27 + xs[:, 1]

# One-hot encode in a flattened space
one_hot_flat = F.one_hot(flat_index, num_classes=27 * 27)

# Reshape back to 27x27
one_hot_matrix = one_hot_flat.view(len(xs),27, 27).float()

In [134]:
for n in range(300):
#forward pass    
    logits = torch.einsum('bij,ijk->bk', one_hot_matrix, W) #matrix multiplication - activation of the weights at the positions of xs, equivalent to log-counts
    count = logits.exp() # exponantiate the tensor element-wise giving an positive value for all, eqiuvalent to count
    prob = count / count.sum(1, keepdim=True) # normalize - SOFTMAX
    #prob[0] will give a vector of probabilities representing what letter is likely to follow the first letter
    #all operations are easily differentaible so also backpropagable 
    loss = -prob[torch.arange(num), ys].log().mean() #average negative log likelyhood used as loss, taking the probabilities of the next letter at the correct indecies for which we are training. When we input a word which we want to use as training data, the probabilities of interest are of characters within the word - these train the weights associated with the word
    print(loss.item())

#backward pass
    W.grad = None
    loss.backward() 

#update

    W.data += -10 * W.grad


2.39247727394104
2.3920223712921143
2.391569137573242
2.3911173343658447
2.3906667232513428
2.3902175426483154
2.3897695541381836
2.3893229961395264
2.3888776302337646
2.3884339332580566
2.387991189956665
2.387550115585327
2.3871099948883057
2.386671543121338
2.3862340450286865
2.3857979774475098
2.3853628635406494
2.384929656982422
2.3844974040985107
2.384066343307495
2.383636474609375
2.3832080364227295
2.3827807903289795
2.382354736328125
2.381930112838745
2.3815064430236816
2.3810842037200928
2.3806631565093994
2.3802430629730225
2.379824638366699
2.3794071674346924
2.378990888595581
2.3785758018493652
2.378161668777466
2.377748966217041
2.3773372173309326
2.376926898956299
2.3765175342559814
2.3761096000671387
2.3757026195526123
2.3752965927124023
2.374892234802246
2.374488592147827
2.374086380004883
2.373685121536255
2.3732848167419434
2.3728857040405273
2.372487783432007
2.372091054916382
2.3716952800750732
2.37130069732666
2.3709070682525635
2.3705146312713623
2.370123386383056

In [158]:
g = torch.Generator()#.manual_seed(2147484647)
for i in range(20):
    douplex = torch.tensor([[0,0]])
    ix=0
    out=[]
    while True:
        one_hot_matrix = F.one_hot(douplex[-1, 0] * 27 + douplex[-1, 1], num_classes=27 * 27).view(1,27, 27).float()
        logits = torch.einsum('bij,ijk->bk', one_hot_matrix, W) 
        count = logits.exp()
        prob = count / count.sum(1, keepdim=True)
        ix = torch.multinomial(prob, num_samples=1, replacement=True, generator=g).item() #obtianing normal distribuition from probabilities 
        douplex[0, 0]=douplex[0, 1]
        douplex[0, 1]=ix
        #douplex.append(douplex[-1, :])
        out.append(itos[ix])
        if ix ==0:
            break
    print(''.join(out))

trionniwhskvglmagp.
wxfjlanda.
velie.
brbaqptjevertumivqehyanna.
bpzerey.
te.
cogzyla.
zgcffnzjdkxarionwatvcbkbnwlqvpazaudmmrtnd.
ami.
fart.
halrkdenianean.
vayn.
ya.
gophmyhuziyluoe.
essibxyden.
za.
chennafx.
tyor.
ton.
upuoeigh.
