In [26]:
places = open('data\places.txt').read().splitlines()
places = [p.lower() for p in places]

  places = open('data\places.txt').read().splitlines()


In [None]:
words = open('names.txt').read().splitlines()
words[:10]

In [None]:
len(words)  # Total number of words in the dataset

# Bigram Language Model : Work only on two characters at a time
# predict the next character given a previous one


In [None]:
b = {}
for w in words:
    chs=['<S>'] + list(w) + ['<E>']           
    for ch1,ch2 in zip(chs, chs[1:]):
        bigram = (ch1,ch2)
        b[bigram] = b.get(bigram, 0) + 1
        # print(ch1,ch2)

In [None]:
sorted(b.items(),key = lambda kv : -kv[1])

In [27]:
import torch

In [28]:
input_data= places

In [29]:
chars = sorted(list(set(''.join(input_data)))) # first we create a list of unique characters
s_to_i = { s : i+1 for i,s in enumerate(chars)} # create a mapping from character to index
s_to_i['#'] = 0
i_to_s= {i:s for s,i in s_to_i.items()} # create a mapping from index to character


In [30]:
no_distinct_chars = len(s_to_i)  # number of distinct characters
N = torch.zeros((no_distinct_chars, no_distinct_chars),dtype=torch.int32)

In [31]:

for w in input_data:
    chs=['#'] + list(w) + ['#']           
    for ch1,ch2 in zip(chs, chs[1:]):
        ix1= s_to_i[ch1]
        ix2= s_to_i[ch2]
        # print(i_to_s[ix1], i_to_s[ix2])
        N[ix1,ix2]+=1
        

In [None]:
# let us visualize the bigram counts
import matplotlib.pyplot as plt
%matplotlib inline

plt.figure(figsize=(16,16))
plt.imshow(N, cmap='Blues')
for i in range(27):
    for j in range(27):
        ch_str = i_to_s[i] + i_to_s[j]
        plt.text(j, i, ch_str, ha='center', va='bottom', color='grey')
        plt.text(j, i, N[i,j].item(), ha='center', va='top', color='grey')
plt.axis('off')

In [32]:
P = (N+1).float()  # convert to float for probability calculations
P /= P.sum(1, keepdim=True)  # normalize to make it a probability distribution
#broadcasting is allowed here (for sum) as 
# 27, 27
# 27, 1

#what is we have keepdim=False? Then P.sum(1) will return a 1D tensor of 1 by 27
# 27, 27  _______>    27, 27 __________> it broadcastes this one onto the 27 columns, # so columns are normalized instead of the rows
#     27               1, 27              

In [33]:
g = torch.Generator().manual_seed(23651)
for i in range(10):
    index=0
    chr_str=""
    while True:
        p_row = P[index]
        # p_row= N[index].float()  # probabilities of the next character given the previous one
        # p_row /= p_row.sum()  # normalize to make it a probability distribution
        
        #Sample from the distribution
        index = torch.multinomial(p_row, num_samples=1,replacement=True, generator=g).item()  # sample one index from the distribution

        curr_chr= i_to_s[index]  # get the character corresponding to the index
        chr_str += curr_chr  # append the character to the string
        if index == 0:  # if the index is 0, it means we have reached the end of the string
            break
    print(chr_str)  # print the generated string
        

fqza#
qczhatu#
losu#
fgiy#
adeleshoneeonu#
ton#
zbumoren#
mlttodho#
t#
caganotonbolvwdradwne#


In [34]:
# Evaluate the quality of the model
 
log_likelihood = 0.0
n=0
for w in ["sehaj"]:
    chs = ['#'] + list(w) + ['#']
    for ch1, ch2 in zip(chs,chs[1:]):
        ix1= s_to_i[ch1]
        ix2= s_to_i[ch2]
        prob = P[ix1, ix2]  # get the probability of the next character given the previous one
        log_prob = torch.log(prob)  # take the log of the probability
        log_likelihood += log_prob  # accumulate the log probability
        n+=1
        print(f'{ch1}{ch2} -> {prob:.4f} ({log_prob:.4f})')  # print the character pair and its probability
print(f'Log likelihood of the model: {log_likelihood.item()}')  # print the log likelihood of the model
nll=-log_likelihood/n  # negative log likelihood
print(f'Negative log likelihood: {nll.item()}')  # print the negative log likelihood

#s -> 0.0627 (-2.7691)
se -> 0.0609 (-2.7990)
eh -> 0.0133 (-4.3175)
ha -> 0.2151 (-1.5369)
aj -> 0.0060 (-5.1090)
j# -> 0.0222 (-3.8067)
Log likelihood of the model: -20.338119506835938
Negative log likelihood: 3.3896865844726562


In [35]:
# SAMPLING FROM THE MODEL

# Since N saves the counts of the bigrams, we first need to convert these counts to probabilities.

p = N[0].float() # probabilities of the first character given the start token
p /= p.sum() # normalize to make it a probability distribution
p


tensor([0.0000, 0.1154, 0.1231, 0.0654, 0.0385, 0.0038, 0.0192, 0.0346, 0.0308,
        0.0154, 0.0231, 0.0423, 0.0385, 0.1038, 0.0385, 0.0115, 0.0615, 0.0077,
        0.0231, 0.0654, 0.0654, 0.0038, 0.0346, 0.0154, 0.0000, 0.0077, 0.0115])

In [None]:
# we will  now sample from p using multinomial distribution
g = torch.Generator().manual_seed(23651)
# p = torch.rand(3, generator=g)
# p = p/ p.sum()  # normalize to make it a probability distribution
# print(p)
ix = torch.multinomial(p, num_samples=1,replacement=True, generator=g).item()  # sample one index from the distribution
i_to_s[ix]  # convert the index to character

In [36]:
import torch.nn.functional as F

In [37]:
# NEURAL NETWORK APPROACH
# We will use a neural network to predict the next character given the previous one.

#First let's create the training data of the bigrams (x,y):
x_train, y_train = [], []

for w in input_data[:1]:
    chs=['#'] + list(w) + ['#']           
    for ch1,ch2 in zip(chs, chs[1:]):
        ix1= s_to_i[ch1] 
        ix2= s_to_i[ch2]
        x_train.append(ix1)
        y_train.append(ix2)

x_train = torch.tensor(x_train)
y_train = torch.tensor(y_train)
num = x_train.nelement()
print('Number of examples: ', num)


#INITIALIZING THE NETWORK
#randomly initializing 27 neurons' weights. Each neuron receives 27 inputs
g = torch.Generator().manual_seed(12345)
W= torch.randn((no_distinct_chars,no_distinct_chars), generator=g, requires_grad=True) #27x1

Number of examples:  8


In [38]:

# GRADIENT DESCENT
epochs = 100
lr= 10

for epoch in range(epochs):

    #forward pass:
    x_enc= F.one_hot(x_train, num_classes=no_distinct_chars).float() #input to the network : one-hot encoding
    logits = x_enc @ W # log-counts
    counts = logits.exp() #Equivalent N
    prob = counts / counts.sum(1, keepdim=True)  # normalize to make it a probability distribution (softmax)
    loss = -prob[torch.arange(x_enc.shape[0]), y_train].log().mean() # negative log likelihood loss
    loss += 0.01*(W**2).mean() #Regularization Loss
    print(f'Epoch {epoch+1}/{epochs}, Loss: {loss.item()}')

    # backward pass
    W.grad= None  # reset the gradients
    loss.backward()

    # update the weights
    W.data += -lr * W.grad  # gradient descent step

Epoch 1/100, Loss: 3.943207025527954
Epoch 2/100, Loss: 2.7272212505340576
Epoch 3/100, Loss: 1.7540009021759033
Epoch 4/100, Loss: 1.0856691598892212
Epoch 5/100, Loss: 0.7010617256164551
Epoch 6/100, Loss: 0.5121611952781677
Epoch 7/100, Loss: 0.4181542992591858
Epoch 8/100, Loss: 0.3650933504104614
Epoch 9/100, Loss: 0.3315322995185852
Epoch 10/100, Loss: 0.3085062503814697
Epoch 11/100, Loss: 0.29176610708236694
Epoch 12/100, Loss: 0.2790646553039551
Epoch 13/100, Loss: 0.26910707354545593
Epoch 14/100, Loss: 0.2610972225666046
Epoch 15/100, Loss: 0.25451838970184326
Epoch 16/100, Loss: 0.24902157485485077
Epoch 17/100, Loss: 0.24436213076114655
Epoch 18/100, Loss: 0.24036401510238647
Epoch 19/100, Loss: 0.23689690232276917
Epoch 20/100, Loss: 0.23386260867118835
Epoch 21/100, Loss: 0.23118562996387482
Epoch 22/100, Loss: 0.22880707681179047
Epoch 23/100, Loss: 0.22668024897575378
Epoch 24/100, Loss: 0.2247675210237503
Epoch 25/100, Loss: 0.22303856909275055
Epoch 26/100, Loss: 0.2

In [39]:
x_enc.shape

torch.Size([8, 27])

In [40]:
plt.imshow(x_enc, cmap='Blues')

NameError: name 'plt' is not defined

In [41]:
nlls = torch.zeros(x_enc.shape[0])
for i in range(x_enc.shape[0]):
    # i-th bigram:
    x = x_train[i].item() #input character index
    y = y_train[i].item() #output character index
    print("---------------")
    print(f' bigram example {i+1}: {i_to_s[x]}{i_to_s[y]} (indices: {x}, {y})')
    print(f'input to the neural net:{x}')
    print('output probabilities from the neural net:',prob[i])
    print('label (actual next character):', y)
    p = prob[i,y]
    print('probability assigned by the net to the next character :', p.item())
    logp= torch.log(p)  # log probability
    print("log likelihood : ", logp.item())
    nll=-logp  # negative log likelihood
    print("negative log likelihood : ", nll.item())
    nlls[i] = nll  # store the negative log likelihood

print("======================================")
print("avergage negative log likelihood:", nlls.mean().item())

---------------
 bigram example 1: #a (indices: 0, 1)
input to the neural net:0
output probabilities from the neural net: tensor([7.0330e-05, 9.9138e-01, 1.6404e-05, 3.6166e-04, 3.3996e-04, 2.5487e-04,
        3.0555e-04, 8.4465e-05, 9.6687e-04, 8.5261e-04, 4.5826e-04, 1.0724e-04,
        2.1680e-04, 4.1186e-04, 5.0365e-04, 2.9817e-04, 5.1224e-04, 9.6389e-05,
        2.5768e-04, 6.2985e-04, 1.0769e-04, 1.6356e-04, 3.5822e-04, 4.6327e-05,
        6.1584e-04, 9.6876e-05, 4.8372e-04], grad_fn=<SelectBackward0>)
label (actual next character): 1
probability assigned by the net to the next character : 0.9913830161094666
log likelihood :  -0.00865432433784008
negative log likelihood :  0.00865432433784008
---------------
 bigram example 2: ab (indices: 1, 2)
input to the neural net:1
output probabilities from the neural net: tensor([4.9316e-04, 3.9587e-04, 4.9588e-01, 3.7766e-04, 2.7098e-04, 2.0364e-04,
        2.0550e-04, 2.7974e-04, 3.0508e-04, 2.3124e-04, 3.6907e-04, 2.6861e-04,
        6.

In [None]:
# Sampling from the neural net model
g = torch.Generator().manual_seed(23651)
W = torch.randn((no_distinct_chars, no_distinct_chars), generator=g, requires_grad=True)  # reinitialize W for sampling

for i in range(10):

    out =[]
    index=0
    while True:

        x_enc= F.one_hot(torch.tensor([index]), num_classes=no_distinct_chars).float()
        logits = x_enc @ W  # log-counts
        counts = logits.exp() # Equivalent N
        prob = counts / counts.sum(1, keepdim=True)  # normalize to make it a probability distribution (softmax)
        
        index = torch.multinomial(prob, num_samples=1,replacement=True, generator=g).item()  # sample one index from the distribution
        curr_chr= i_to_s[index]  # get the character corresponding to the index
        out.append(curr_chr)  # append the character to the string

        if index == 0:  # if the index is 0, it means we have reached the end of the string
            break
    print(''.join(out))  # print the generated string



    