In [30]:
import numpy as np 
import pandas as pd
from sklearn.model_selection import train_test_split

# Simple Neural Langauge Model 

In [14]:
sentences = ['mohit likes icecream', 'Starks were cool', 'life is fool', 'batman is ironman']

In [15]:
# we will try to predict next word using NLM 

# step one : Tokenization 

vocab = {} # map from word type to index 
inputs = [] # stores an indexified version of each sentences 


for sent in sentences:
    sent_idxes = []
    sent = sent.split() #tokenize w/ whitespace 
    
    for w in sent: 
        if w not in vocab: 
            vocab[w] = len(vocab) # add new type to the vocab 
            
        sent_idxes.append(vocab[w])
    
    inputs.append(sent_idxes)

In [16]:
print(vocab)

{'mohit': 0, 'likes': 1, 'icecream': 2, 'Starks': 3, 'were': 4, 'cool': 5, 'life': 6, 'is': 7, 'fool': 8, 'batman': 9, 'ironman': 10}


In [17]:
print(inputs)

[[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 7, 10]]


In [18]:
import torch 

# 1 convert to long tensors 
# 2 define inpts and outpts
prefixes = torch.LongTensor([dream[:-1] for dream in inputs])
#print(prefixes)
labels = torch.LongTensor([dream[-1] for dream in inputs])
print('Prefix',prefixes,'Labels', labels)

Prefix tensor([[0, 1],
        [3, 4],
        [6, 7],
        [9, 7]]) Labels tensor([ 2,  5,  8, 10])


In [19]:
# onto defining the network 
import torch.nn as nn

class NLM(nn.Module):
    # important things 
    # 1. write init functions (initializes all teh paarameters of the network )
    # 2. forward functions (defines the forward propagation computations)
    
    
    def __init__(self, d_embedding, d_hidden, window_size, len_vocab):
        super(NLM, self).__init__() # init the base module class 
        self.d_emb = d_embedding
        self.embeddings = nn.Embedding(len_vocab, d_embedding)
        
        # concatenated embeddings > hidden
        self.W_hid = nn.Linear(d_embedding * window_size, d_hidden)
        
        # hidden > output probability distribution over vocab 
        self.W_out = nn.Linear(d_hidden, len_vocab)
        
        
    def forward(self, input):
        batch_size, window_size = input.size()
        embs = self.embeddings(input)
        #print('Embedding size',embs.size())
        
        # next we want to concatenate the prefix emveddings together
        concat_embs = embs.view(batch_size, window_size * self.d_emb)
        #print('concat_embs', concat_embs.size())
        
        #print(embs[0])
        #print(concat_embs[0])

        # now we project thsi to the hidden space 
        hiddens = self.W_hid(concat_embs)
        
#        print('hidden size:',hiddens.size())
        
        
        # finally project hiddens to vocabulary space 
        out = self.W_out(hiddens)
 #       print(out.size())
        
        return out # return unnormalized probability also known as **logits** 
        
        #probs = nn.functional.softmax(out, dim=1)
        #print(probs)
network = NLM(d_embedding=5, d_hidden=12, window_size=2, len_vocab=len(vocab))
    

In [20]:
print(network)

NLM(
  (embeddings): Embedding(11, 5)
  (W_hid): Linear(in_features=10, out_features=12, bias=True)
  (W_out): Linear(in_features=12, out_features=11, bias=True)
)


In [21]:
logits = network(prefixes)
print(logits)

tensor([[-0.9148, -0.3845, -0.4009,  0.6939,  0.1451, -0.4445, -0.0334,  0.0610,
          0.1489,  0.5961, -0.3777],
        [-0.4711, -0.4002,  0.4270,  0.7673,  0.1619, -1.0607,  0.4779,  0.5755,
          0.2724,  0.4126,  0.1768],
        [ 0.0620,  0.4257,  0.1212, -0.1615, -0.2869, -0.2723,  0.4295, -0.6903,
         -0.1328,  0.1644, -0.5329],
        [ 0.3097,  0.4308, -0.3435,  0.1234, -0.3655, -0.2046,  0.1777, -0.5831,
         -0.1630,  0.1311, -0.1915]], grad_fn=<AddmmBackward0>)


In [22]:
num_epochs = 100 
learning_rate = 0.1 
loss_fn = nn.CrossEntropyLoss()

optimizer = torch.optim.SGD(params = network.parameters(), lr=learning_rate)

# training loop 
for i in range(num_epochs):
    logits = network(prefixes)
    loss = loss_fn(logits, labels)

    #Step1 compute gradient 
    loss.backward()
    # step 2 update params using gradient descent 
    optimizer.step()
    
    # zero the gradients for next epoch
    optimizer.zero_grad()
    print(f'Epoch {i}, loss {loss}')


Epoch 0, loss 2.8989551067352295
Epoch 1, loss 2.6712656021118164
Epoch 2, loss 2.4671237468719482
Epoch 3, loss 2.2776453495025635
Epoch 4, loss 2.096468210220337
Epoch 5, loss 1.9193052053451538
Epoch 6, loss 1.7437639236450195
Epoch 7, loss 1.5692832469940186
Epoch 8, loss 1.3970167636871338
Epoch 9, loss 1.2295053005218506
Epoch 10, loss 1.0700929164886475
Epoch 11, loss 0.9222098588943481
Epoch 12, loss 0.7887284755706787
Epoch 13, loss 0.671504557132721
Epoch 14, loss 0.5711408853530884
Epoch 15, loss 0.4870171546936035
Epoch 16, loss 0.4175768494606018
Epoch 17, loss 0.36074867844581604
Epoch 18, loss 0.31434327363967896
Epoch 19, loss 0.27633175253868103
Epoch 20, loss 0.24498510360717773
Epoch 21, loss 0.2189067304134369
Epoch 22, loss 0.19700345396995544
Epoch 23, loss 0.17843155562877655
Epoch 24, loss 0.1625422090291977
Epoch 25, loss 0.1488344669342041
Epoch 26, loss 0.1369183361530304
Epoch 27, loss 0.12648789584636688
Epoch 28, loss 0.11729999631643295
Epoch 29, loss 0.1

In [23]:
# is it working?
# reverse vocabulary mapping (idx> word type)

rev_vocab =dict((idx, word) for (word, idx) in vocab.items()) 

In [24]:
rev_vocab

{0: 'mohit',
 1: 'likes',
 2: 'icecream',
 3: 'Starks',
 4: 'were',
 5: 'cool',
 6: 'life',
 7: 'is',
 8: 'fool',
 9: 'batman',
 10: 'ironman'}

In [25]:
mohitlikes = prefixes[0].unsqueeze(0)
logits = network(mohitlikes)
#print(logits)
prob = nn.functional.softmax(logits, dim=1).squeeze()

print(prob)

tensor([1.1911e-04, 4.1115e-04, 9.9087e-01, 6.7657e-04, 1.0363e-03, 8.8313e-04,
        6.1826e-04, 4.6378e-04, 4.2427e-03, 6.6321e-04, 1.2735e-05],
       grad_fn=<SqueezeBackward0>)


In [26]:
argmax_idx = torch.argmax(prob).item()
print('given mohit likes model predicts "%s" as the next word with %0.4f probability '%(rev_vocab[argmax_idx], prob[argmax_idx]))

given mohit likes model predicts "icecream" as the next word with 0.9909 probability 


In [27]:
lifeis = prefixes[2].unsqueeze(0)
logits = network(lifeis)
#print(logits)
prob = nn.functional.softmax(logits, dim=1).squeeze()

print(prob)
argmax_idx = torch.argmax(prob).item()
print('given "life is" model predicts "%s" as the next word with %0.4f probability '%(rev_vocab[argmax_idx], prob[argmax_idx]))

tensor([3.9025e-04, 8.2830e-04, 2.1691e-03, 3.8926e-04, 2.0898e-04, 7.0744e-05,
        1.0003e-03, 3.2197e-04, 9.8431e-01, 3.3799e-04, 9.9774e-03],
       grad_fn=<SqueezeBackward0>)
given "life is" model predicts "fool" as the next word with 0.9843 probability 
