### This is an example of applying neural networks to build a language model.

In [37]:
import torch 
import torchvision
import torch.nn as nn
import numpy as np
import torchvision.transforms as transforms

'''move the computations to the GPU if cuda is available, otherwise the computations will be run on CPU'''
# Prefer the GPU when CUDA is usable; otherwise everything runs on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

'''defining model parameters'''
# Network shape.
embed_size, hidden_size = 128, 1024   # embedding width / LSTM hidden width
num_layers = 1                        # number of stacked LSTM layers

# Optimisation schedule.
num_epochs = 5                        # full passes over the training windows
batch_size = 20                       # windows per optimisation step
seq_length = 10                       # tokens per training window
learning_rate = 2e-3                  # step size handed to the optimizer

# Text generation.
num_samples = 1000                    # number of words to be sampled

'''pre-process the data'''
class dictionary:
    """Vocabulary plus the running stream of token ids.

    Attributes:
        word_to_idx: maps each known word to its integer id.
        idx_to_word: inverse mapping, id -> word.
        vocab_size: number of distinct words registered so far.
        token_ids: id of every word passed to ``add_word``, in call order.
    """

    def __init__(self):
        # Id 0 is pre-registered for the end-of-sentence marker.
        self.word_to_idx = {'eos': 0}
        self.idx_to_word = {0: 'eos'}
        self.vocab_size = 1
        self.token_ids = []

    def add_word(self, word):
        """Record one occurrence of ``word``, registering it first if it is new."""
        try:
            word_id = self.word_to_idx[word]
        except KeyError:
            # Unseen word: it takes the next free id in both lookup tables.
            word_id = len(self.word_to_idx)
            self.word_to_idx[word] = word_id
            self.idx_to_word[len(self.idx_to_word)] = word
            self.vocab_size += 1
        self.token_ids.append(word_id)
class document():
    """Reads a text file and feeds its words into a ``dictionary``."""

    def __init__(self):
        self.corpus = dictionary()

    def build_corpus(self, path='data/filter_text.txt', encoding='utf-8'):
        """Tokenise the file at ``path`` and return the populated dictionary.

        Every line is split on whitespace; each resulting word is added to
        ``self.corpus`` and an ``'eos'`` marker is added after every line
        (blank lines included).

        Args:
            path: text file to read. Defaults to the original hard-coded
                location.
            encoding: text encoding used to decode the file.

        Returns:
            ``self.corpus`` after all words have been added.
        """
        # The context manager closes the file even if tokenising raises.
        with open(path, encoding=encoding) as text_file:
            for line in text_file:
                # split() without an argument drops the empty strings that
                # split(' ') yields for repeated or trailing spaces, so the
                # empty string never becomes a vocabulary entry.
                for word in line.split():
                    self.corpus.add_word(word)
                self.corpus.add_word('eos')
        return self.corpus
    
        
'''the dataset used is a sample of 500 reviews from Amazon Review Dataset. The reviews are cleaned during pre-processing.'''
# Build the vocabulary and the flat stream of token ids from the text file.
corpus = document()
text_dict = corpus.build_corpus()

# Cut the token stream into non-overlapping windows of seq_length tokens.
# The target for each window is the same window shifted one token to the
# right, so a window is only usable if one more token follows it. Counting
# windows over len - 1 tokens guarantees that, and keeps every target exactly
# seq_length long even when the stream length is a multiple of seq_length.
num_inputs = max(len(text_dict.token_ids) - 1, 0) // seq_length
inputs = [
    text_dict.token_ids[start:start + seq_length]
    for start in range(0, num_inputs * seq_length, seq_length)
]
outputs = [
    text_dict.token_ids[start + 1:start + seq_length + 1]
    for start in range(0, num_inputs * seq_length, seq_length)
]
    
'''define the model: a recurrent NN'''
class LM(nn.Module):
    """Word-level language model: embedding -> LSTM -> linear projection.

    Args:
        vocab_size: number of distinct token ids; also the width of the output.
        num_layers: number of stacked LSTM layers.
        hidden_size: width of the LSTM hidden and cell states.
        embed_size: width of the token embedding vectors.
    """

    def __init__(self, vocab_size, num_layers, hidden_size, embed_size):
        super(LM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.embed_layer = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, vocab_size)

    def forward(self, input):
        """Score the vocabulary for every position of every sequence.

        Args:
            input: integer tensor of token ids laid out as (batch, sequence),
                matching the ``batch_first=True`` LSTM.

        Returns:
            Tensor of shape (batch * sequence, vocab_size) holding the output
            of the final linear layer for each position.
        """
        out = self.embed_layer(input)

        # The initial states must live on the same device (and use the same
        # dtype) as the LSTM's input; plain torch.zeros(...) would always be
        # a float32 CPU tensor and break once the model is moved to the GPU.
        state_shape = (self.num_layers, input.size(0), self.hidden_size)
        h0 = torch.zeros(state_shape, device=out.device, dtype=out.dtype)
        c0 = torch.zeros(state_shape, device=out.device, dtype=out.dtype)

        out, _ = self.lstm(out, (h0, c0))
        # Flatten (batch, sequence, hidden) to (batch * sequence, hidden) so
        # the linear layer scores every position independently.
        out = out.reshape(out.size(0) * out.size(1), out.size(2))
        return self.linear(out)

'''instantiate the model'''
# Build the network from the hyperparameters, then move it to the chosen device.
model = LM(
    vocab_size=text_dict.vocab_size,
    num_layers=num_layers,
    hidden_size=hidden_size,
    embed_size=embed_size,
)
model = model.to(device)

'''cross entropy is used as loss function'''
criterion = nn.CrossEntropyLoss()

'''Adam optimizer is used as the optimization function. We optimized all the model parameters, with a given learning rate.'''
# Every parameter of the model is handed to Adam with one shared learning rate.
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

'''training...'''
# Only full batches are used; a trailing partial batch is skipped.
num_batches = num_inputs // batch_size
for epoch in range(num_epochs):
    for i in range(num_batches):
        batch = slice(i * batch_size, (i + 1) * batch_size)
        # Build the batch tensors directly on the target device.
        input_tensor = torch.tensor(inputs[batch], dtype=torch.long, device=device)
        output_tensor = torch.tensor(outputs[batch], dtype=torch.long, device=device)

        # pred has shape [batch_size * seq_length, vocab_size]: one score
        # vector over the vocabulary for each word of each sequence.
        pred = model(input_tensor)

        # The targets are flattened to [batch_size * seq_length] so that each
        # row of pred is compared against a single target id.
        loss = criterion(pred, output_tensor.reshape(-1))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if i % 100 == 0:
            # .item() extracts a plain Python float for logging.
            print('epoch [{}/{}], step [{}/{}], loss {:.4f}'.format(
                epoch + 1, num_epochs, i + 1, num_batches, loss.item()))

epoch [1/5], step [1/159], loss 8.1972
epoch [1/5], step [101/159], loss 5.9496
epoch [2/5], step [1/159], loss 5.5469
epoch [2/5], step [101/159], loss 4.9154
epoch [3/5], step [1/159], loss 4.6518
epoch [3/5], step [101/159], loss 4.0222
epoch [4/5], step [1/159], loss 3.5502
epoch [4/5], step [101/159], loss 3.2595
epoch [5/5], step [1/159], loss 2.7021
epoch [5/5], step [101/159], loss 2.5975


In [48]:
'''sample a word from a dictionary randomly and get the next word'''
with torch.no_grad():
    # Equal weights, so every vocabulary entry is equally likely to be drawn.
    prob = torch.ones(text_dict.vocab_size)
    seed_idx = torch.multinomial(prob, num_samples=1)
    # Reshape the single drawn id to (1, 1): one sequence of one token.
    input = seed_idx.unsqueeze(1).to(device)
    print(text_dict.idx_to_word[input.item()])

    # Score the vocabulary for that token and keep the highest-scoring id.
    output = model(input)
    output = torch.max(output, dim=1).indices
    print(text_dict.idx_to_word[output.item()])

bushel
box
