## CS60075 - Assignment - 1
- Name : Debanjan Saha
- Roll : 19CS30014

## Neural Network based Language Model

In [1]:
import torch
import torch.nn as nn
import numpy as np
from torch.nn.utils import clip_grad_norm_
import math

In [2]:
# directory for saving model
!mkdir -p "saved_model"

In [3]:
def load_dataset():
    """
    Load dataset from the folder 'data'
    """
    with open("./data/train.txt", 'r') as f:
        train_data = [line.strip() for line in f.readlines()]

    with open("./data/test.txt", 'r') as f:
        test_data = [line.strip() for line in f.readlines()]

    return train_data, test_data

In [4]:
train_sents, test_sents = load_dataset()

In [5]:
train_sents[:3]

['liberty all star usa sets initial payout',
 'we are being accused of not implementing this agreement',
 'entregrowth closed at 135 dlrs and options at 55 cents']

In [6]:
test_sents[:3]

['the company said each debenture is convertible into shares of businessland common stock at a conversion price of 2050 dlrs',
 'sumita says he does not expect further dollar fall',
 'the tin price is likely to rise to 20 ringgit a kilo this year because of the producers accord on export quotas and the reluctance of brokers and banks to sell the metal at lower prices a malaysian government bulletin said']

In [24]:
# Hyper-parameters (Obtained after tuning them)
hyperparams = {
    "embed_size" : 128,
    "hidden_size" : 1024,
    "num_layers" : 1,
    "num_epochs" : 5,
    "batch_size" : 20,
    "seq_length" : 30,
    "learning_rate" : 0.001,
}

In [25]:
from nltk.tokenize import sent_tokenize, word_tokenize
import gensim
from gensim.models import Word2Vec

In [26]:
class CustomDataset:
    """
    Custom Dataset class to 
    - split train data into 90:10 train-dev split
    - batchify train, test, dev data
    - generate corpus using a word2vec model 
    """
    def __init__(self, train_sents, test_sents, batch_size=hyperparams["batch_size"]):
        self.train_sents = train_sents
        self.test_sents = test_sents
        self.batch_size = batch_size
        self.generate_corpus()
        
    def generate_corpus(self):
        """
        Generates the corpus by tokenizing the words in each sentence,
        Makes a word2vec model on the tokens and generates a pretrained word2vec embedding for the corpus
        """

        data = []
        # iterate through each sentence in the training sentences
        for sentence in self.train_sents:
            cur_sent = []
            # tokenize the sentence into words
            for word in sentence.split():
                cur_sent.append(word.lower())
            data.append(cur_sent)

        self.w2v_model = gensim.models.Word2Vec(data, min_count=1, vector_size=hyperparams["embed_size"], window=5, workers=4)
        self.word2idx = self.w2v_model.wv.key_to_index
        self.idx2word = self.w2v_model.wv.index_to_key
        self.vocab_size = len(self.word2idx)
        
        num_tokens = sum([len(words_array) for words_array in data])
        self.ids = torch.LongTensor(num_tokens)
        
        index = 0
        for sentence in data:
            for word in sentence:
                self.ids[index] = self.word2idx[word]
                index += 1
        
    def create_batch(self, array, batch_size):
        """
        creates batches of input array in given batch_size 
        """
        batched_total_size = (array.size(0) // batch_size) * batch_size 
        array = array[:batched_total_size]
        return array.view(batch_size, -1)

    def get_batched_train_and_dev_data(self, train_split=0.9):
        """
        splits the train data into 90:10 ratio for train-dev set
        batchifies both train and dev data for future use
        """
        train_len = int(train_split * self.ids.size(0))
        train_data, dev_data = self.ids[:train_len], self.ids[train_len:]
        train_data, dev_data = self.create_batch(train_data, self.batch_size), self.create_batch(dev_data, self.batch_size)
        return train_data, dev_data 

    def get_batched_test_data(self):
        """
        generates test data tokens and batchifies the data for testing the model
        """
        test_words = []
        for sent in self.test_sents:
            for word in sent.split():
                if word in self.word2idx:
                    test_words.append(word.lower())

        test_data = torch.LongTensor(len(test_words))
        index = 0
        for word in test_words:
            test_data[index] = self.word2idx[word]
            index += 1

        test_data = self.create_batch(test_data, self.batch_size)
        return test_data

    def get_vocab_size(self):
        """
        returns vocab size for the dataset
        """
        return self.vocab_size

In [27]:
dataset = CustomDataset(train_sents, test_sents, batch_size=hyperparams["batch_size"])

In [28]:
train_data, dev_data = dataset.get_batched_train_and_dev_data(train_split=0.9)

In [29]:
train_data.shape, dev_data.shape

(torch.Size([20, 60649]), torch.Size([20, 6738]))

In [30]:
test_data = dataset.get_batched_test_data()

In [31]:
test_data.shape

torch.Size([20, 16507])

In [32]:
dataset.get_vocab_size()

44689

In [33]:
# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [34]:
print(dataset.get_vocab_size())

44689


In [35]:
pretrained_w2v_embeddings = torch.FloatTensor(dataset.w2v_model.wv.vectors)

In [36]:
# RNN based language model
class RNNLanguageModel(nn.Module):
    def __init__(self, embed_size, hidden_size, num_layers, vocab_size):
        super(RNNLanguageModel, self).__init__()
        self.embed = nn.Embedding.from_pretrained(pretrained_w2v_embeddings)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, vocab_size)
        
    def forward(self, x, h):
        # Embed word ids to vectors
        x = self.embed(x)
        
        # Forward propagate LSTM
        out, (h, c) = self.lstm(x, h)
        
        # Reshape output to (batch_size*sequence_length, hidden_size)
        out = out.reshape(out.size(0)*out.size(1), out.size(2))
        
        # Decode hidden states of all time steps
        out = self.linear(out)
        return out, (h, c)

In [37]:
def detach(states):
    return [state.detach() for state in states]

In [38]:
model = RNNLanguageModel(
    embed_size=hyperparams["embed_size"], 
    hidden_size=hyperparams["hidden_size"], 
    num_layers=hyperparams["num_layers"],
    vocab_size=dataset.get_vocab_size(),
).to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=hyperparams["learning_rate"])

In [39]:
seq_length = hyperparams["seq_length"]
batch_size = hyperparams["batch_size"]
num_layers = hyperparams["num_layers"]
hidden_size = hyperparams["hidden_size"]
num_epochs = hyperparams["num_epochs"]

In [40]:
best_loss = math.inf

num_train_batches = train_data.size(1) // seq_length
num_dev_batches = dev_data.size(1) // seq_length

# Train the model
for epoch in range(num_epochs):
    # Set initial hidden and cell states
    states = (torch.zeros(num_layers, batch_size, hidden_size).to(device),
              torch.zeros(num_layers, batch_size, hidden_size).to(device))
    

    print("Currently Running: Training Set")
    model.train()
    for i in range(0, train_data.size(1) - seq_length, seq_length):
        # Get mini-batch inputs and targets
        inputs = train_data[:, i:i+seq_length].to(device)
        targets = train_data[:, (i+1):(i+1)+seq_length].to(device)
        
        # Forward pass
        states = detach(states)
        outputs, states = model(inputs, states)
        loss = criterion(outputs, targets.reshape(-1))
        
        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        step = (i+1) // seq_length
        if step % 100 == 0:
            print ('Epoch [{}/{}], Step[{}/{}], Loss: {:.4f}, Perplexity: {:5.2f}'
                   .format(epoch+1, num_epochs, step, num_train_batches, loss.item(), np.exp(loss.item())))

    total_loss = 0.0

    states = (torch.zeros(num_layers, batch_size, hidden_size).to(device),
              torch.zeros(num_layers, batch_size, hidden_size).to(device))
    
    print("Currently Running: Development Set")
    model.eval()
    for i in range(0, dev_data.size(1) - seq_length, seq_length):
        # Get mini-batch inputs and targets
        inputs = dev_data[:, i:i+seq_length].to(device)
        targets = dev_data[:, (i+1):(i+1)+seq_length].to(device)

        # Forward pass
        states = detach(states)
        outputs, states = model(inputs, states)
        loss = criterion(outputs, targets.reshape(-1))

        total_loss += loss.item()

        step = (i+1) // seq_length
        if step % 100 == 0:
            print ('Epoch [{}/{}], Step[{}/{}], Loss: {:.4f}, Perplexity: {:5.2f}'
                    .format(epoch+1, num_epochs, step, num_dev_batches, loss.item(), np.exp(loss.item())))

    # print(f"Total Loss: {total_loss}, best_loss: {best_loss}")

    if total_loss < best_loss:
        best_loss = total_loss
        print("Better Loss! Saving Model...")
        # Save the model checkpoints
        torch.save(model.state_dict(), 'saved_model/rnn_model.ckpt')

Currently Running: Training Set
Epoch [1/5], Step[0/2021], Loss: 10.7076, Perplexity: 44692.52
Epoch [1/5], Step[100/2021], Loss: 7.1698, Perplexity: 1299.60
Epoch [1/5], Step[200/2021], Loss: 6.1981, Perplexity: 491.84
Epoch [1/5], Step[300/2021], Loss: 6.1978, Perplexity: 491.67
Epoch [1/5], Step[400/2021], Loss: 5.9910, Perplexity: 399.81
Epoch [1/5], Step[500/2021], Loss: 5.9888, Perplexity: 398.94
Epoch [1/5], Step[600/2021], Loss: 5.6104, Perplexity: 273.24
Epoch [1/5], Step[700/2021], Loss: 5.5426, Perplexity: 255.35
Epoch [1/5], Step[800/2021], Loss: 5.3890, Perplexity: 218.99
Epoch [1/5], Step[900/2021], Loss: 5.2950, Perplexity: 199.34
Epoch [1/5], Step[1000/2021], Loss: 5.2092, Perplexity: 182.95
Epoch [1/5], Step[1100/2021], Loss: 5.4622, Perplexity: 235.62
Epoch [1/5], Step[1200/2021], Loss: 5.3439, Perplexity: 209.33
Epoch [1/5], Step[1300/2021], Loss: 5.0160, Perplexity: 150.81
Epoch [1/5], Step[1400/2021], Loss: 5.4980, Perplexity: 244.21
Epoch [1/5], Step[1500/2021], L

In [41]:
# del model
# torch.cuda.empty_cache()

In [42]:
model.load_state_dict(torch.load("saved_model/rnn_model.ckpt"))
model.to(device)

RNNLanguageModel(
  (embed): Embedding(44689, 128)
  (lstm): LSTM(128, 1024, batch_first=True)
  (linear): Linear(in_features=1024, out_features=44689, bias=True)
)

In [43]:
# Test the model
with torch.no_grad():
    total_perplexity = 0.0
    sequence_counter = 0

    states = (torch.zeros(num_layers, batch_size, hidden_size).to(device),
            torch.zeros(num_layers, batch_size, hidden_size).to(device))

    num_test_batches = test_data.size(1) // seq_length

    model.eval()
    for i in range(0, test_data.size(1) - seq_length, seq_length):
        # Get mini-batch inputs and targets
        inputs = test_data[:, i:i+seq_length].to(device)
        targets = test_data[:, (i+1):(i+1)+seq_length].to(device)

        # Forward pass
        states = detach(states)
        outputs, states = model(inputs, states)
        loss = criterion(outputs, targets.reshape(-1))

        current_seq_perplexity = np.exp(loss.item())
        total_perplexity += current_seq_perplexity
        sequence_counter += 1

        step = (i+1) // seq_length
        if step % 100 == 0:
            print ('Step[{}/{}], Perplexity: {:5.2f}'.format(step, num_test_batches, current_seq_perplexity))
    
    print(f"Test Set Perplexity: {total_perplexity / sequence_counter}")

Step[0/550], Perplexity: 226.14
Step[100/550], Perplexity: 132.66
Step[200/550], Perplexity: 190.19
Step[300/550], Perplexity: 148.76
Step[400/550], Perplexity: 166.58
Step[500/550], Perplexity: 130.81
Test Set Perplexity: 154.42289450497304
