In [None]:
# -*- coding: utf-8 -*-
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

This jupyter notebook is based on https://www.kaggle.com/code/gabrielloye/simple-lstm-using-pytorch/notebook

The article with additional explanation can be found [here](https://blog.floydhub.com/long-short-term-memory-from-zero-to-hero-with-pytorch/)

# LSTM Sentiment Analysis on Product Reviews

For this jupyter notebook, we’ll be using the Amazon customer reviews dataset which can be found on [Kaggle](https://www.kaggle.com/bittlingmayer/amazonreviews). The dataset contains a total of 4 million reviews with each review labelled to be of either positive or negative sentiment. 

Our goal will be to create an LSTM model that can accurately classify and distinguish the sentiment of a review. To do so, we’ll have to start with some data-preprocessing, defining and training the model, followed by assessing the model.

For our data pre-processing steps, we'll be using *regex*, *numpy* and the *NLTK (Natural Language Toolkit)* library for some simple NLP helper functions. As the data is compressed in the *bz2* format, we'll use the Python *bz2* module to read the data.

**IMPORTANT:** it is highly recommended to use GPU in this notebook. Otherwise the training process will be much longer. Luckily, configuring GPU for pytorch is a lot easier than for Tensorflow. It is enough to execute one single command to install the GPU version of pytorch along with CUDA. See the instructions on https://pytorch.org/get-started/locally/#start-locally (choose your system, any version of CUDA and execute the generated command)

In [None]:
#!pip install nltk
#!pip install contractions

In [None]:
import bz2
from collections import Counter
import re
import nltk
import numpy as np
import torch
from torch.utils.data import TensorDataset, DataLoader
import torch.nn as nn
import contractions
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
import wandb
# Fetch the NLTK resources used below: tokenizer models for word_tokenize,
# WordNet for WordNetLemmatizer and the English stop-word list.
# 'punkt_tab' is the tokenizer resource loaded by recent NLTK releases (it
# replaced the pickled 'punkt' models); 'punkt' is kept for older releases.
for resource in ('punkt', 'punkt_tab', 'wordnet', 'stopwords'):
    nltk.download(resource)

In [None]:
def read_file_lines(file, n):
    """Return a list with at most the first ``n`` items produced by iterating ``file``.

    Iteration stops at the first item whose position is ``n`` or beyond, or
    when ``file`` is exhausted, whichever happens first.
    """
    collected = []
    for position, line in enumerate(file):
        if position < n:
            collected.append(line)
        else:
            break
    return collected

The original dataset contains a total of 4 million reviews - 3.6 million for training and 0.4 million for testing. However, with this notebook you get a dataset with 300,000 training reviews and 60,000 test reviews. We won't be using the entire dataset, in order to save time. If you have the computing power and capacity, go ahead and train the model on a larger portion of the data.

In [None]:
# Open the bz2-compressed train and test files. The lines read from these
# handles are bytes; they are decoded to str in a later cell.
train_file = bz2.BZ2File('data/train.ft.txt.bz2')
test_file = bz2.BZ2File('data/test.ft.txt.bz2')

Feel free to adjust the number of training and testing reviews. Decrease if training is too long or increase if you have enough computational power.

In [None]:
# Upper bounds on how many reviews are read from each file.
num_train = 300000 # The more reviews, the longer the training, but the better the accuracy
num_test = 60000 

# Rebind the names from the open file handles to lists holding at most the
# first num_train / num_test raw lines of each file.
train_file = read_file_lines(train_file, num_train)
test_file = read_file_lines(test_file, num_test)

In [None]:
# Report how many reviews were actually read; this can be lower than
# num_train / num_test when a file contains fewer lines.
print("Number of training reviews: " + str(len(train_file)))
print("Number of test reviews: " + str(len(test_file)))

In [None]:
# Turn the raw byte lines into UTF-8 decoded strings, keeping at most
# num_train / num_test of them.
train_file = [str(raw_line, 'utf-8') for raw_line in train_file[:num_train]]
test_file = [str(raw_line, 'utf-8') for raw_line in test_file[:num_test]]

Now, let's look at what the dataset looks like.

In [None]:
# Print two raw lines (label prefix included) as examples of each class.
print(train_file[0]) # customer did not like the product (1-2 stars review)
print(train_file[6]) # customer liked the product (4-5 stars review)

```__label__1``` corresponds to 1- and 2-star reviews, and ```__label__2``` corresponds to 4- and 5-star reviews.

The review titles, followed by ':' and a space, are prepended to the text.

Most of the reviews are in English, but there are a few in other languages, like Spanish.

Next, we'll have to extract the labels from the sentences. The data is in the format ```__label__1/2 <sentence>```, therefore we can easily split it accordingly. Positive sentiment labels are stored as 1 and negative ones are stored as 0.

In [None]:
# Split every line "<label> <review text>" into its two parts.
# Label: 0 when the first space-separated field is '__label__1', 1 otherwise.
# Text: everything after the first space, minus its final character
# (presumably the trailing line break), lower-cased.
train_labels = [int(line.split(' ')[0] != '__label__1') for line in train_file]
test_labels = [int(line.split(' ')[0] != '__label__1') for line in test_file]

train_sentences = [line.split(' ', 1)[1][:-1].lower() for line in train_file]
test_sentences = [line.split(' ', 1)[1][:-1].lower() for line in test_file]

Now, let's clean the data. You have plenty of options here. Try out the provided functions and find out which data cleaning methods work best for you.

In [None]:
# Objects shared by the cleaning helpers below: the lemmatizer used by
# lemmatize() and the English stop words (as a set) used by remove_banned_words().
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def remove_banned_words(text):
    """Drop every whitespace-separated token of ``text`` found in ``stop_words``.

    The remaining tokens are joined back together with single spaces.
    """
    kept = [token for token in text.split() if token not in stop_words]
    return ' '.join(kept)

def expand_contractions(text):
    """Return ``text`` after passing it through ``contractions.fix``."""
    expanded = contractions.fix(text)
    return expanded

def remove_numbers(text):
    """Delete every run of digits from ``text``; all other characters are kept."""
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', text)

def lemmatize(text):
    """Tokenize ``text`` with ``word_tokenize``, lemmatize each token with the
    shared ``lemmatizer`` and join the results with single spaces."""
    return ' '.join(lemmatizer.lemmatize(token) for token in word_tokenize(text))

def remove_short_words(text, min_len=2):
    """Keep only the whitespace-separated tokens with at least ``min_len`` characters.

    The surviving tokens are joined with single spaces.
    """
    return ' '.join(filter(lambda token: len(token) >= min_len, text.split()))

def replace_elongated_words(text):
    """Collapse every run of three or more identical letters to a single letter.

    For example "sooooo" becomes "so" and "yeeesss" becomes "yes". Runs of one
    or two letters are left alone, so ordinary double letters ("hello") survive.
    Only letters are collapsed; digits and underscores are untouched, so a
    number such as "1000" is not altered.

    Caveat: a word whose base form contains a double letter is still reduced
    to a single one when elongated ("goood" -> "god").
    """
    # [^\W\d_] is "a word character that is neither a digit nor an underscore",
    # i.e. a letter; the back-reference requires at least two more copies of it.
    return re.sub(r'([^\W\d_])\1{2,}', r'\1', text)

def remove_punctuation(text):
    """Strip every character that is neither a regex word character nor whitespace."""
    not_word_or_space = re.compile(r'[^\w\s]')
    return not_word_or_space.sub('', text)

def remove_urls(text):
    """Replace each http(s) URL, and each token starting with '@', by the
    placeholder ``<url>``.

    A match runs from the 'http://', 'https://' or '@' prefix up to the next
    whitespace character.
    """
    url_or_mention = re.compile(r"(?:\@|https?\://)\S+")
    return url_or_mention.sub("<url>", text)

def clean_sentence(sentence):
    """Run ``sentence`` through the enabled cleaning steps and return the result.

    The steps are applied from top to bottom; each one receives the output of
    the previous one. Uncomment an entry to enable that step.
    """
    steps = (
        remove_urls,
        remove_numbers,
        remove_banned_words,
        # replace_elongated_words,
        # expand_contractions,
        # lemmatize,
        remove_punctuation,
        # remove_short_words,
    )
    for step in steps:
        sentence = step(sentence)
    return sentence

In [None]:
# Data cleaning: pass every review through clean_sentence, replacing the
# contents of both lists in place.
train_sentences[:] = [clean_sentence(review) for review in train_sentences]
test_sentences[:] = [clean_sentence(review) for review in test_sentences]

In [None]:
# Labels and sentences have been extracted, so drop the references to the
# decoded raw lines.
del train_file, test_file

After quickly cleaning the data, we will do tokenization of the sentences, which is a standard NLP task. 
Tokenization is the task of splitting a sentence into the individual tokens, which can be words or punctuation, etc.
There are many NLP libraries that are able to do this, such as *spaCy* or *Scikit-learn*, but we will be using *NLTK* here as it has one of the faster tokenizers.

The words will then be stored in a dictionary mapping the word to its number of appearances. These words will become our **vocabulary**.

In [None]:
words = Counter() # Maps each token to the number of times it appears in the training sentences
num_sentences = len(train_sentences)
for i, sentence in enumerate(train_sentences):
    # Lower-case once, so the tokens stored in the sentence are exactly the
    # keys counted in the vocabulary.
    tokens = [word.lower() for word in nltk.word_tokenize(sentence)]
    words.update(tokens)
    # The sentence is now stored as a list of tokens instead of a string
    train_sentences[i] = tokens
    if i % 15000 == 0:
        # Progress relative to the number of sentences actually loaded, which
        # may be smaller than num_train
        print(str(i/(num_sentences/100)) + "% done")
print("100% done")
words.most_common(10)

To remove typos and words that likely don't exist, we'll remove all words from the vocab that only appear once throughout.
To account for **unknown** words and **padding**, we'll have to add them to our vocabulary as well. Each word in the vocabulary will then be assigned an integer index and thereafter mapped to this integer.

In [None]:
# Keep only the tokens that were counted at least twice
words = {token: count for token, count in words.items() if count >= 2}

# Vocabulary list: the padding and unknown markers first (indices 0 and 1),
# followed by the remaining tokens from most to least frequent
words = ['_PAD', '_UNK'] + sorted(words, key=words.get, reverse=True)

# Lookup tables in both directions between a token and its integer index
word2idx = {token: index for index, token in enumerate(words)}
idx2word = dict(enumerate(words))

With the mappings, we'll convert the words in the sentences to their corresponding indexes.

In [None]:
# Index used for every token that is not in the vocabulary
unk_index = word2idx['_UNK']

# Training sentences are already token lists: map each token to its index
for i, sentence in enumerate(train_sentences):
    train_sentences[i] = [word2idx.get(word, unk_index) for word in sentence]

# Test sentences are still strings: tokenize, lower-case, then map to indices
for i, sentence in enumerate(test_sentences):
    test_sentences[i] = [word2idx.get(word.lower(), unk_index) for word in nltk.word_tokenize(sentence)]

In the last pre-processing step, we'll be padding the sentences with 0s and shortening the lengthy sentences so that the data can be trained in batches to speed things up.

In [None]:
# Defining a function that either shortens sentences or pads sentences with 0 to a fixed length

def pad_input(sentences, seq_len):
    """Build a ``(len(sentences), seq_len)`` integer array from index sequences.

    Each non-empty sequence is cut to its first ``seq_len`` entries and written
    right-aligned into its row, so shorter sequences are preceded by zeros.
    Empty sequences leave their row filled with zeros.
    """
    features = np.zeros((len(sentences), seq_len), dtype=int)
    for row, review in enumerate(sentences):
        if len(review) == 0:
            continue
        tokens = np.array(review)[:seq_len]
        # Right-align: the tokens fill the last len(tokens) columns of the row
        features[row, seq_len - len(tokens):] = tokens
    return features

In [None]:
seq_len = 200 # The length that the sentences will be padded/shortened to

# Replace the lists of index sequences with fixed-width integer arrays
# (one row per review, seq_len columns).
train_sentences = pad_input(train_sentences, seq_len)
test_sentences = pad_input(test_sentences, seq_len)

In [None]:
# Converting our labels (lists of 0/1) into numpy arrays so that
# torch.from_numpy can wrap them later
train_labels = np.array(train_labels)
test_labels = np.array(test_labels)

A padded sentence will look something like this, where 0 represents the padding: 

In [None]:
# Display the first padded test review (a row of seq_len word indices)
test_sentences[0]

Our dataset is already split into *training* and *testing* data. However, we still need a set of data for validation during training. Therefore, we will split our test data by half into a validation set and a testing set. A detailed explanation on dataset splits can be found [here](https://machinelearningmastery.com/difference-test-validation-datasets/).

In [None]:
# Fraction of the loaded test reviews that goes to the validation set
split_frac = 0.5
# Row index separating validation (before) from test (from this row on)
split_id = int(split_frac * len(test_sentences))
val_sentences, test_sentences = test_sentences[:split_id], test_sentences[split_id:]
val_labels, test_labels = test_labels[:split_id], test_labels[split_id:]

Next, this is the point where we’ll start working with the PyTorch library. We’ll first define the datasets from the sentences and labels, followed by loading them into a data loader. We set the batch size to 400. This can be tweaked according to your needs.

In [None]:
# Wrap the padded index arrays and their labels as tensor datasets
train_data = TensorDataset(torch.from_numpy(train_sentences), torch.from_numpy(train_labels))
val_data = TensorDataset(torch.from_numpy(val_sentences), torch.from_numpy(val_labels))
test_data = TensorDataset(torch.from_numpy(test_sentences), torch.from_numpy(test_labels))

# how many reviews are used in training in one iteration
batch_size = 400 # decrease this value if you get CUDA out of memory error during training

# Create loaders that serve the datasets batch by batch.
# drop_last=True keeps every batch at exactly batch_size samples (a trailing
# partial batch is skipped).
# Only the training data is reshuffled every epoch. Validation and test data
# are read in a fixed order, so the evaluated samples (and therefore the
# reported metrics) are the same on every run.
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size, drop_last=True)
val_loader = DataLoader(val_data, shuffle=False, batch_size=batch_size, drop_last=True)
test_loader = DataLoader(test_data, shuffle=False, batch_size=batch_size, drop_last=True)

We can also check whether a GPU is available, which can speed up our training many times over.


In [None]:
# Pick the device used for the model and the batches later on: the CUDA GPU
# when torch reports one as available, the CPU otherwise.
is_cuda = torch.cuda.is_available()
device = torch.device("cuda" if is_cuda else "cpu")
print("GPU is available" if is_cuda else "GPU not available, CPU used")

In [None]:
# Pull a single batch from the training loader to inspect the tensor shapes
dataiter = iter(train_loader)
sample_x, sample_y = next(dataiter)

print(sample_x.shape, sample_y.shape)

At this point, we will be defining the architecture of the model. At this stage, we could create a neural network that has many deep layers and a large number of LSTM layers stacked on top of each other. However, a simple model such as the one below works quite well and requires much less training time. We will be training our own word embeddings in the first layer before the sentences are fed into the LSTM layer.

The final layer is a fully connected layer with a sigmoid function to classify whether the review is of positive/negative sentiment.

In [None]:
# import torch.nn as nn

class SentimentNet(nn.Module):
    """Embedding -> stacked LSTM -> dropout -> linear -> sigmoid classifier.

    Args:
        vocab_size: number of rows of the embedding table.
        output_size: number of output units of the final linear layer.
        embedding_dim: size of each embedding vector.
        hidden_dim: size of the LSTM hidden state.
        n_layers: number of stacked LSTM layers.
        drop_prob: dropout probability applied to the LSTM output before the
            linear layer.
        drop_prob_lstm: dropout probability passed to ``nn.LSTM``.
    """

    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, drop_prob=0.2, drop_prob_lstm=0.5):
        super().__init__()
        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=drop_prob_lstm, batch_first=True)

        # dropout layer
        self.dropout = nn.Dropout(drop_prob)
        # fully connected layer
        self.fc = nn.Linear(hidden_dim, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x, hidden=None):
        """Score a batch of index sequences.

        Args:
            x: integer tensor of word indices, batch dimension first.
            hidden: ``(h_0, c_0)`` tuple for the LSTM, or ``None`` to let the
                LSTM start from zeros.

        Returns:
            ``(out, hidden)`` where ``out`` has shape ``(batch,)`` and holds the
            sigmoid of the last output unit at the last time step, and
            ``hidden`` is the LSTM state after the sequence.
        """
        x = x.long()
        embeds = self.embedding(x)
        lstm_out, hidden = self.lstm(embeds, hidden)

        # Only the last time step is returned, so the dropout / linear /
        # sigmoid stack is applied to that step alone instead of to every
        # position of the sequence.
        last_step = lstm_out[:, -1, :]
        out = self.sigmoid(self.fc(self.dropout(last_step)))

        # Keep the last output unit, giving one value per sample
        out = out[:, -1]
        return out, hidden

    def init_hidden(self, batch_size):
        """Return a zero ``(h_0, c_0)`` tuple for ``batch_size`` sequences.

        The tensors are created with the dtype and device of the model's own
        parameters, so they follow the model wherever it has been moved.
        """
        weight = next(self.parameters())
        shape = (self.n_layers, batch_size, self.hidden_dim)
        return (weight.new_zeros(shape), weight.new_zeros(shape))

Take note that we can actually load pre-trained word embeddings such as GloVe or fastText which can increase the model’s accuracy and decrease training time.

With this, we can instantiate our model after defining the arguments. The output dimension will only be 1 as it only needs to output 1 or 0. The learning rate, loss function and optimizer are defined as well.


In [None]:
# word2idx already contains the '_PAD' (index 0) and '_UNK' (index 1) entries,
# so its length is exactly the number of embedding rows needed.
vocab_size = len(word2idx)
output_size = 1
embedding_dim = 86
hidden_dim = 256
drop_prob = 0.2
drop_prob_lstm = 0.5

# number of recurrent layers in the network
n_layers = 2

model = SentimentNet(vocab_size, output_size, embedding_dim, hidden_dim, n_layers, drop_prob, drop_prob_lstm)

# load model to GPU memory, if GPU is available. Otherwise load to RAM memory
model.to(device)
print(model)

In [None]:
def acc(pred, label):
    """Return how many predictions match their labels, as a Python number.

    ``pred`` is squeezed and rounded to the nearest integer before being
    compared element-wise with the squeezed ``label``.
    """
    rounded = pred.squeeze().round()
    matches = rounded.eq(label.squeeze())
    return matches.sum().item()

In [None]:
learning_rate = 0.002 # learning rate - determines the size of step in gradient descent
# Binary cross-entropy on the model's sigmoid outputs
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Maximum gradient norm used for gradient clipping during training
clip = 5
epochs = 10
# Best validation loss seen so far; np.inf (the np.Inf alias no longer exists
# in NumPy 2.0) guarantees that the first epoch counts as an improvement.
valid_loss_min = np.inf

Before we train our model, let's start a wandb session for monitoring the training. Before running the following block, you have to register on https://wandb.ai/site and get a token, then execute ```wandb login``` in a terminal and pass the token there. The advantage of wandb is that you can track your hyperparameter tuning process and have all monitoring data in one place.

In [None]:
# Start a wandb run and attach the hyperparameters defined in the cells above
# as its config.
wandb.init(
    # set the wandb project where this run will be logged, can be any name
    project="ISA-LSTM-notebook",
    
    # this info is only metadata, it does not have any influence on training
    # pass your hyperparameters here, so that you would have a complete report from your training
    config={
        "architecture": "SentimentNet",
        "dataset": "amazonreviews",
        "embedding_dim": embedding_dim,
        "hidden_dim": hidden_dim,
        "recurrent_layers": n_layers,
        "output_dimension": output_size,
        "learning_rate": learning_rate,
        "optimizer": "Adam",
        "batch_size": batch_size,
        "epochs": epochs,
        "clip_grad_norm": clip,
        "vocabulary_size": vocab_size,
        "sequence_length": seq_len,
        "drop_prob": drop_prob,
        "drop_prob_lstm": drop_prob_lstm,
        "train_data_rows": num_train,
        "test_data_rows": num_test,
        "validation_split_frac": split_frac
    }
)

Finally, we can start training the model. For every epoch, we’ll be checking the output of our model against the validation dataset and saving the model if it performed better than the previous epoch.
The state_dict is the model’s weights in PyTorch which can be loaded into a model with the same architecture at a separate time or script altogether.

This step can take a lot of time, so be sure you are using GPU. You can decrease the amount of data as well.

#### **TROUBLESHOOTING:** if you get CUDA memory related error, just decrease the batch size (see the previous steps to find **batch_size** variable) and restart kernel to free the GPU memory

In [None]:
# here we will store the statistical data representing training progress,
# which will be later shown on graphs
epoch_tr_loss, epoch_vl_loss = [], []
epoch_tr_acc, epoch_vl_acc = [], []

# train model in epochs
for epoch in range(epochs):
    # ---------------- training pass ----------------
    train_losses = []
    train_acc = 0.0
    train_seen = 0  # number of samples actually fed to the model this epoch

    # set model in training mode
    model.train()

    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)

        # Every batch holds unrelated reviews, so each one starts from a fresh
        # zero hidden state sized to the batch actually received.
        h = model.init_hidden(inputs.size(0))

        # clear the gradients from previous iteration
        model.zero_grad()
        output, h = model(inputs, h)

        # compute loss: how far is the prediction from the actual?
        # reshape(-1) keeps predictions 1-D even for a batch of one sample
        loss = criterion(output.reshape(-1), labels.float())

        # compute gradients for neurons (perform backpropagation)
        loss.backward()

        # `clip_grad_norm` contributes to preventing the exploding gradient problem in RNNs / LSTMs.
        nn.utils.clip_grad_norm_(model.parameters(), clip)

        # update the weights
        optimizer.step()

        # calculating accuracy (number of correct predictions in this batch)
        accuracy = acc(output, labels)
        batch_count = labels.size(0)

        # sending accuracy and loss to wandb for graph creation
        wandb.log({"Train Batch Accuracy": accuracy/batch_count, "Train Batch Loss": loss.item()})

        train_acc += accuracy
        train_seen += batch_count
        train_losses.append(loss.item())

    # ---------------- validation pass ----------------
    val_losses = []
    val_acc = 0.0
    val_seen = 0

    # set model in evaluation mode
    model.eval()
    # no gradients are needed for evaluation; skipping them saves memory and time
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            val_h = model.init_hidden(inputs.size(0))
            output, val_h = model(inputs, val_h)
            val_loss = criterion(output.reshape(-1), labels.float())

            val_losses.append(val_loss.item())
            val_acc += acc(output, labels)
            val_seen += labels.size(0)

    # calculate and print statistical data representing training progress
    # Accuracy is divided by the number of samples actually evaluated, which is
    # smaller than the dataset size if the loader drops a partial batch.
    epoch_train_loss = np.mean(train_losses)
    epoch_val_loss = np.mean(val_losses)
    epoch_train_acc = train_acc/max(train_seen, 1)
    epoch_val_acc = val_acc/max(val_seen, 1)
    epoch_tr_loss.append(epoch_train_loss)
    epoch_vl_loss.append(epoch_val_loss)
    epoch_tr_acc.append(epoch_train_acc)
    epoch_vl_acc.append(epoch_val_acc)
    wandb.log({"Train Epoch Accuracy": epoch_train_acc,
               "Train Epoch Loss": epoch_train_loss,
               "Validation Epoch Accuracy": epoch_val_acc,
               "Validation Epoch Loss": epoch_val_loss})

    print(f'Epoch {epoch+1}')
    print(f'train_loss : {epoch_train_loss} val_loss : {epoch_val_loss}')
    print(f'train_accuracy : {epoch_train_acc*100} val_accuracy : {epoch_val_acc*100}')

    # Checkpoint while the validation loss keeps improving; stop at the first
    # epoch where it gets worse.
    if epoch_val_loss <= valid_loss_min:
        torch.save(model.state_dict(), './state_dict.pt')
        print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(valid_loss_min,epoch_val_loss))
        valid_loss_min = epoch_val_loss
    else:
        print(25*'==')
        print('Model started overtraining, stopping...')
        break
    print(25*'==')

wandb.finish()

In [None]:
# Two side-by-side panels: accuracy per epoch (left) and loss per epoch (right),
# each comparing the training and validation curves.
fig, (acc_ax, loss_ax) = plt.subplots(1, 2, figsize=(20, 6))

acc_ax.plot(epoch_tr_acc, label='Train Acc')
acc_ax.plot(epoch_vl_acc, label='Validation Acc')
acc_ax.set_title("Accuracy")
acc_ax.legend()
acc_ax.grid()

loss_ax.plot(epoch_tr_loss, label='Train loss')
loss_ax.plot(epoch_vl_loss, label='Validation loss')
loss_ax.set_title("Loss")
loss_ax.legend()
loss_ax.grid()

plt.show()

After we’re done training, it's time to test our model on a dataset it has never seen before - our test dataset.
We'll first load the model weights from the point where the validation loss is the lowest.
We can calculate the accuracy of the model to see how accurate our model’s predictions are.

In [None]:
# loading the best model (the checkpoint written during training)
# map_location places the stored tensors on the current device, so the
# checkpoint also loads when it was saved on a different device.
model.load_state_dict(torch.load('./state_dict.pt', map_location=device))

In [None]:
test_losses = []
num_correct = 0
num_seen = 0  # number of test samples actually evaluated

model.eval()
# no gradients are needed for evaluation
with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)

        # Each batch holds unrelated reviews: start from a fresh zero hidden
        # state sized to the batch actually received.
        h = model.init_hidden(inputs.size(0))
        output, h = model(inputs, h)

        # reshape(-1) keeps predictions 1-D even for a batch of one sample
        scores = output.reshape(-1)
        test_loss = criterion(scores, labels.float())
        test_losses.append(test_loss.item())

        pred = torch.round(scores) # rounds the output to 0/1
        num_correct += pred.eq(labels.float().view_as(pred)).sum().item()
        num_seen += labels.size(0)

print("Test loss: {:.3f}".format(np.mean(test_losses)))
# Divide by the samples actually evaluated, which is smaller than the dataset
# size if the loader drops a partial batch.
test_acc = num_correct/max(num_seen, 1)
print("Test accuracy: {:.3f}%".format(test_acc*100))