## IMDb Sentiment Analysis using LSTM PyTorch

In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.14.4-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.3/519.3 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m24.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0.0,>=0.14.0 (from datasets)
  Downloading huggingface_hub-0.16.4-py3-none-a

## Import required modules

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchtext
import tqdm

import functools
import sys

import datasets
import matplotlib.pyplot as plt
import numpy as np

## Set Seed and enable GPU

In [3]:
torch.manual_seed(42) #its good practice to add a seed value

<torch._C.Generator at 0x7ed97839c950>

### Important point to note

To switch to CUDA(GPU), follow the below steps:
- Click on Runtime
- Click on `Change runtime type`
- Select Hardware Accelerator as `T4 GPU`

In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:
device

device(type='cuda')

## Download the Dataset using datasets library by HuggingFace

In [61]:
train_data, test_data = datasets.load_dataset('imdb', split=['train', 'test'])

## Torchtext

Torchtext is a library made for NLP lovers. This contains most of the pre-processing required for Text data

## Tokenize the sequences

In [62]:
tokenizer = torchtext.data.utils.get_tokenizer('basic_english')

In [63]:
def tokenize_data(example, tokenizer, max_length):
    tokens = tokenizer(example['text'])[:max_length]
    length = len(tokens)
    return {'tokens': tokens, 'length': length}

In [64]:
max_length = 256
train_token = train_data.map(tokenize_data, fn_kwargs={'tokenizer': tokenizer, 'max_length': max_length})
test_token = test_data.map(tokenize_data, fn_kwargs={'tokenizer': tokenizer, 'max_length': max_length})

## Before:

Notice that dataset only contains text and label

In [65]:
train_data

Dataset({
    features: ['text', 'label'],
    num_rows: 25000
})

## After

Notice that now we have tokens and length appended to the data

In [66]:
train_token

Dataset({
    features: ['text', 'label', 'tokens', 'length'],
    num_rows: 25000
})

In [67]:
print(train_token['text'][:500])



In [68]:
print(train_token['label'][:500])

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [69]:
print(train_token['tokens'][:500])



## Split the train data after the tokenization to avoid data leakage

In [71]:
train_valid_data = train_token.train_test_split(test_size = 0.2)

In [72]:
train_data = train_valid_data['train']
valid_data = train_valid_data['test']

In [73]:
vocab = torchtext.vocab.build_vocab_from_iterator(train_data['tokens'],
                                                  specials=['<UNK>', '<PAD>'],
                                                  min_freq= 10)

In [74]:
vocab['<UNK>']

0

In [75]:
vocab.set_default_index(0)

### Note: Why do we need UNK and PAD?

Let's say we have a large corpus of text data. During tokenization we usually fit in all the train data. When we have a new text, if the model encounters a new word, it will assign it as <UNK>, which stands for unknown.

Let's take a few sample movie reivews:
- I loved this movie
- Amazing
- Impressive storyline
- Terrible experience not recommended to watch

If you look at the above statements, all have different word sizes. To ensure we pass the model with the same size, we pad all the sentences to be in the same size. We set the max length to be some value if the sequence has more than the threshold it truncates the padding. If it is less than the sequence, it pad and fill the sequence with zero.

## Prepare the dataset for the model

In [76]:
def convert_into_tokens(example, vocab):
    ids = [vocab[token] for token in example['tokens']]
    return {'ids': ids}

In [78]:
#this data will be used for training
train_data = train_data.map(convert_into_tokens, fn_kwargs={'vocab': vocab})
# this data will be used for evaluation
valid_data = valid_data.map(convert_into_tokens, fn_kwargs={'vocab': vocab})
# this the data that we use for generalization [New unseen data for testing]
test_data = test_token.map(convert_into_tokens, fn_kwargs={'vocab': vocab})

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

## Look now we have `ids` as the next data appended

#### Token is the individual representation of the word and the ids is the equivalent numerical value to that token

In [59]:
train_data['ids'][:10]

[[12,
  311,
  5,
  174,
  706,
  7,
  14,
  242,
  18,
  74,
  38,
  308,
  15,
  2,
  23,
  3467,
  28,
  21,
  106,
  210,
  6,
  15,
  11,
  17,
  650,
  71,
  0,
  675,
  3,
  12,
  939,
  15,
  432,
  4,
  2,
  23,
  277,
  28,
  5,
  236,
  106,
  210,
  25,
  0,
  68,
  31,
  88,
  211,
  3,
  3,
  3,
  24,
  22,
  12,
  474,
  94,
  29,
  109,
  11,
  0,
  2,
  860,
  579,
  3,
  1199,
  1326,
  9,
  0,
  769,
  1104,
  6,
  577,
  13094,
  96,
  1767,
  4688,
  4,
  4784,
  6,
  1475,
  2901,
  30,
  128,
  2102,
  13,
  5,
  14473,
  1365,
  3,
  2,
  18,
  314,
  100,
  65,
  13,
  842,
  90,
  1326,
  10994,
  6635,
  25,
  4580,
  0,
  9,
  16,
  6,
  2537,
  13107,
  9,
  16,
  126,
  30,
  0,
  13,
  15,
  579,
  24,
  22,
  99,
  13,
  11443,
  41,
  7823,
  7983,
  13,
  1326,
  9,
  6542,
  7,
  660,
  3,
  11,
  10,
  0,
  8,
  73,
  41,
  0,
  4,
  0,
  6,
  0,
  6835,
  433,
  146,
  5,
  9245,
  6,
  2623,
  7686,
  3,
  432,
  2,
  18,
  169,
  9,
  27,
  3588,


In [79]:
train_data = train_data.with_format(type='torch', columns=['ids', 'label', 'length'])
valid_data = valid_data.with_format(type='torch', columns=['ids', 'label', 'length'])
test_data = test_data.with_format(type='torch', columns=['ids', 'label', 'length'])

## Model building- LSTM

In [86]:
class LSTMmodel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers,dropout_rate, pad_index):
        super().__init__()
        # layer 1- Pass the ids to the embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_index)
        # layer 2- LSTM [If n_layers = 2, then layer 3 is also LSTM]
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers,dropout=dropout_rate, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout_rate) # to avoid overfitting

    def forward(self, ids, batch_size):
        # token to embeddings
        embedded = self.dropout(self.embedding(ids))
        embdedded = nn.utils.rnn.pack_padded_sequence(embedded, batch_size, batch_first=True,enforce_sorted=False)
        # embedding sequence (batch_size,seq_length,emd_dim) to LSTM

        outputs, (hidden, cell) = self.lstm(embdedded)

        output, output_length = nn.utils.rnn.pad_packed_sequence(outputs)
        hidden = self.dropout(hidden[-1])

        prediction = self.fc(hidden)
        return prediction

In [103]:
vocab_size = len(vocab)
embedding_dim = 128
hidden_dim = 64
output_dim = len(train_data.unique('label')) # either 0 or 1 = 2(length)
n_layers = 2
dropout_rate = 0.5

model = LSTMmodel(vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, dropout_rate,vocab['<PAD>'])
model = model.to(device) #switch our modeling training in GPU

In [98]:
sum(p.numel() for p in model.parameters() if p.requires_grad) # total parameters

1945474

In [99]:
def initialize_weights(m):
    if isinstance(m, nn.Linear):
        nn.init.xavier_normal_(m.weight)
        nn.init.zeros_(m.bias)
    elif isinstance(m, nn.LSTM):
        for name, param in m.named_parameters():
            if 'bias' in name:
                nn.init.zeros_(param)
            elif 'weight' in name:
                nn.init.orthogonal_(param)

In [100]:
model.apply(initialize_weights)

LSTMmodel(
  (embedding): Embedding(14550, 128, padding_idx=1)
  (lstm): LSTM(128, 64, num_layers=2, batch_first=True, dropout=0.5)
  (fc): Linear(in_features=64, out_features=2, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

## Using Pre-trained embeddings

Few of the most used Pre-trained embedding are:
- GloVe
- Word2Vec
- FastText

In [94]:
vectors = torchtext.vocab.GloVe() # extra pre-trained embebedding

.vector_cache/glove.840B.300d.zip: 2.18GB [06:51, 5.29MB/s]                            
100%|█████████▉| 2196016/2196017 [05:31<00:00, 6631.53it/s]


In [95]:
pretrained_embedding = vectors.get_vecs_by_tokens(vocab.get_itos())

In [96]:
model.embedding.weight.data = pretrained_embedding

## Compile Model

- Three important parameter that influence the model are:
  - Optimizer- algorithm for gradient descent [Adam, SGD, RMSProp]
  - Loss function- Binary cross entropy loss or CrossEntropy loss
  - Evaluation performance metrics [Accuracy, Precision, Recall]

In [120]:
learning_rate = 1e-4
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
loss_function = nn.CrossEntropyLoss().to(device)

def metrics(prediction, actual):
    batch_size, _ = prediction.shape
    predicted_classes = prediction.argmax(dim=-1)
    correct_predictions = predicted_classes.eq(actual).sum()
    accuracy = correct_predictions / batch_size
    return accuracy

In [111]:
def collate(batch, pad_index):
    batch_ids = [i['ids'] for i in batch]
    batch_ids = nn.utils.rnn.pad_sequence(batch_ids, padding_value=pad_index, batch_first=True)
    batch_length = [i['length'] for i in batch]
    batch_length = torch.stack(batch_length)
    batch_label = [i['label'] for i in batch]
    batch_label = torch.stack(batch_label)
    batch = {'ids': batch_ids,
             'length': batch_length,
             'label': batch_label}
    return batch

## Fit the data to the model

In [121]:
batch_size = 64
collate = functools.partial(collate, pad_index= vocab['<PAD>'])

train_dataloader = torch.utils.data.DataLoader(train_data,
                                               batch_size=batch_size,
                                               collate_fn=collate,
                                               shuffle=True)

valid_dataloader = torch.utils.data.DataLoader(valid_data, batch_size=batch_size, collate_fn=collate)
test_dataloader = torch.utils.data.DataLoader(test_data, batch_size=batch_size, collate_fn=collate)

## Train the model

In [124]:
def train(dataloader, model, loss_function, optimizer, device):
    model.train()

    epoch_losses = []
    epoch_accs = []

    for batch in tqdm.tqdm(dataloader, desc='training...', file=sys.stdout):
        ids = batch['ids'].to(device)

        #batch length - dataloader
        length = batch['length']
        label = batch['label'].to(device)

        # y_hat = prediction from the model
        prediction = model(ids, length)
        # loss function - > Actual value, predicted value
        # actual value - label
        # predicted value is prediction
        loss = loss_function(prediction, label) #loss

        accuracy = metrics(prediction, label) #
        optimizer.zero_grad() #adam -> gradient descent

        loss.backward()
        optimizer.step() #we will update the weights with learning rate

        epoch_losses.append(loss.item())
        epoch_accs.append(accuracy.item())

    return epoch_losses, epoch_accs

## Evaluation

In [126]:
def evaluate(dataloader, model, loss_function, device):

    model.eval()
    epoch_losses = []
    epoch_accs = []

    with torch.no_grad(): # no optimization -> no update in weightds
        for batch in tqdm.tqdm(dataloader, desc='evaluating...', file=sys.stdout):
            ids = batch['ids'].to(device)
            length = batch['length']
            label = batch['label'].to(device)
            prediction = model(ids, length)
            loss = loss_function(prediction, label)
            accuracy = metrics(prediction, label)
            epoch_losses.append(loss.item())
            epoch_accs.append(accuracy.item())

    return epoch_losses, epoch_accs

In [130]:
n_epochs = 5
best_valid_loss = float('inf')

train_losses = []
train_accs = []
valid_losses = []
valid_accs = []

for epoch in range(n_epochs):

    train_loss, train_acc = train(train_dataloader, model, loss_function, optimizer, device)
    valid_loss, valid_acc = evaluate(valid_dataloader, model, loss_function, device)

    train_losses.extend(train_loss)
    train_accs.extend(train_acc)
    valid_losses.extend(valid_loss)
    valid_accs.extend(valid_acc)

    epoch_train_loss = np.mean(train_loss)
    epoch_train_acc = np.mean(train_acc)
    epoch_valid_loss = np.mean(valid_loss)
    epoch_valid_acc = np.mean(valid_acc)

    if epoch_valid_loss < best_valid_loss:
        best_valid_loss = epoch_valid_loss
        torch.save(model.state_dict(), 'lstm.pt')

    print(f'Epoch: {epoch+1}/{n_epochs}')
    print(f'loss: {epoch_train_loss:.4f}, accuracy: {epoch_train_acc:.4f}')
    print(f'valid_loss: {epoch_valid_loss:.4f}, valid_accuracy: {epoch_valid_acc:.4f}')
    print("--"*25)

training...: 100%|██████████| 313/313 [00:09<00:00, 31.50it/s]
evaluating...: 100%|██████████| 79/79 [00:00<00:00, 86.78it/s]
Epoch: 1/5
loss: 0.6610, accuracy: 0.6089
valid_loss: 0.6248, valid_accuracy: 0.6715
--------------------------------------------------
training...: 100%|██████████| 313/313 [00:10<00:00, 31.22it/s]
evaluating...: 100%|██████████| 79/79 [00:00<00:00, 87.75it/s]
Epoch: 2/5
loss: 0.6485, accuracy: 0.6321
valid_loss: 0.6228, valid_accuracy: 0.6622
--------------------------------------------------
training...: 100%|██████████| 313/313 [00:10<00:00, 29.27it/s]
evaluating...: 100%|██████████| 79/79 [00:01<00:00, 54.08it/s]
Epoch: 3/5
loss: 0.6367, accuracy: 0.6457
valid_loss: 0.6678, valid_accuracy: 0.6503
--------------------------------------------------
training...: 100%|██████████| 313/313 [00:11<00:00, 26.81it/s]
evaluating...: 100%|██████████| 79/79 [00:01<00:00, 60.53it/s]
Epoch: 4/5
loss: 0.6598, accuracy: 0.6246
valid_loss: 0.6359, valid_accuracy: 0.6493
---

## Evaluation of the model

In [131]:
model.load_state_dict(torch.load('lstm.pt')) #save the models

test_loss, test_acc = evaluate(test_dataloader, model, loss_function, device)

epoch_test_loss = np.max(test_loss)
epoch_test_acc = np.max(test_acc)

print("Loss",epoch_test_loss)
print("Acc",epoch_test_acc)

evaluating...: 100%|██████████| 391/391 [00:05<00:00, 76.77it/s]
Loss 0.811225414276123
Acc 0.921875


## Predictions on User Input

In [132]:
def make_prediction(text, model, tokenizer, vocab):
    #find the token for the user input
    tokens = tokenizer(text)
    #convert token into numerical number (unique id)
    ids = [vocab[t] for t in tokens]

    #find the length and convert the ids into tensor to feed in LSTM model
    length = torch.LongTensor([len(ids)])
    tensor = torch.LongTensor(ids).unsqueeze(dim=0).to(device)

    #make prediction
    prediction = model(tensor, length).squeeze(dim=0)
    probability = torch.softmax(prediction, dim=-1) #check for the score - probability (softmax)

    predicted_class = prediction.argmax(dim=-1)
    predicted_probability = probability[predicted_class]
    return predicted_class, predicted_probability

In [137]:
def display(label,score):
    if label==0:
        print(f"Negative-Score:{score}")
    else:
        print(f"Positive-Score:{score}")

In [138]:
text = "Amazing movie, loved it"
label,score = make_prediction(text, model, tokenizer, vocab)
display(label,score)

Positive-Score:0.547923743724823
