# Training and Evaluating Sentiment Analysis on IMDB Dataset with PyTorch and Positional Encoding

Please find the dataset here: [IMDB Dataset](https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews)


50K movie reviews from the IMDB dataset are available for text analytics or natural language processing. Compared to earlier benchmark datasets, this dataset for binary sentiment classification has a significantly larger amount of data. For training, it offers a set of 25,000 highly polar movie reviews, and for testing, it offers another 25,000.

## Importing Required Packages

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.datasets import IMDB                       # Import IMDB dataset from torchtext
from torchtext.data.utils import get_tokenizer            # Import tokenizer utility from torchtext
from torchtext.vocab import build_vocab_from_iterator     # Import vocabulary building function from torchtext.vocab
from torchtext.vocab import Vocab                         # Import Vocab class from torchtext.vocab
from torch.utils.data import DataLoader                   # Import DataLoader from torch.utils.data
from torch.nn.utils.rnn import pad_sequence               # Import pad_sequence utility from torch.nn.utils.rnn


## Creating Vocabulary

* **Tokenization and Download**: To prepare the text data, it first downloads the IMDB dataset and sets up a simple English tokenizer. 
* **Build a vocabulary**: tokenized sequences from the IMDB dataset are assembled using the build_vocab_from_iterator function.

In [2]:
# Download and tokenize IMDB dataset
tokenizer = get_tokenizer('basic_english')

def yield_tokens(data_iter):
    for _, text in data_iter:
        yield tokenizer(text)

# Create vocabulary
train_iter = IMDB(split='train')
vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=["<unk>", "<pad>"])
vocab.set_default_index(vocab["<unk>"])

In [4]:
# Iterate through the dataset to get its length
dataset_length = sum(1 for _ in train_iter)
print("Length of the dataset:", dataset_length)

Length of the dataset: 25000


## Text Data Processing

The below functions defined help to prepare the data for training by converting text and label inputs into numerical representations, ensuring consistent sequence lengths within batches, and structuring the data in a way that is compatible with inputting into a neural network.

In [5]:
# Text data processing function
def text_pipeline(text):
    return [vocab[token] for token in tokenizer(text)]

# Label data processing function
def label_pipeline(label):
    return 1 if label == 'pos' else 0

# Truncate sequences that exceed MAX_SEQ_LEN
MAX_SEQ_LEN = 2048

def truncate_sequence(seq):
    return seq[:MAX_SEQ_LEN]

# Collate function to process batch data
def collate_batch(batch):
    label_list, text_list = [], []
    for _label, _text in batch:
        label_list.append(torch.tensor(label_pipeline(_label), dtype=torch.long))
        processed_text = torch.tensor(truncate_sequence(text_pipeline(_text)), dtype=torch.long)
        text_list.append(processed_text)
    text_list = pad_sequence(text_list, padding_value=vocab["<pad>"])
    return torch.transpose(text_list, 0, 1), torch.stack(label_list)

## Positional Encoding

This code enables the model to capture sequential information in a parameterized way by dynamically appending learnable positional encodings to input sequences.

In [6]:
# Learnable positional encoding module
class PositionalEncoding(nn.Module):
    def __init__(self, max_seq_len, embedding_dim):
        super(PositionalEncoding, self).__init__()
        self.embedding_dim = embedding_dim
        self.positional_encoding = nn.Parameter(torch.randn(max_seq_len, embedding_dim))
        
    def forward(self, x):
        seq_len = x.size(1)
        # Limit the positional encoding to the current sequence length
        positional_encoding = self.positional_encoding[:seq_len, :].unsqueeze(0).repeat(x.size(0), 1, 1)
        # Add positional encoding to the input
        x = x + positional_encoding
        return x

## Model Architecture

In [7]:
# Model
class Model(nn.Module):
    def __init__(self, vocab_size, input_dim, hidden_dim, max_seq_len):
        super(Model, self).__init__()
        self.embedding = nn.Embedding(vocab_size, input_dim, padding_idx=vocab["<pad>"])
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.rnn = nn.LSTM(input_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, 2)  # Binary classification (pos/neg)
        self.positional_encoding = PositionalEncoding(max_seq_len=max_seq_len, embedding_dim=input_dim)
    
    def forward(self, x):
        # Embedding
        x = self.embedding(x)
        # Add positional encoding
        x = self.positional_encoding(x)
        # RNN forward pass
        out, _ = self.rnn(x)
        # Linear layer
        out = self.linear(out[:, -1, :])
        return out

## Training

In [8]:
# Training function
def train(model, criterion, optimizer, dataloader, device, epochs=10):
    model.train()
    model.to(device)
    for epoch in range(epochs):
        epoch_loss = 0
        num_batches = 0  # Count the number of batches
        for data, target in dataloader:
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            # Forward pass through the model
            output = model(data)
            # Compute the loss
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
            num_batches += 1
        avg_loss = epoch_loss / num_batches
        print(f"Epoch {epoch + 1}, Loss: {avg_loss}")

## Evaluation

In [9]:
# Evaluation of the model
def evaluate(model, criterion, dataloader, device):
    model.eval()
    model.to(device)
    total_loss = 0
    correct = 0
    num_batches = 0
    with torch.no_grad():
        for data, target in dataloader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            loss = criterion(output, target)
            total_loss += loss.item()
            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().item()
            num_batches += 1
    avg_loss = total_loss / num_batches
    accuracy = correct / len(dataloader.dataset)
    return avg_loss, accuracy

For training and testing purposes, this class offers an interface to access samples from the IMDB dataset.

In [13]:
from torch.utils.data import Dataset

class IMDBDataset(Dataset):
    def __init__(self, split='train'):
        self.dataset = list(IMDB(split=split))  

    def __getitem__(self, index):
        return self.dataset[index]

    def __len__(self):
        return len(self.dataset)


train_dataset = IMDBDataset(split='train')
test_dataset = IMDBDataset(split='test')

In [11]:
# Hyperparameters
BATCH_SIZE = 32
INPUT_DIM = 128  # Dimension of input features/embeddings
HIDDEN_DIM = 64
LEARNING_RATE = 0.001
EPOCHS = 5

In [14]:
# Set device (GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Create dataloaders for the IMDB dataset
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch)

In [15]:
# Create model, loss function, and optimizer
model = Model(vocab_size=len(vocab), input_dim=INPUT_DIM, hidden_dim=HIDDEN_DIM, max_seq_len=MAX_SEQ_LEN)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [16]:
# Train the model
train(model, criterion, optimizer, train_dataloader, device, epochs=EPOCHS)

Epoch 1, Loss: 0.051114533210790014
Epoch 2, Loss: 0.00011741973729470692
Epoch 3, Loss: 5.1647593840629775e-05
Epoch 4, Loss: 2.799364862890388e-05
Epoch 5, Loss: 1.6617554856821998e-05


In [17]:
# evaluate the model
test_loss, test_accuracy = evaluate(model, criterion, test_dataloader, device)
print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}")

Test Loss: 0.0000, Test Accuracy: 1.0000
