# Sentiment Classification using RNN in PyTorch
This notebook demonstrates how to build a simple Recurrent Neural Network (RNN) for sentiment Classification using PyTorch. We will preprocess text data using RegEx and nltk, train an RNN model, and evaluate its accuracy for sentiment classification.

## Step 1: Install Dependencies

In [22]:
!pip install datasets tqdm --quiet

## Step 2: Import Libraries

In [23]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import pandas as pd
import os
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from collections import Counter

In [24]:
# Ensure nltk requirements are downloaded
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /usr/share/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [25]:
# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Step 3: Data Preparation

In [26]:
# Load IMDb Dataset from Hugging Face
from datasets import load_dataset
dataset = load_dataset("imdb")

# Text Preprocessing Function
STOPWORDS = set(stopwords.words('english'))

def preprocess_text(text):
    text = text.lower()  # Lowercasing
    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters
    tokens = word_tokenize(text)  # Tokenization
    tokens = [word for word in tokens if word not in STOPWORDS]  # Remove stopwords
    return tokens

# Apply preprocessing
dataset['train'] = dataset['train'].map(lambda x: {"text": preprocess_text(x['text'])})
dataset['test'] = dataset['test'].map(lambda x: {"text": preprocess_text(x['text'])})

In [27]:
# Build vocabulary
word_to_index = {}
index = 1  # Start indexing from 1 (0 for padding)

for data in dataset['train']['text']:
    for word in data:
        if word not in word_to_index:
            word_to_index[word] = index
            index += 1

# Convert text to numerical sequences
def encode_text(text):#[x1,x2,x3] [10, 8, 100]
    return [word_to_index.get(word, 0) for word in text]

dataset['train'] = dataset['train'].map(lambda x: {'text': encode_text(x['text']), 'label': x['label']})
dataset['test'] = dataset['test'].map(lambda x: {'text': encode_text(x['text']), 'label': x['label']})

# Pad sequences to fixed length
MAX_LEN = 100

def pad_sequence(seq, max_len):
    return seq[:max_len] + [0] * (max_len - len(seq))

dataset['train'] = dataset['train'].map(lambda x: {'text': pad_sequence(x['text'], MAX_LEN), 'label': x['label']})
dataset['test'] = dataset['test'].map(lambda x: {'text': pad_sequence(x['text'], MAX_LEN), 'label': x['label']})


In [28]:
from torch.utils.data import DataLoader, TensorDataset

# Convert to PyTorch tensors
X_train = torch.tensor(dataset['train']['text'], dtype=torch.long)
y_train = torch.tensor(dataset['train']['label'], dtype=torch.long)
X_test = torch.tensor(dataset['test']['text'], dtype=torch.long)
y_test = torch.tensor(dataset['test']['label'], dtype=torch.long)

# Create DataLoaders
BATCH_SIZE = 32

train_data = TensorDataset(X_train, y_train)
test_data = TensorDataset(X_test, y_test)

train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False)

## Step 4: Define the Sequence Models

## RNN

In [29]:
# Define RNN Model
class RNNClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.rnn = nn.RNN(input_size=embedding_dim, hidden_size=hidden_dim, batch_first=True, num_layers=1, bidirectional=False, device=device)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, text):
        embedded = self.embedding(text)
        output, hidden = self.rnn(embedded)
        return self.fc(hidden.squeeze(0))


## LSTM

In [30]:
class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, batch_first=True, num_layers=1, bidirectional=False, device=device)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, text):
        embedded = self.embedding(text)
        output, (hidden, cell) = self.lstm(embedded)    
        return self.fc(hidden.squeeze(0))

## Bidirectional LSTM

In [31]:
class LSTMBIClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, batch_first=True, num_layers=1, bidirectional=True, device=device)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)  

    def forward(self, text):
        embedded = self.embedding(text)
        output, (hidden, cell) = self.lstm(embedded)
        hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1)
        return self.fc(hidden)

## Multilayer LSTM

In [None]:
class LSTMMULClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, num_layers=2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, batch_first=True, 
                           num_layers=num_layers, bidirectional=True, device=device)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)

    def forward(self, text):
        embedded = self.embedding(text)
        output, (hidden, cell) = self.lstm(embedded)
        hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1)
        return self.fc(hidden)

In [33]:
# Model hyperparameters
VOCAB_SIZE = len(word_to_index) + 1
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 2

# Initialize model, loss, and optimizer
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = LSTMMULClassifier(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()


In [34]:
# Training loop
EPOCHS = 10

for epoch in range(EPOCHS):
    model.train()
    epoch_loss = 0

    # progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS}", leave=False)
    for batch in train_loader:
        texts, labels = batch
        texts, labels = texts.to(device), labels.to(device)

        optimizer.zero_grad()
        predictions = model(texts)
        loss = criterion(predictions, labels)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    if epoch%1 == 0:
      print(f"Epoch {epoch}, Loss: {epoch_loss / len(train_loader):.4f}")

Epoch 0, Loss: 0.6582
Epoch 1, Loss: 0.4172
Epoch 2, Loss: 0.2550
Epoch 3, Loss: 0.1500
Epoch 4, Loss: 0.0729
Epoch 5, Loss: 0.0424
Epoch 6, Loss: 0.0223
Epoch 7, Loss: 0.0180
Epoch 8, Loss: 0.0150
Epoch 9, Loss: 0.0117


In [35]:
def evaluate(model, data_loader):
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in data_loader:
            texts, labels = batch
            texts, labels = texts.to(device), labels.to(device)

            predictions = model(texts)
            predicted = predictions.argmax(1)
            correct += (predicted == labels).sum().item()
            total += labels.size(0)

    return correct / total

test_accuracy = evaluate(model, test_loader)
print(f'Test Accuracy: {test_accuracy:.2f}')

Test Accuracy: 0.79


In [36]:
# Function to predict sentiment on new sentences
def predict_sentiment(model, sentence, max_len):
    model.eval()
    sequence = encode_text(preprocess_text(sentence))
    padded_sequence = pad_sequence(sequence, max_len)
    tensor = torch.LongTensor(padded_sequence).unsqueeze(0).to(device)
    # prediction = torch.sigmoid(model(tensor).squeeze(1))
    prediction = model(tensor)
    return prediction.argmax(1).item()

# Example usage
sentence = "Great film!"
prediction = predict_sentiment(model, sentence, MAX_LEN)
print(f'Sentence: {sentence} \nSentiment: {"+" if prediction==1 else "-"}')  # 0 for negative, 1 for positive

sentence = "Bad film!"
prediction = predict_sentiment(model, sentence, MAX_LEN)
print(f'Sentence: {sentence} \nSentiment: {"+" if prediction==1 else "-"}')  # 0 for negative, 1 for positive

Sentence: Great film! 
Sentiment: +
Sentence: Bad film! 
Sentiment: -


## Report

| **Model**       | **Training Loss Epoch 1** | **Training Loss Epoch 5** | **Training Loss Epoch 10** | **Test Accuracy (%)** |
|-----------------|--------------------------|--------------------------|---------------------------|-----------------------|
| RNN             | 0.70                    | 0.67                    | 0.48                     | 0.56                  |
| LSTM            | 0.69                    | 0.17                    | 0.013                     | 0.80                 |
| Bidirectional LSTM| 0.65                    | 0.11                    | 0.011                     | 0.80                  |
| Multilayer LSTM | 0.65                    | 0.07                    | 0.010                     | 0.79                  |