In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("C:\SNLP\IMDB_Dataset.csv")

In [4]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [8]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
from tqdm import tqdm

# Download NLTK data
nltk.download('punkt')
nltk.download('stopwords')

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')


Using device: cpu


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\akash\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\akash\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
# Load Dataset
df = pd.read_csv("C:/SNLP/IMDB_Dataset.csv")
print(df.head())
# Data Cleaning
def clean_text(text):
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Remove non-alphabetic characters and convert to lowercase
    text = re.sub(r'[^a-zA-Z]', ' ', text).lower()
    # Tokenize
    tokens = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    # Join tokens back to string
    text = ' '.join(tokens)
    return text

tqdm.pandas()
df['clean_review'] = df['review'].progress_apply(clean_text)
print(df[['review', 'clean_review']].head())


                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


100%|████████████████████████████████████████████████████████████████████████████████████████| 50000/50000 [01:14<00:00, 667.76it/s]

                                              review  \
0  One of the other reviewers has mentioned that ...   
1  A wonderful little production. <br /><br />The...   
2  I thought this was a wonderful way to spend ti...   
3  Basically there's a family where a little boy ...   
4  Petter Mattei's "Love in the Time of Money" is...   

                                        clean_review  
0  one reviewers mentioned watching oz episode ho...  
1  wonderful little production filming technique ...  
2  thought wonderful way spend time hot summer we...  
3  basically family little boy jake thinks zombie...  
4  petter mattei love time money visually stunnin...  





In [11]:
# Tokenization and Padding
from collections import Counter

# Build vocabulary
all_text = ' '.join(df['clean_review'])
words = all_text.split()
count_words = Counter(words)
total_words = len(words)
vocab = sorted(count_words, key=count_words.get, reverse=True)
vocab_to_int = {word: idx+1 for idx, word in enumerate(vocab)}  # idx+1 because 0 is used for padding

def encode_text(text):
    tokens = text.split()
    encoded = [vocab_to_int.get(word, 0) for word in tokens]  # Use 0 if word not found
    return encoded

df['encoded_review'] = df['clean_review'].apply(encode_text)

# Determine max sequence length
max_len = 200

def pad_sequence(seq, max_len):
    if len(seq) >= max_len:
        return seq[:max_len]
    else:
        return [0]*(max_len - len(seq)) + seq

df['padded_review'] = df['encoded_review'].apply(lambda x: pad_sequence(x, max_len))


In [12]:
# Encode Labels
label_to_int = {'positive': 1, 'negative': 0}
df['label'] = df['sentiment'].map(label_to_int)

# Split Data
X = np.array(df['padded_review'].tolist())
y = np.array(df['label'].tolist())

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)


In [13]:
# Create Custom Dataset
class IMDBDataset(Dataset):
    def __init__(self, reviews, labels):
        self.reviews = torch.tensor(reviews, dtype=torch.long)
        self.labels = torch.tensor(labels, dtype=torch.float)
    
    def __len__(self):
        return len(self.reviews)
    
    def __getitem__(self, idx):
        return self.reviews[idx], self.labels[idx]

# Create DataLoaders
batch_size = 64

train_dataset = IMDBDataset(X_train, y_train)
val_dataset = IMDBDataset(X_val, y_val)
test_dataset = IMDBDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)


In [14]:
# Download GloVe Embeddings
import os
import zipfile
import requests

def download_glove_embeddings():
    glove_zip = 'glove.6B.zip'
    if not os.path.exists(glove_zip):
        print("Downloading GloVe embeddings...")
        url = 'http://nlp.stanford.edu/data/glove.6B.zip'
        r = requests.get(url, stream=True)
        with open(glove_zip, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024):
                if chunk:
                    f.write(chunk)
        print("Download completed.")
    else:
        print("GloVe embeddings already downloaded.")
    
    # Extract embeddings
    if not os.path.exists('glove.6B.100d.txt'):
        print("Extracting GloVe embeddings...")
        with zipfile.ZipFile(glove_zip, 'r') as zip_ref:
            zip_ref.extractall()
        print("Extraction completed.")
    else:
        print("GloVe embeddings already extracted.")

download_glove_embeddings()


Downloading GloVe embeddings...
Download completed.
Extracting GloVe embeddings...
Extraction completed.


In [15]:
# Create Embedding Matrix
embedding_dim = 100
embedding_index = {}

print("Loading GloVe embeddings...")
with open('glove.6B.100d.txt', 'r', encoding='utf-8') as f:
    for line in tqdm(f, total=400000):
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = vector
print("GloVe embeddings loaded.")

# Prepare embedding matrix
vocab_size = len(vocab_to_int) + 1  # +1 for padding token
embedding_matrix = np.zeros((vocab_size, embedding_dim))

for word, idx in vocab_to_int.items():
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[idx] = embedding_vector
    else:
        # Initialize with random vector if not found
        embedding_matrix[idx] = np.random.normal(scale=0.6, size=(embedding_dim, ))


Loading GloVe embeddings...


100%|████████████████████████████████████████████████████████████████████████████████████| 400000/400000 [00:33<00:00, 11770.50it/s]


GloVe embeddings loaded.


In [17]:
def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=5):
    model.to(device)
    best_val_loss = float('inf')
    
    for epoch in range(num_epochs):
        model.train()
        train_losses = []
        
        for inputs, labels in tqdm(train_loader):
            inputs, labels = inputs.to(device), labels.to(device)
            
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs.squeeze(), labels)
            loss.backward()
            optimizer.step()
            
            train_losses.append(loss.item())
        
        val_losses = []
        model.eval()
        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                loss = criterion(outputs.squeeze(), labels)
                val_losses.append(loss.item())
        
        print(f'Epoch {epoch+1}/{num_epochs}, Training Loss: {np.mean(train_losses):.4f}, Validation Loss: {np.mean(val_losses):.4f}')
        
    print("Training complete.")


In [18]:
class RNNModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, n_layers, embedding_matrix, drop_prob=0.5):
        super(RNNModel, self).__init__()
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        
        # Embedding layer with pre-trained weights
        self.embedding = nn.Embedding.from_pretrained(torch.FloatTensor(embedding_matrix), freeze=False)
        
        self.rnn = nn.RNN(embedding_dim, hidden_dim, n_layers, batch_first=True, nonlinearity='tanh')
        self.dropout = nn.Dropout(drop_prob)
        self.fc = nn.Linear(hidden_dim, 1)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x):
        embeds = self.embedding(x)
        rnn_out, hidden = self.rnn(embeds)
        out = rnn_out[:, -1, :]
        out = self.dropout(out)
        out = self.fc(out)
        return self.sigmoid(out)


In [19]:
# Hyperparameters
hidden_dim = 256
n_layers = 2
num_epochs = 5
learning_rate = 0.001

model_rnn_glove = RNNModel(vocab_size, embedding_dim, hidden_dim, n_layers, embedding_matrix)
criterion = nn.BCELoss()
optimizer = optim.Adam(model_rnn_glove.parameters(), lr=learning_rate)

train_model(model_rnn_glove, train_loader, val_loader, criterion, optimizer, num_epochs)


100%|█████████████████████████████████████████████████████████████████████████████████████████████| 625/625 [14:53<00:00,  1.43s/it]


Epoch 1/5, Training Loss: 0.6673, Validation Loss: 0.6060


100%|█████████████████████████████████████████████████████████████████████████████████████████████| 625/625 [12:29<00:00,  1.20s/it]


Epoch 2/5, Training Loss: 0.6747, Validation Loss: 0.7763


100%|█████████████████████████████████████████████████████████████████████████████████████████████| 625/625 [10:49<00:00,  1.04s/it]


Epoch 3/5, Training Loss: 0.6450, Validation Loss: 0.6213


100%|█████████████████████████████████████████████████████████████████████████████████████████████| 625/625 [11:12<00:00,  1.08s/it]


Epoch 4/5, Training Loss: 0.6147, Validation Loss: 0.5827


100%|█████████████████████████████████████████████████████████████████████████████████████████████| 625/625 [13:18<00:00,  1.28s/it]


Epoch 5/5, Training Loss: 0.5562, Validation Loss: 0.6938
Training complete.


In [20]:
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, n_layers, embedding_matrix, drop_prob=0.5):
        super(LSTMModel, self).__init__()
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        
        # Embedding layer with pre-trained weights
        self.embedding = nn.Embedding.from_pretrained(torch.FloatTensor(embedding_matrix), freeze=False)
        
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, batch_first=True)
        self.dropout = nn.Dropout(drop_prob)
        self.fc = nn.Linear(hidden_dim, 1)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x):
        embeds = self.embedding(x)
        lstm_out, (hidden, cell) = self.lstm(embeds)
        out = lstm_out[:, -1, :]
        out = self.dropout(out)
        out = self.fc(out)
        return self.sigmoid(out)


In [21]:
model_lstm_glove = LSTMModel(vocab_size, embedding_dim, hidden_dim, n_layers, embedding_matrix)
criterion = nn.BCELoss()
optimizer = optim.Adam(model_lstm_glove.parameters(), lr=learning_rate)

train_model(model_lstm_glove, train_loader, val_loader, criterion, optimizer, num_epochs)


100%|█████████████████████████████████████████████████████████████████████████████████████████████| 625/625 [13:29<00:00,  1.30s/it]


Epoch 1/5, Training Loss: 0.4545, Validation Loss: 0.3230


100%|█████████████████████████████████████████████████████████████████████████████████████████████| 625/625 [10:30<00:00,  1.01s/it]


Epoch 2/5, Training Loss: 0.2577, Validation Loss: 0.3020


100%|█████████████████████████████████████████████████████████████████████████████████████████████| 625/625 [11:39<00:00,  1.12s/it]


Epoch 3/5, Training Loss: 0.1410, Validation Loss: 0.3293


100%|█████████████████████████████████████████████████████████████████████████████████████████████| 625/625 [12:34<00:00,  1.21s/it]


Epoch 4/5, Training Loss: 0.0665, Validation Loss: 0.4186


100%|█████████████████████████████████████████████████████████████████████████████████████████████| 625/625 [12:46<00:00,  1.23s/it]


Epoch 5/5, Training Loss: 0.0290, Validation Loss: 0.6069
Training complete.


In [22]:
class RNNModelOnTheFly(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, n_layers, drop_prob=0.5):
        super(RNNModelOnTheFly, self).__init__()
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        
        # Trainable embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        self.rnn = nn.RNN(embedding_dim, hidden_dim, n_layers, batch_first=True, nonlinearity='tanh')
        self.dropout = nn.Dropout(drop_prob)
        self.fc = nn.Linear(hidden_dim, 1)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x):
        embeds = self.embedding(x)
        rnn_out, hidden = self.rnn(embeds)
        out = rnn_out[:, -1, :]
        out = self.dropout(out)
        out = self.fc(out)
        return self.sigmoid(out)


In [23]:
model_rnn_onthefly = RNNModelOnTheFly(vocab_size, embedding_dim, hidden_dim, n_layers)
criterion = nn.BCELoss()
optimizer = optim.Adam(model_rnn_onthefly.parameters(), lr=learning_rate)

train_model(model_rnn_onthefly, train_loader, val_loader, criterion, optimizer, num_epochs)


100%|█████████████████████████████████████████████████████████████████████████████████████████████| 625/625 [19:22<00:00,  1.86s/it]


Epoch 1/5, Training Loss: 0.6943, Validation Loss: 0.6906


100%|█████████████████████████████████████████████████████████████████████████████████████████████| 625/625 [16:58<00:00,  1.63s/it]


Epoch 2/5, Training Loss: 0.6777, Validation Loss: 0.6776


100%|█████████████████████████████████████████████████████████████████████████████████████████████| 625/625 [16:52<00:00,  1.62s/it]


Epoch 3/5, Training Loss: 0.6410, Validation Loss: 0.6035


100%|█████████████████████████████████████████████████████████████████████████████████████████████| 625/625 [13:27<00:00,  1.29s/it]


Epoch 4/5, Training Loss: 0.6998, Validation Loss: 0.6938


100%|█████████████████████████████████████████████████████████████████████████████████████████████| 625/625 [12:26<00:00,  1.19s/it]


Epoch 5/5, Training Loss: 0.6942, Validation Loss: 0.6687
Training complete.


In [24]:
class LSTMModelOnTheFly(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, n_layers, drop_prob=0.5):
        super(LSTMModelOnTheFly, self).__init__()
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        
        # Trainable embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, batch_first=True)
        self.dropout = nn.Dropout(drop_prob)
        self.fc = nn.Linear(hidden_dim, 1)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x):
        embeds = self.embedding(x)
        lstm_out, (hidden, cell) = self.lstm(embeds)
        out = lstm_out[:, -1, :]
        out = self.dropout(out)
        out = self.fc(out)
        return self.sigmoid(out)


In [25]:
model_lstm_onthefly = LSTMModelOnTheFly(vocab_size, embedding_dim, hidden_dim, n_layers)
criterion = nn.BCELoss()
optimizer = optim.Adam(model_lstm_onthefly.parameters(), lr=learning_rate)

train_model(model_lstm_onthefly, train_loader, val_loader, criterion, optimizer, num_epochs)


100%|█████████████████████████████████████████████████████████████████████████████████████████████| 625/625 [13:40<00:00,  1.31s/it]


Epoch 1/5, Training Loss: 0.6658, Validation Loss: 0.6705


100%|█████████████████████████████████████████████████████████████████████████████████████████████| 625/625 [12:33<00:00,  1.21s/it]


Epoch 2/5, Training Loss: 0.4360, Validation Loss: 0.3324


100%|█████████████████████████████████████████████████████████████████████████████████████████████| 625/625 [11:52<00:00,  1.14s/it]


Epoch 3/5, Training Loss: 0.2572, Validation Loss: 0.2955


100%|█████████████████████████████████████████████████████████████████████████████████████████████| 625/625 [12:14<00:00,  1.18s/it]


Epoch 4/5, Training Loss: 0.1829, Validation Loss: 0.3082


100%|█████████████████████████████████████████████████████████████████████████████████████████████| 625/625 [11:46<00:00,  1.13s/it]


Epoch 5/5, Training Loss: 0.1247, Validation Loss: 0.3571
Training complete.


In [28]:
def evaluate_model(model, test_loader):
    model.eval()
    all_preds = []
    all_labels = []
    
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            preds = outputs.squeeze().round().cpu().numpy()
            labels = labels.cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(labels)
    
    accuracy = accuracy_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds)
    recall = recall_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds)
    
    print(f'Accuracy: {accuracy:.4f}')
    print(f'Precision: {precision:.4f}')
    print(f'Recall: {recall:.4f}')
    print(f'F1 Score: {f1:.4f}')
    
    return {'accuracy': accuracy, 'precision': precision, 'recall': recall, 'f1': f1}


In [29]:
# Evaluate the GloVe RNN model
print("Evaluating GloVe RNN Model")
evaluate_model(model_rnn_glove, test_loader)

# Evaluate the GloVe LSTM model
print("Evaluating GloVe LSTM Model")
evaluate_model(model_lstm_glove, test_loader)

# Evaluate the On-the-Fly RNN model
print("Evaluating On-the-Fly RNN Model")
evaluate_model(model_rnn_onthefly, test_loader)

# Evaluate the On-the-Fly LSTM model
print("Evaluating On-the-Fly LSTM Model")
evaluate_model(model_lstm_onthefly, test_loader)


Evaluating GloVe RNN Model
Accuracy: 0.4936
Precision: 0.5294
Recall: 0.0213
F1 Score: 0.0409
Evaluating GloVe LSTM Model
Accuracy: 0.8780
Precision: 0.8850
Recall: 0.8731
F1 Score: 0.8790
Evaluating On-the-Fly RNN Model
Accuracy: 0.6196
Precision: 0.6039
Recall: 0.7281
F1 Score: 0.6602
Evaluating On-the-Fly LSTM Model
Accuracy: 0.8782
Precision: 0.8737
Recall: 0.8885
F1 Score: 0.8810


{'accuracy': 0.8782,
 'precision': 0.8736923672994963,
 'recall': 0.88849487785658,
 'f1': 0.8810314514553623}