In [88]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence
from datasets import load_dataset
import re
from collections import Counter
import json


In [67]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)

Using device: cuda


In [68]:
# Fetch the IMDB reviews corpus through Hugging Face `datasets`
imdb = load_dataset("imdb")
full_train, test_data = imdb["train"], imdb["test"]  # official train / test splits

# Hold out 20% of the official training split for validation.
# The seed is fixed so the same partition is produced on every run.
split = full_train.train_test_split(test_size=0.2, seed=42)
train_data, val_data = split["train"], split["test"]  # 80% / 20% of full_train

print(f"Train size: {len(train_data)}, Val size: {len(val_data)}, Test size: {len(test_data)}")


Using the latest cached version of the dataset since imdb couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'plain_text' at C:\Users\ANAKIN\.cache\huggingface\datasets\imdb\plain_text\0.0.0\e6281661ce1c48d982bc483cf8a173c1bbeb5d31 (last modified on Sat Sep 27 03:34:18 2025).


Train size: 20000, Val size: 5000, Test size: 25000


In [69]:
# Tokenizer: lowercase the text, then keep runs of word characters (punctuation is dropped)
def simple_tokenizer(text):
    """Return the lowercase word tokens found in `text`.

    A token is a maximal run of regex word characters (letters, digits,
    underscore). Everything else -- whitespace, punctuation, markup symbols --
    only separates tokens and is never returned.
    """
    lowered = text.lower()
    return [match.group(0) for match in re.finditer(r"\b\w+\b", lowered)]


In [70]:
def build_vocab(dataset, max_size=20000, min_freq=2):
    """Build token <-> index mappings from the "text" field of `dataset`.

    Args:
        dataset: iterable of examples, each indexable by the key "text".
        max_size: upper bound on the vocabulary size, special tokens included.
        min_freq: minimum number of occurrences a token needs to be kept.

    Returns:
        (stoi, itos): `itos` is the list of tokens, starting with "<pad>"
        (index 0) and "<unk>" (index 1) and followed by the retained tokens
        from most to least frequent; `stoi` maps each token to its index.
    """
    specials = ["<pad>", "<unk>"]

    # Count every token across the whole dataset in a single pass.
    counts = Counter(
        token
        for example in dataset
        for token in simple_tokenizer(example["text"])
    )

    # The special tokens take up part of the size budget.
    budget = max_size - len(specials)
    frequent = [word for word, freq in counts.most_common(budget) if freq >= min_freq]

    itos = specials + frequent
    stoi = {word: index for index, word in enumerate(itos)}
    return stoi, itos

In [71]:
def encode(text, max_len=200):
    """Convert raw text into a 1-D LongTensor of vocabulary ids.

    The text is tokenized with `simple_tokenizer`, each token is looked up in
    the module-level `stoi` mapping (unknown tokens become the `<unk>` id), and
    the result is truncated to at most `max_len` ids.

    Args:
        text: the raw string to encode.
        max_len: maximum number of ids to keep from the start of the text.

    Returns:
        A `torch.long` tensor holding at least one id. When the text yields no
        ids at all (empty string, punctuation only), a single `<unk>` id is
        returned instead of an empty tensor: the models in this notebook pack
        their input with `pack_padded_sequence`, which rejects zero-length
        sequences.
    """
    # Look the index up instead of hard-coding it; fall back to 1, the slot
    # build_vocab assigns to <unk>.
    unk_id = stoi.get("<unk>", 1)
    ids = [stoi.get(tok, unk_id) for tok in simple_tokenizer(text)][:max_len]
    if not ids:
        # Never hand a zero-length sequence to the packed-sequence LSTM path.
        ids = [unk_id]
    return torch.tensor(ids, dtype=torch.long)

In [72]:
def collate_batch(batch):
    """Collate a list of examples into padded tensors for the DataLoader.

    Args:
        batch: list of examples, each providing "text" and "label".

    Returns:
        A tuple `(padded_ids, labels, lengths)`:
        - padded_ids: LongTensor (batch, longest_sequence), right-padded with 0 (<pad>)
        - labels: LongTensor (batch,)
        - lengths: LongTensor (batch,) with each sequence's length before padding
    """
    encoded = [encode(sample["text"]) for sample in batch]
    targets = [sample["label"] for sample in batch]
    sizes = [len(sequence) for sequence in encoded]

    padded = pad_sequence(encoded, batch_first=True, padding_value=0)  # 0 = <pad>
    return (
        padded,
        torch.tensor(targets, dtype=torch.long),
        torch.tensor(sizes, dtype=torch.long),
    )

In [74]:
class LSTMClassifier(nn.Module):
    """Single-layer unidirectional LSTM text classifier.

    Token ids are embedded, run through the LSTM as a packed sequence, and the
    final hidden state is passed through dropout and a linear layer that emits
    one logit per class.
    """

    def __init__(self, vocab_size, embed_dim, hidden_size, dropout, num_classes):
        super().__init__()
        # Index 0 is <pad>: its embedding row stays at zero and gets no gradient.
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size, embedding_dim=embed_dim, padding_idx=0
        )
        self.lstm = nn.LSTM(input_size=embed_dim, hidden_size=hidden_size, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_size, out_features=num_classes)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x, lengths):
        """Compute class logits for a padded batch.

        Args:
            x: LongTensor (batch, seq_len) of token ids, padded with 0.
            lengths: LongTensor (batch,) of true sequence lengths.

        Returns:
            Tensor (batch, num_classes) of unnormalized scores.
        """
        # Packing needs the lengths on the CPU; batches do not have to be sorted.
        cpu_lengths = lengths.cpu()
        packed = pack_padded_sequence(
            self.embedding(x), cpu_lengths, batch_first=True, enforce_sorted=False
        )
        _, (h_n, _) = self.lstm(packed)
        final_hidden = h_n[-1]  # hidden state after each sequence's last real token
        logits = self.fc(self.dropout(final_hidden))
        return logits


In [73]:
# Model
class BiLSTMClassifier(nn.Module):
    """Single-layer bidirectional LSTM text classifier.

    Token ids are embedded and read by the LSTM in both directions as a packed
    sequence. The final forward and backward hidden states are concatenated,
    passed through dropout, and projected to one logit per class.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes, dropout):
        super().__init__()
        # Index 0 is <pad>: its embedding row stays at zero and gets no gradient.
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size, embedding_dim=embed_dim, padding_idx=0
        )
        self.lstm = nn.LSTM(
            input_size=embed_dim,
            hidden_size=hidden_dim,
            batch_first=True,
            bidirectional=True,
        )
        # The classifier sees both directions side by side, hence 2 * hidden_dim inputs.
        self.fc = nn.Linear(in_features=2 * hidden_dim, out_features=num_classes)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x, lengths):
        """Compute class logits for a padded batch.

        Args:
            x: LongTensor (batch, seq_len) of token ids, padded with 0.
            lengths: LongTensor (batch,) of true sequence lengths.

        Returns:
            Tensor (batch, num_classes) of unnormalized scores.
        """
        # Packing needs the lengths on the CPU; batches do not have to be sorted.
        cpu_lengths = lengths.cpu()
        packed = pack_padded_sequence(
            self.embedding(x), cpu_lengths, batch_first=True, enforce_sorted=False
        )
        _, (h_n, _) = self.lstm(packed)
        # h_n is (num_layers * num_directions, batch, hidden_dim); the last two
        # entries are the final layer's forward and backward states, in that order.
        both_directions = torch.cat([h_n[-2], h_n[-1]], dim=1)
        logits = self.fc(self.dropout(both_directions))
        return logits

In [75]:
# Inference function
def predict_sentiment(model, text, max_len=200):
    """Classify a single piece of text with `model`.

    Args:
        model: a classifier called as `model(token_ids, lengths)` that returns
            logits of shape (1, num_classes).
        text: the raw review text.
        max_len: maximum number of tokens kept by `encode`.

    Returns:
        `(pred_class, probabilities)`: the index of the most probable class and
        the softmax probabilities as a plain Python list.

    Note:
        The model is switched to evaluation mode and left in that mode.
    """
    model.eval()

    # Encode (and truncate) the text, then shape it as a batch of one.
    token_ids = encode(text, max_len=max_len)
    seq_len = torch.tensor([len(token_ids)], dtype=torch.long).to(device)
    batch = token_ids.unsqueeze(0).to(device)  # shape (1, seq_len)

    # Forward pass without gradient tracking
    with torch.no_grad():
        logits = model(batch, seq_len)
        probs = torch.softmax(logits, dim=1)

    pred_class = probs.argmax(1).item()
    return pred_class, probs.squeeze().tolist()




In [76]:
# Hyperparameters

# Network sizes
embed_dim, hidden_dim = 200, 256
num_classes = 2
dropout = 0.5

# Optimisation
batch_size, num_epochs = 64, 10
lr, weight_decay = 0.005, 1e-5

# Early stopping: give up after `patience` epochs in a row without a lower validation loss
patience = 4
best_val_loss, patience_counter = float("inf"), 0


In [77]:
# The vocabulary is fitted on the training split only, so words that appear
# solely in validation/test reviews are encoded as <unk>.
stoi, itos = build_vocab(dataset=train_data)
vocab_size = len(itos)
print("Vocab size:", vocab_size)

Vocab size: 20000


In [89]:
# Persist both vocabulary mappings as JSON:
#   vocab.json -> stoi (string-to-index mapping)
#   itos.json  -> itos (index-to-string mapping, optional companion file)
for _path, _mapping in (("vocab.json", stoi), ("itos.json", itos)):
    with open(_path, "w") as f:
        json.dump(_mapping, f)


In [78]:
# All three loaders share the batch size and collate function; only the
# training loader shuffles its examples.
_loader_args = dict(batch_size=batch_size, collate_fn=collate_batch)

train_dataloader = DataLoader(train_data, shuffle=True, **_loader_args)
val_dataloader = DataLoader(val_data, shuffle=False, **_loader_args)
test_dataloader = DataLoader(test_data, shuffle=False, **_loader_args)


In [79]:
# LSTM Model, Loss, Optimizer

# Keyword arguments are used on purpose: LSTMClassifier and BiLSTMClassifier
# take `dropout` and `num_classes` in different positional order.
lstm_model = LSTMClassifier(
    vocab_size=vocab_size,
    embed_dim=embed_dim,
    hidden_size=hidden_dim,
    dropout=dropout,
    num_classes=num_classes,
).to(device)
criterion = nn.CrossEntropyLoss()
# Use the shared `weight_decay` hyperparameter rather than a repeated literal,
# so tuning it in the hyperparameter cell affects this model as well as the BiLSTM.
optimizer = optim.Adam(lstm_model.parameters(), lr=lr, weight_decay=weight_decay)

In [80]:
#LSTM Training

# Reset the early-stopping state inside this cell so that re-running it on its
# own never starts from a stale best loss / patience count left by another run.
best_val_loss = float("inf")
patience_counter = 0

for epoch in range(num_epochs):
    # Training
    lstm_model.train()
    total_loss, total_correct, train_total = 0, 0, 0
    for texts, labels, lengths in train_dataloader:
        texts, labels, lengths = texts.to(device), labels.to(device), lengths.to(device)
        outputs = lstm_model(texts, lengths)

        optimizer.zero_grad()
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # criterion returns the batch mean, so weight it by the batch size to
        # get a correct per-sample average even when the last batch is smaller.
        total_loss += loss.item() * labels.size(0)
        total_correct += (outputs.argmax(1) == labels).sum().item()
        train_total += labels.size(0)

    # Average over the samples actually seen, mirroring the validation pass.
    avg_train_loss = total_loss / train_total
    train_acc = total_correct / train_total

    # Validation
    lstm_model.eval()
    val_loss, val_correct, val_total = 0, 0, 0
    with torch.no_grad():
        for texts, labels, lengths in val_dataloader:
            texts, labels, lengths = texts.to(device), labels.to(device), lengths.to(device)
            outputs = lstm_model(texts, lengths)
            loss = criterion(outputs, labels)

            val_loss += loss.item() * labels.size(0)
            val_correct += (outputs.argmax(1) == labels).sum().item()
            val_total += labels.size(0)

    avg_val_loss = val_loss / val_total
    val_acc = val_correct / val_total

    print(f"Epoch {epoch+1}: "
          f"Train Loss {avg_train_loss:.4f}, Train Acc {train_acc:.4f} | "
          f"Val Loss {avg_val_loss:.4f}, Val Acc {val_acc:.4f}")

    # Early stopping: checkpoint on every new best validation loss, stop after
    # `patience` consecutive epochs without improvement.
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        patience_counter = 0
        torch.save(lstm_model.state_dict(), "best_lstm.pt")
    else:
        patience_counter += 1
        print(f"No improvement. Patience counter = {patience_counter}/{patience}")
        if patience_counter >= patience:
            print("Early stopping triggered.")
            break

# Load best model (map_location keeps the load valid whatever device the
# checkpoint tensors were saved from)
lstm_model.load_state_dict(torch.load("best_lstm.pt", map_location=device))

Epoch 1: Train Loss 0.6687, Train Acc 0.5919 | Val Loss 0.6276, Val Acc 0.6364
Epoch 2: Train Loss 0.4766, Train Acc 0.7791 | Val Loss 0.3507, Val Acc 0.8510
Epoch 3: Train Loss 0.3221, Train Acc 0.8721 | Val Loss 0.3806, Val Acc 0.8534
No improvement. Patience counter = 1/4
Epoch 4: Train Loss 0.2322, Train Acc 0.9128 | Val Loss 0.3414, Val Acc 0.8614
Epoch 5: Train Loss 0.1745, Train Acc 0.9373 | Val Loss 0.3718, Val Acc 0.8556
No improvement. Patience counter = 1/4
Epoch 6: Train Loss 0.1358, Train Acc 0.9508 | Val Loss 0.3988, Val Acc 0.8532
No improvement. Patience counter = 2/4
Epoch 7: Train Loss 0.1192, Train Acc 0.9568 | Val Loss 0.4922, Val Acc 0.8552
No improvement. Patience counter = 3/4
Epoch 8: Train Loss 0.1061, Train Acc 0.9627 | Val Loss 0.4418, Val Acc 0.8572
No improvement. Patience counter = 4/4
Early stopping triggered.


<All keys matched successfully>

In [83]:
# BiLSTM Model, Loss, Optimizer

bi_lstm_model = BiLSTMClassifier(
    vocab_size=vocab_size,
    embed_dim=embed_dim,
    hidden_dim=hidden_dim,
    num_classes=num_classes,
    dropout=dropout,
)
bi_lstm_model = bi_lstm_model.to(device)

# A fresh loss and optimizer: the optimizer must track the BiLSTM's parameters,
# not those of the previously trained model.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=bi_lstm_model.parameters(), lr=lr, weight_decay=weight_decay)

In [84]:
# Start the next training run with fresh early-stopping state:
# no best validation loss recorded yet, and no epochs counted against patience.
best_val_loss, patience_counter = float("inf"), 0


In [85]:
#BiLSTM training

# Reset the early-stopping state inside this cell so that re-running it on its
# own never starts from a stale best loss / patience count left by another run.
best_val_loss = float("inf")
patience_counter = 0

for epoch in range(num_epochs):
    # Training
    bi_lstm_model.train()
    total_loss, total_correct, train_total = 0, 0, 0
    for texts, labels, lengths in train_dataloader:
        texts, labels, lengths = texts.to(device), labels.to(device), lengths.to(device)
        outputs = bi_lstm_model(texts, lengths)

        optimizer.zero_grad()
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # criterion returns the batch mean, so weight it by the batch size to
        # get a correct per-sample average even when the last batch is smaller.
        total_loss += loss.item() * labels.size(0)
        total_correct += (outputs.argmax(1) == labels).sum().item()
        train_total += labels.size(0)

    # Average over the samples actually seen, mirroring the validation pass.
    avg_train_loss = total_loss / train_total
    train_acc = total_correct / train_total

    # Validation
    bi_lstm_model.eval()
    val_loss, val_correct, val_total = 0, 0, 0
    with torch.no_grad():
        for texts, labels, lengths in val_dataloader:
            texts, labels, lengths = texts.to(device), labels.to(device), lengths.to(device)
            outputs = bi_lstm_model(texts, lengths)
            loss = criterion(outputs, labels)

            val_loss += loss.item() * labels.size(0)
            val_correct += (outputs.argmax(1) == labels).sum().item()
            val_total += labels.size(0)

    avg_val_loss = val_loss / val_total
    val_acc = val_correct / val_total

    print(f"Epoch {epoch+1}: "
          f"Train Loss {avg_train_loss:.4f}, Train Acc {train_acc:.4f} | "
          f"Val Loss {avg_val_loss:.4f}, Val Acc {val_acc:.4f}")

    # Early stopping: checkpoint on every new best validation loss, stop after
    # `patience` consecutive epochs without improvement.
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        patience_counter = 0
        torch.save(bi_lstm_model.state_dict(), "best_bilstm.pt")
    else:
        patience_counter += 1
        print(f"No improvement. Patience counter = {patience_counter}/{patience}")
        if patience_counter >= patience:
            print("Early stopping triggered.")
            break

# Load best model (map_location keeps the load valid whatever device the
# checkpoint tensors were saved from)
bi_lstm_model.load_state_dict(torch.load("best_bilstm.pt", map_location=device))


Epoch 1: Train Loss 0.6416, Train Acc 0.6248 | Val Loss 0.5035, Val Acc 0.7676
Epoch 2: Train Loss 0.3846, Train Acc 0.8307 | Val Loss 0.3472, Val Acc 0.8550
Epoch 3: Train Loss 0.2479, Train Acc 0.9025 | Val Loss 0.3379, Val Acc 0.8626
Epoch 4: Train Loss 0.1718, Train Acc 0.9358 | Val Loss 0.3614, Val Acc 0.8606
No improvement. Patience counter = 1/4
Epoch 5: Train Loss 0.1237, Train Acc 0.9543 | Val Loss 0.4513, Val Acc 0.8474
No improvement. Patience counter = 2/4
Epoch 6: Train Loss 0.0995, Train Acc 0.9607 | Val Loss 0.4661, Val Acc 0.8468
No improvement. Patience counter = 3/4
Epoch 7: Train Loss 0.0907, Train Acc 0.9670 | Val Loss 0.4426, Val Acc 0.8404
No improvement. Patience counter = 4/4
Early stopping triggered.


<All keys matched successfully>

In [86]:
# Evaluation of both models
# Both networks are scored on the same test batches in a single pass, with
# dropout disabled (eval mode) and gradient tracking switched off.
for _net in (lstm_model, bi_lstm_model):
    _net.eval()

lstm_test_loss = lstm_correct = lstm_total = 0
bi_lstm_test_loss = bi_lstm_correct = bi_lstm_total = 0

with torch.no_grad():
    for texts, labels, lengths in test_dataloader:
        texts = texts.to(device)
        labels = labels.to(device)
        lengths = lengths.to(device)
        batch_n = labels.size(0)

        # LSTM: forward pass, loss, running totals
        lstm_outputs = lstm_model(texts, lengths)
        lstm_loss = criterion(lstm_outputs, labels)
        lstm_test_loss += lstm_loss.item() * batch_n  # batch mean -> batch sum
        lstm_correct += (lstm_outputs.argmax(1) == labels).sum().item()
        lstm_total += batch_n

        # BiLSTM: forward pass, loss, running totals
        bi_lstm_outputs = bi_lstm_model(texts, lengths)
        bi_lstm_loss = criterion(bi_lstm_outputs, labels)
        bi_lstm_test_loss += bi_lstm_loss.item() * batch_n  # batch mean -> batch sum
        bi_lstm_correct += (bi_lstm_outputs.argmax(1) == labels).sum().item()
        bi_lstm_total += batch_n

print(f"LSTM    Test Loss: {lstm_test_loss/lstm_total:.4f}, Test Accuracy: {lstm_correct/lstm_total:.4f}")
print(f"BiLSTM  Test Loss: {bi_lstm_test_loss/bi_lstm_total:.4f}, Test Accuracy: {bi_lstm_correct/bi_lstm_total:.4f}")


LSTM    Test Loss: 0.3865, Test Accuracy: 0.8429
BiLSTM  Test Loss: 0.3849, Test Accuracy: 0.8400


In [87]:
# Model Testing for a sample movie review

review = "The movie was awfully boring, cannot recommend it."


def _sentiment_name(pred_class):
    """Map a predicted class index to its display name (class 1 is shown as Positive)."""
    if pred_class == 1:
        return "Positive"
    return "Negative"


# LSTM prediction
pred_class_lstm, probs_lstm = predict_sentiment(lstm_model, review)
print("LSTM : Prediction:", _sentiment_name(pred_class_lstm))
print("Probabilities:", probs_lstm)

# BiLSTM prediction
pred_class_bi, probs_bi = predict_sentiment(bi_lstm_model, review)
print("BiLSTM : Prediction:", _sentiment_name(pred_class_bi))
print("Probabilities:", probs_bi)


LSTM : Prediction: Negative
Probabilities: [0.9793155193328857, 0.02068454399704933]
BiLSTM : Prediction: Negative
Probabilities: [0.9854004979133606, 0.014599491842091084]
