In [4]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.nn.utils.rnn import pad_sequence
from collections import Counter
import re
import os

# ========== Step 1: Load Data ==========
def load_data():
    with open("reviews.txt", "r", encoding="utf-8") as f:
        reviews = [line.strip() for line in f.readlines()]
    with open("labels.txt", "r", encoding="utf-8") as f:
        labels = [line.strip().lower() for line in f.readlines()]
    print(f"✅ Loaded {len(reviews)} reviews and {len(labels)} labels.")
    return reviews, labels

In [5]:
# ========== Step 2: Preprocessing ==========
def preprocess(text):
    text = text.lower()
    text = re.sub(r"[^a-z\s]", "", text)
    return text.split()

# ========== Step 3: Build Vocabulary ==========
def build_vocab(tokenized_reviews, min_freq=2):
    counter = Counter()
    for tokens in tokenized_reviews:
        counter.update(tokens)
    vocab = {"<PAD>": 0, "<UNK>": 1}
    for word, freq in counter.items():
        if freq >= min_freq:
            vocab[word] = len(vocab)
    print(f"✅ Vocabulary size: {len(vocab)} (min_freq={min_freq})")
    return vocab

def encode_review(tokens, vocab):
    return [vocab.get(token, vocab["<UNK>"]) for token in tokens]

In [6]:
# ========== Step 4: Dataset & DataLoader ==========
class ReviewDataset(Dataset):
    def __init__(self, reviews, labels, vocab):
        self.encoded_reviews = [torch.tensor(encode_review(tokens, vocab)) for tokens in reviews]
        self.labels = torch.tensor(labels, dtype=torch.float32)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.encoded_reviews[idx], self.labels[idx]

def collate_fn(batch):
    reviews, labels = zip(*batch)
    padded_reviews = pad_sequence(reviews, batch_first=True)
    labels = torch.tensor(labels)
    return padded_reviews, labels

In [7]:
# ========== Step 5: LSTM Model ==========
class SentimentLSTM(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim):
        super(SentimentLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        embeds = self.embedding(x)
        _, (hidden, _) = self.lstm(embeds)
        out = self.fc(hidden[-1])
        return self.sigmoid(out).squeeze()

In [38]:
# ========== Step 6: Training ==========
def train_model(model, dataloader, epochs=2000, save_every=500, save_path="checkpoints"):
    criterion = nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    model.train()
    device = next(model.parameters()).device

    os.makedirs(save_path, exist_ok=True)

    for epoch in range(epochs):
        total_loss = 0
        for batch_x, batch_y in dataloader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            output = model(batch_x)
            loss = criterion(output, batch_y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        avg_loss = total_loss / len(dataloader)


        # Save model every `save_every` epochs
        if (epoch + 1) % 20 == 0:
            print(f"Epoch {epoch+1}, Loss: {avg_loss:.4f}")
            checkpoint_file = os.path.join(save_path, f"model_epoch{epoch+1}.pt")
            torch.save(model.state_dict(), checkpoint_file)
            print(f"✅ Saved model checkpoint to: {checkpoint_file}")

In [39]:
# ========== Step 7: Predict ==========
def predict(model, review, vocab):
    model.eval()
    device = next(model.parameters()).device
    tokens = preprocess(review)
    print(f"📝 Tokenized: {tokens}")
    encoded = torch.tensor([encode_review(tokens, vocab)]).to(device)
    print(f"🔢 Encoded: {encoded}")
    with torch.no_grad():
        output = model(encoded)
        print(f"📈 Raw Score: {output.item():.4f}")
        prediction = "positive" if output.item() > 0.5 else "negative"
        print(f"🧠 Review: \"{review}\"\n🎯 Prediction: {prediction}\n")

In [40]:
# ========== Main ==========
if __name__ == "__main__":
    print("🚀 Loading and preparing data...")
    raw_reviews, raw_labels = load_data()
    raw_reviews = raw_reviews
    raw_labels = raw_labels

    tokenized_reviews = [preprocess(r) for r in raw_reviews]
    print(f"🧹 Sample tokens: {tokenized_reviews[0][:10]}")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    vocab = build_vocab(tokenized_reviews)
    le = LabelEncoder()
    numeric_labels = le.fit_transform(raw_labels)  # 0 = negative, 1 = positive
    print(f"🔠 Label Mapping: {dict(zip(le.classes_, le.transform(le.classes_)))}")

    X_train, X_test, y_train, y_test = train_test_split(
        tokenized_reviews, numeric_labels, test_size=0.2, random_state=42)

    print(f"📊 Split: {len(X_train)} train / {len(X_test)} test")

    train_dataset = ReviewDataset(X_train, y_train, vocab)
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)

    model = SentimentLSTM(vocab_size=len(vocab), embed_dim=100, hidden_dim=128)
    print("🧠 Starting training...")
    model = model.to(device)

    train_model(model, train_loader, epochs=50,save_every=20)

    print("\n🔍 Testing predictions on sample reviews...")
    predict(model, "This movie was absolutely wonderful!", vocab)
    predict(model, "Worst film I have ever seen.", vocab)

🚀 Loading and preparing data...
✅ Loaded 25000 reviews and 25000 labels.
🧹 Sample tokens: ['bromwell', 'high', 'is', 'a', 'cartoon', 'comedy', 'it', 'ran', 'at', 'the']
✅ Vocabulary size: 8986 (min_freq=2)
🔠 Label Mapping: {np.str_('negative'): np.int64(0), np.str_('positive'): np.int64(1)}
📊 Split: 800 train / 200 test
🧠 Starting training...
Epoch 20, Loss: 0.6752
✅ Saved model checkpoint to: checkpoints/model_epoch20.pt
Epoch 40, Loss: 0.6730
✅ Saved model checkpoint to: checkpoints/model_epoch40.pt

🔍 Testing predictions on sample reviews...
📝 Tokenized: ['this', 'movie', 'was', 'absolutely', 'wonderful']
🔢 Encoded: tensor([[ 319,  636,  413, 1865, 1621]], device='cuda:0')
📈 Raw Score: 0.6197
🧠 Review: "This movie was absolutely wonderful!"
🎯 Prediction: positive

📝 Tokenized: ['worst', 'film', 'i', 'have', 'ever', 'seen']
🔢 Encoded: tensor([[800, 320,  56, 428, 604, 155]], device='cuda:0')
📈 Raw Score: 0.0535
🧠 Review: "Worst film I have ever seen."
🎯 Prediction: negative

