In [1]:
# using the processed data that we worked on in task 1
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset, random_split
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, confusion_matrix
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences





In [3]:
# Load & Preprocess Dataset
# Load DataFrame
# NOTE(review): read_pickle unpickles arbitrary Python objects; only load a
# file that was produced by your own preprocessing step.
processed_data_file = 'processed_reviews.pkl' # change the file path if needed
df = pd.read_pickle(processed_data_file)

# Tokenization and Padding
max_words = 10000  # passed to Tokenizer(num_words=...); also reused as the model vocab size
max_len = 100      # passed to pad_sequences(maxlen=...)

review_texts = df['review']
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(review_texts)
sequences = tokenizer.texts_to_sequences(review_texts)
X = pad_sequences(sequences, maxlen=max_len)

# Binary target: 1 when the sentiment equals 'positive', 0 for anything else.
y = df['sentiment'].apply(lambda sentiment: int(sentiment == 'positive')).values


In [4]:
# Dataset loader
class SentimentDataset(Dataset):
    """Map-style dataset that pairs token-id sequences with their labels.

    Args:
        X: Array-like of integer token ids, one row per sample.
        y: Array-like of labels, one entry per sample.
    """

    def __init__(self, X, y):
        # Token ids are stored as int64 and labels as float32.
        token_ids = torch.tensor(X, dtype=torch.long)
        targets = torch.tensor(y, dtype=torch.float32)
        self.X = token_ids
        self.y = targets

    def __len__(self):
        """Return the number of samples (the length of the label tensor)."""
        return self.y.shape[0]

    def __getitem__(self, idx):
        """Return the ``(token_ids, label)`` pair stored at position ``idx``."""
        features = self.X[idx]
        target = self.y[idx]
        return features, target

# Hold out 20% of the samples for validation; the fixed random_state keeps
# the split identical between runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

train_ds, val_ds = SentimentDataset(X_train, y_train), SentimentDataset(X_val, y_val)

# Only the training loader shuffles; the validation loader keeps dataset order.
batch_size = 16
train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=batch_size)


In [6]:
# The model 
class BiLSTMClassifier(nn.Module):
    """Single-layer bidirectional LSTM emitting one sigmoid probability per sequence.

    Args:
        vocab_size: Number of rows in the embedding table; token ids must be
            smaller than this value.
        embed_dim: Size of each token embedding.
        hidden_dim: Hidden size of each LSTM direction.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim):
        super(BiLSTMClassifier, self).__init__()
        # Index 0 is declared as the padding id for the embedding table.
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True, bidirectional=True)
        # The two directions' final states are concatenated, hence hidden_dim * 2.
        self.fc = nn.Linear(hidden_dim * 2, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Score a batch of token-id sequences.

        Args:
            x: Integer tensor of token ids, shape ``(batch, seq_len)``
                (the LSTM is built with ``batch_first=True``).

        Returns:
            Tensor of shape ``(batch,)`` holding sigmoid outputs. The batch
            axis is preserved even when ``batch == 1``.
        """
        x = self.embedding(x)
        _, (h_n, _) = self.lstm(x)
        # With one bidirectional layer, h_n[0] is the forward direction's final
        # state and h_n[1] the backward direction's final state.
        h = torch.cat((h_n[0], h_n[1]), dim=1)
        out = self.fc(h)
        # Drop only the trailing feature axis. A bare squeeze() would also remove
        # the batch axis for a single-sample batch, producing a 0-d tensor that
        # no longer matches a (1,)-shaped target.
        return self.sigmoid(out).squeeze(-1)


In [8]:
# Training loop utilizing the gpu device 
# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"\n✅ Using device: {device}")
if device.type == "cuda":
    print(f"✅ GPU name: {torch.cuda.get_device_name(0)}\n")

# Seed torch before the model is built and before the shuffled loader is
# iterated, so weight initialisation and batch order repeat across
# Restart & Run All. Some GPU kernels may still add small run-to-run noise.
SEED = 42
torch.manual_seed(SEED)

NUM_EPOCHS = 10

# Model, Loss, Optimizer
model = BiLSTMClassifier(vocab_size=max_words, embed_dim=128, hidden_dim=64).to(device)
print(f"Model is on device: {next(model.parameters()).device}")

# The model already applies a sigmoid, so plain BCELoss is the matching criterion.
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Training Loop
for epoch in range(NUM_EPOCHS):
    model.train()
    total_loss = 0

    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Reported value is the mean of the per-batch losses for this epoch.
    print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_loader):.4f}")

    # Optional: Show GPU memory usage
    if device.type == "cuda":
        mem_alloc = torch.cuda.memory_allocated() / 1024**2
        print(f"GPU Memory Allocated: {mem_alloc:.2f} MB\n")



✅ Using device: cuda
✅ GPU name: NVIDIA GeForce RTX 4060 Laptop GPU

Model is on device: cuda:0
Epoch 1, Loss: 0.4620
GPU Memory Allocated: 43.05 MB

Epoch 2, Loss: 0.2937
GPU Memory Allocated: 43.05 MB

Epoch 3, Loss: 0.2195
GPU Memory Allocated: 43.05 MB

Epoch 4, Loss: 0.1622
GPU Memory Allocated: 43.05 MB

Epoch 5, Loss: 0.1114
GPU Memory Allocated: 43.05 MB

Epoch 6, Loss: 0.0700
GPU Memory Allocated: 43.05 MB

Epoch 7, Loss: 0.0477
GPU Memory Allocated: 43.05 MB

Epoch 8, Loss: 0.0299
GPU Memory Allocated: 43.05 MB

Epoch 9, Loss: 0.0282
GPU Memory Allocated: 43.05 MB

Epoch 10, Loss: 0.0244
GPU Memory Allocated: 43.05 MB



In [9]:
# Evaluation 
# Switch to inference mode and collect predictions over the validation loader.
model.eval()
y_preds, y_true = [], []

with torch.no_grad():
    for inputs, labels in val_loader:
        inputs = inputs.to(device)
        outputs = model(inputs)
        # Flatten before extending: a 0-d output (possible for a single-sample
        # batch) cannot be iterated by list.extend.
        y_preds.extend(outputs.reshape(-1).cpu().numpy())
        y_true.extend(labels.numpy())

# Threshold the sigmoid outputs at 0.5 to obtain hard class predictions.
y_pred_bin = (np.array(y_preds) > 0.5).astype(int)
# Labels are stored as float32 for the loss; give sklearn integer classes so
# ground truth and predictions share one label type.
y_true = np.array(y_true).astype(int)

f1 = f1_score(y_true, y_pred_bin)
precision = precision_score(y_true, y_pred_bin)
recall = recall_score(y_true, y_pred_bin)
accuracy = accuracy_score(y_true, y_pred_bin)
conf_matrix = confusion_matrix(y_true, y_pred_bin)

print(f"F1 Score: {f1:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"Accuracy: {accuracy:.4f}")
print("Confusion Matrix:")
print(conf_matrix)


F1 Score: 0.8666
Precision: 0.8759
Recall: 0.8575
Accuracy: 0.8670
Confusion Matrix:
[[4349  612]
 [ 718 4321]]


In [10]:
# Persist only the model's state_dict (its parameters and buffers), not the module object.
weights_path = "bilstm_sentiment_model_raw.pt"
torch.save(model.state_dict(), weights_path)
