# Yelp Review Prediction - LSTM Model (PyTorch)
**Author:** Ben

**Project:** Capstone - Star Rating Prediction

**Objective:** Train and evaluate a Bidirectional LSTM model using PyTorch with CUDA acceleration

---

## Model Architecture

**Bidirectional LSTM:** Sophisticated sequence model that can capture long-range dependencies and context from both directions

**Why PyTorch:**
- More flexible than Keras for custom architectures
- Better for research and experimentation
- Excellent CUDA support for GPU acceleration
- Industry standard for deep learning research

## 1. Setup and Imports

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from datetime import datetime
import os
import pickle
from collections import Counter
import time

# PyTorch imports
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

# Sklearn for metrics
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

warnings.filterwarnings('ignore')

# Set visualization defaults
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
plt.rcParams['figure.figsize'] = (12, 6)

# Create directories
os.makedirs('../Outputs/Models', exist_ok=True)
os.makedirs('../Outputs/Plots', exist_ok=True)

print("Libraries imported successfully")
print(f"PyTorch version: {torch.__version__}")

Libraries imported successfully
PyTorch version: 2.10.0+cu128


In [2]:
# Check CUDA availability and setup device
print("=" * 80)
print("CUDA SETUP")
print("=" * 80)

if torch.cuda.is_available():
    device = torch.device('cuda')
    print(f"\nCUDA is available!")
    print(f"  Device: {torch.cuda.get_device_name(0)}")
    print(f"  CUDA Version: {torch.version.cuda}")
    print(f"  Number of GPUs: {torch.cuda.device_count()}")
    print(f"  Current GPU Memory:")
    print(f"    Allocated: {torch.cuda.memory_allocated(0) / 1024**3:.2f} GB")
    print(f"    Cached: {torch.cuda.memory_reserved(0) / 1024**3:.2f} GB")
else:
    device = torch.device('cpu')
    print("\nCUDA is not available. Using CPU.")
    print("  Training will be significantly slower.")
    print("  Consider using Google Colab for free GPU access.")

print(f"\nUsing device: {device}")

# Set random seeds for reproducibility
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed(SEED)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

print(f"\nRandom seed set to {SEED} for reproducibility")

CUDA SETUP

CUDA is available!
  Device: NVIDIA GeForce RTX 5070 Ti
  CUDA Version: 12.8
  Number of GPUs: 1
  Current GPU Memory:
    Allocated: 0.00 GB
    Cached: 0.00 GB

Using device: cuda

Random seed set to 42 for reproducibility


## 2. Load and Prepare Data

In [3]:
# Load cleaned data
print("=" * 80)
print("LOADING DATA")
print("=" * 80)

# Use sample dataset for faster training during development
USE_SAMPLE = False  # Set to False for full dataset

if USE_SAMPLE:
    print("\nLoading SAMPLE dataset for rapid iteration...")
    data_df = pd.read_csv('../Data/Processed/yelp_sample.csv')
    train_df, test_df = train_test_split(data_df, test_size=0.2, 
                                         random_state=42, stratify=data_df['stars'])
    # Further split train into train and validation
    train_df, val_df = train_test_split(train_df, test_size=0.1, 
                                        random_state=42, stratify=train_df['stars'])
else:
    print("\nLoading FULL dataset...")
    train_df = pd.read_csv('../Data/Processed/yelp_train.csv')
    test_df = pd.read_csv('../Data/Processed/yelp_test.csv')
    # Split off validation set from training
    train_df, val_df = train_test_split(train_df, test_size=0.1, 
                                        random_state=42, stratify=train_df['stars'])

print(f"\nData loaded:")
print(f"  Train: {len(train_df):,} samples")
print(f"  Validation: {len(val_df):,} samples")
print(f"  Test: {len(test_df):,} samples")

# Target distribution
print("\nTarget distribution (train):")
print(train_df['stars'].value_counts(normalize=True).sort_index())


Data loaded:
  Train: 1,800,000 samples
  Validation: 200,000 samples
  Test: 500,000 samples

Target distribution (train):
stars
1    0.2
2    0.2
3    0.2
4    0.2
5    0.2
Name: proportion, dtype: float64


In [4]:
# Text preprocessing and tokenization
print("\n" + "=" * 80)
print("TEXT PREPROCESSING AND TOKENIZATION")
print("=" * 80)

def simple_tokenizer(text):
    """Simple word tokenizer - split on whitespace and lowercase."""
    return text.lower().split()

# Build vocabulary from training data
print("\nBuilding vocabulary...")
all_words = []
for text in train_df['text'].fillna(''):
    all_words.extend(simple_tokenizer(text))

word_counts = Counter(all_words)
print(f"  Total words: {len(all_words):,}")
print(f"  Unique words: {len(word_counts):,}")

# Keep only words that appear at least MIN_FREQ times
MIN_FREQ = 5
vocab = {word for word, count in word_counts.items() if count >= MIN_FREQ}
print(f"  Vocabulary size (min_freq={MIN_FREQ}): {len(vocab):,}")

# Create word to index mapping
# Reserve 0 for padding, 1 for unknown
word2idx = {'<PAD>': 0, '<UNK>': 1}
for idx, word in enumerate(sorted(vocab), start=2):
    word2idx[word] = idx

idx2word = {idx: word for word, idx in word2idx.items()}
vocab_size = len(word2idx)

print(f"\nVocabulary created:")
print(f"  Vocab size: {vocab_size:,}")
print(f"  PAD token: {word2idx['<PAD>']}")
print(f"  UNK token: {word2idx['<UNK>']}")
print(f"\nExample words and indices:")
for word in list(word2idx.keys())[2:12]:
    print(f"  '{word}': {word2idx[word]}")


Vocabulary created:
  Vocab size: 245,599
  PAD token: 0
  UNK token: 1

Example words and indices:
  '!': 2
  '!!': 3
  '!!!': 4
  '!!!!': 5
  '!!!!!': 6
  '!!!!!!': 7
  '!!!!!!!': 8
  '!!!!!!!!': 9
  '!!!!!!!!!': 10
  '!!!!!!!!!!': 11


In [5]:
# Convert text to sequences of indices
print("\n" + "=" * 80)
print("CONVERTING TEXT TO SEQUENCES")
print("=" * 80)

def text_to_sequence(text, word2idx, max_len=None):
    """
    Convert text to sequence of word indices.
    
    Args:
        text: Input text string
        word2idx: Word to index mapping
        max_len: Maximum sequence length (truncate if longer)
    
    Returns:
        List of word indices
    """
    tokens = simple_tokenizer(text)
    sequence = [word2idx.get(word, word2idx['<UNK>']) for word in tokens]
    
    if max_len:
        sequence = sequence[:max_len]
    
    return sequence

# Set maximum sequence length based on data
MAX_LEN = 200  # Truncate reviews longer than 200 words

print(f"\nConverting text to sequences (max_len={MAX_LEN})...")
train_sequences = [text_to_sequence(text, word2idx, MAX_LEN) 
                   for text in train_df['text'].fillna('')]
val_sequences = [text_to_sequence(text, word2idx, MAX_LEN) 
                 for text in val_df['text'].fillna('')]
test_sequences = [text_to_sequence(text, word2idx, MAX_LEN) 
                  for text in test_df['text'].fillna('')]

# Get sequence lengths before padding
train_lengths = [len(seq) for seq in train_sequences]
val_lengths = [len(seq) for seq in val_sequences]
test_lengths = [len(seq) for seq in test_sequences]

print(f"\nSequences created")
print(f"  Average length (before padding): {np.mean(train_lengths):.1f} words")
print(f"  Median length: {np.median(train_lengths):.0f} words")
print(f"  Max length (capped): {MAX_LEN} words")

# Show example
print(f"\nExample sequence (first 20 tokens):")
example_text = train_df.iloc[0]['text']
example_seq = train_sequences[0]
print(f"  Original: {example_text[:100]}...")
print(f"  Sequence: {example_seq[:20]}")
print(f"  Decoded: {' '.join([idx2word[idx] for idx in example_seq[:20]])}")


Sequences created
  Average length (before padding): 99.8 words
  Median length: 85 words
  Max length (capped): 200 words

Example sequence (first 20 tokens):
  Original: What use to be free is now $8, is it worth it? Kind of. You'll see cars here that you've never seen ...
  Sequence: [239244, 232089, 223233, 47102, 104653, 127520, 158670, 11510, 127520, 127832, 242235, 128131, 132758, 160115, 244244, 197195, 61284, 117761, 219972, 244253]
  Decoded: what use to be free is now $8, is it worth it? kind of. you'll see cars here that you've


In [6]:
# Pad sequences and convert to tensors
print("\n" + "=" * 80)
print("PADDING AND CREATING TENSORS")
print("=" * 80)

def pad_sequences(sequences, max_len, pad_value=0):
    """
    Pad sequences to the same length.
    
    Args:
        sequences: List of sequences
        max_len: Target length
        pad_value: Value to use for padding
    
    Returns:
        Padded numpy array
    """
    padded = np.zeros((len(sequences), max_len), dtype=np.int64)
    
    for i, seq in enumerate(sequences):
        length = min(len(seq), max_len)
        padded[i, :length] = seq[:length]
    
    return padded

# Pad sequences
X_train = pad_sequences(train_sequences, MAX_LEN)
X_val = pad_sequences(val_sequences, MAX_LEN)
X_test = pad_sequences(test_sequences, MAX_LEN)

# Convert labels to numpy (subtract 1 to make 0-indexed for PyTorch)
y_train = train_df['stars'].values - 1  # Now 0-4 instead of 1-5
y_val = val_df['stars'].values - 1
y_test = test_df['stars'].values - 1

print(f"\nData prepared:")
print(f"  X_train shape: {X_train.shape}")
print(f"  X_val shape: {X_val.shape}")
print(f"  X_test shape: {X_test.shape}")
print(f"  y_train shape: {y_train.shape}")
print(f"  Label range: {y_train.min()} to {y_train.max()} (0-indexed)")

# Convert to PyTorch tensors
X_train_tensor = torch.LongTensor(X_train)
y_train_tensor = torch.LongTensor(y_train)
X_val_tensor = torch.LongTensor(X_val)
y_val_tensor = torch.LongTensor(y_val)
X_test_tensor = torch.LongTensor(X_test)
y_test_tensor = torch.LongTensor(y_test)

print(f"\nTensors created")
print(f"  X_train_tensor: {X_train_tensor.shape}, dtype: {X_train_tensor.dtype}")
print(f"  y_train_tensor: {y_train_tensor.shape}, dtype: {y_train_tensor.dtype}")


Data prepared:
  X_train shape: (1800000, 200)
  X_val shape: (200000, 200)
  X_test shape: (500000, 200)
  y_train shape: (1800000,)
  Label range: 0 to 4 (0-indexed)

Tensors created
  X_train_tensor: torch.Size([1800000, 200]), dtype: torch.int64
  y_train_tensor: torch.Size([1800000]), dtype: torch.int64


In [7]:
# Create DataLoaders
print("\n" + "=" * 80)
print("CREATING DATALOADERS")
print("=" * 80)

BATCH_SIZE = 1024

# Create TensorDatasets
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

print(f"\nDataLoaders created with batch_size={BATCH_SIZE}")
print(f"  Train batches: {len(train_loader)}")
print(f"  Val batches: {len(val_loader)}")
print(f"  Test batches: {len(test_loader)}")

# Test DataLoader
test_batch = next(iter(train_loader))
print(f"\nTest batch:")
print(f"  Input shape: {test_batch[0].shape}")
print(f"  Label shape: {test_batch[1].shape}")
print(f"  Sample input (first 5 tokens): {test_batch[0][0][:5]}")
print(f"  Sample label: {test_batch[1][0]} (corresponds to {test_batch[1][0].item() + 1} stars)")


CREATING DATALOADERS

DataLoaders created with batch_size=1024
  Train batches: 1758
  Val batches: 196
  Test batches: 489

Test batch:
  Input shape: torch.Size([1024, 200])
  Label shape: torch.Size([1024])
  Sample input (first 5 tokens): tensor([ 95435, 122476, 233874, 152914,  93023])
  Sample label: 4 (corresponds to 5 stars)


## 3. Model Architecture

In [8]:
# Bidirectional LSTM Model

In [9]:
print("=" * 80)
print("BIDIRECTIONAL LSTM MODEL")
print("=" * 80)

class BiLSTM(nn.Module):
    """
    Bidirectional LSTM model for text classification.
    
    Architecture:
    - Embedding layer
    - Bidirectional LSTM (reads sequence forwards and backwards)
    - Dropout for regularization
    - Fully connected output layer
    
    Bidirectional helps capture context from both directions.
    """
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, 
                 n_layers=2, dropout=0.5):
        super(BiLSTM, self).__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, 
                           batch_first=True, dropout=dropout if n_layers > 1 else 0,
                           bidirectional=True)
        self.dropout = nn.Dropout(dropout)
        # *2 because bidirectional concatenates forward and backward hidden states
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        
    def forward(self, text):
        # text shape: [batch_size, seq_len]
        embedded = self.dropout(self.embedding(text))
        # embedded shape: [batch_size, seq_len, embedding_dim]
        
        output, (hidden, cell) = self.lstm(embedded)
        # output shape: [batch_size, seq_len, hidden_dim * 2]
        # hidden shape: [n_layers * 2, batch_size, hidden_dim]
        
        # Concatenate the final forward and backward hidden states
        hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1)
        # hidden shape: [batch_size, hidden_dim * 2]
        
        hidden = self.dropout(hidden)
        output = self.fc(hidden)
        # output shape: [batch_size, output_dim]
        
        return output

print("\nBiLSTM class defined")
print("\nArchitecture: Embedding → Bidirectional LSTM → Dropout → Linear")
print("Note: Bidirectional doubles the hidden dimension")

BIDIRECTIONAL LSTM MODEL

BiLSTM class defined

Architecture: Embedding → Bidirectional LSTM → Dropout → Linear
Note: Bidirectional doubles the hidden dimension


## 4. Training Functions

In [10]:
# Training and evaluation functions
print("=" * 80)
print("TRAINING UTILITIES")
print("=" * 80)

def train_epoch(model, iterator, optimizer, criterion, device):
    """
    Train the model for one epoch.
    
    Args:
        model: PyTorch model
        iterator: DataLoader
        optimizer: Optimizer
        criterion: Loss function
        device: torch.device
    
    Returns:
        Average loss and accuracy for the epoch
    """
    model.train()
    
    epoch_loss = 0
    epoch_acc = 0
    
    for batch in iterator:
        # Get batch data
        text, labels = batch
        text = text.to(device)
        labels = labels.to(device)
        
        # Zero gradients
        optimizer.zero_grad()
        
        # Forward pass
        predictions = model(text)
        
        # Calculate loss
        loss = criterion(predictions, labels)
        
        # Backward pass
        loss.backward()
        
        # Update weights
        optimizer.step()
        
        # Calculate accuracy
        acc = (predictions.argmax(1) == labels).float().mean()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
    
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def evaluate(model, iterator, criterion, device):
    """
    Evaluate the model.
    
    Args:
        model: PyTorch model
        iterator: DataLoader
        criterion: Loss function
        device: torch.device
    
    Returns:
        Average loss and accuracy
    """
    model.eval()
    
    epoch_loss = 0
    epoch_acc = 0
    
    with torch.no_grad():
        for batch in iterator:
            text, labels = batch
            text = text.to(device)
            labels = labels.to(device)
            
            predictions = model(text)
            loss = criterion(predictions, labels)
            acc = (predictions.argmax(1) == labels).float().mean()
            
            epoch_loss += loss.item()
            epoch_acc += acc.item()
    
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def predict(model, iterator, device):
    """
    Generate predictions for all samples.
    
    Args:
        model: PyTorch model
        iterator: DataLoader
        device: torch.device
    
    Returns:
        Numpy arrays of predictions and true labels
    """
    model.eval()
    
    all_preds = []
    all_labels = []
    
    with torch.no_grad():
        for batch in iterator:
            text, labels = batch
            text = text.to(device)
            
            predictions = model(text)
            preds = predictions.argmax(1).cpu().numpy()
            
            all_preds.extend(preds)
            all_labels.extend(labels.numpy())
    
    return np.array(all_preds), np.array(all_labels)

def count_parameters(model):
    """Count trainable parameters in a model."""
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print("\nTraining functions defined:")
print("  - train_epoch(): Train for one epoch")
print("  - evaluate(): Evaluate model performance")
print("  - predict(): Generate predictions")
print("  - count_parameters(): Count trainable parameters")

TRAINING UTILITIES

Training functions defined:
  - train_epoch(): Train for one epoch
  - evaluate(): Evaluate model performance
  - predict(): Generate predictions
  - count_parameters(): Count trainable parameters


## 5. Model Training

## 6. Model Evaluation

In [11]:
# Hyperparameters
print("=" * 80)
print("HYPERPARAMETERS")
print("=" * 80)

EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 5  # 5 classes (1-5 stars, 0-indexed as 0-4)
N_EPOCHS = 10
LEARNING_RATE = 0.001

hyperparams = {
    'vocab_size': vocab_size,
    'embedding_dim': EMBEDDING_DIM,
    'hidden_dim': HIDDEN_DIM,
    'output_dim': OUTPUT_DIM,
    'n_epochs': N_EPOCHS,
    'learning_rate': LEARNING_RATE,
    'batch_size': BATCH_SIZE,
    'max_len': MAX_LEN
}

print("\nModel hyperparameters:")
for key, value in hyperparams.items():
    print(f"  {key}: {value}")

# Loss function with class weights for imbalanced data
print("\nCalculating class weights for imbalanced dataset...")
class_counts = np.bincount(y_train)
class_weights = 1.0 / class_counts
class_weights = class_weights / class_weights.sum() * len(class_weights)
class_weights = torch.FloatTensor(class_weights).to(device)

print(f"Class weights: {class_weights.cpu().numpy()}")
print(f"\n(Higher weight = rarer class, will be penalized more in loss)")

Class weights: [1. 1. 1. 1. 1.]

(Higher weight = rarer class, will be penalized more in loss)


In [12]:
# Train LSTM Model
print("\n" + "=" * 80)
print("TRAINING BIDIRECTIONAL LSTM")
print("=" * 80)


TRAINING BIDIRECTIONAL LSTM


In [None]:
N_EPOCHS = 10
# Initialize model
lstm_model = BiLSTM(
    vocab_size=vocab_size,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
    n_layers=2,
    dropout=0.5
).to(device)

print(f"\nModel architecture:")
print(lstm_model)
print(f"\nTrainable parameters: {count_parameters(lstm_model):,}")

# Optimizer and loss
lstm_optimizer = optim.Adam(lstm_model.parameters(), lr=LEARNING_RATE)
criterion = nn.CrossEntropyLoss(weight=class_weights)

# Training loop
print(f"\nTraining for {N_EPOCHS} epochs...")
print(f"Using device: {device}\n")

lstm_train_losses = []
lstm_train_accs = []
lstm_val_losses = []
lstm_val_accs = []

best_val_loss = float('inf')
start_time = time.time()

for epoch in range(N_EPOCHS):
    epoch_start = time.time()
    
    train_loss, train_acc = train_epoch(lstm_model, train_loader, lstm_optimizer, criterion, device)
    val_loss, val_acc = evaluate(lstm_model, val_loader, criterion, device)
    
    lstm_train_losses.append(train_loss)
    lstm_train_accs.append(train_acc)
    lstm_val_losses.append(val_loss)
    lstm_val_accs.append(val_acc)
    
    epoch_time = time.time() - epoch_start
    
    print(f"Epoch {epoch+1:02d}/{N_EPOCHS} | Time: {epoch_time:.1f}s")
    print(f"  Train Loss: {train_loss:.4f} | Train Acc: {train_acc*100:.2f}%")
    print(f"  Val Loss:   {val_loss:.4f} | Val Acc:   {val_acc*100:.2f}%")
    
    # Save best model
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(lstm_model.state_dict(), '../Outputs/Models/lstm_best.pt')
        print(f"   Saved new best model (val_loss: {val_loss:.4f})")
    print()

total_time = time.time() - start_time
print(f"\nTraining complete in {total_time:.1f}s ({total_time/60:.1f} minutes)")
print(f"  Best validation loss: {best_val_loss:.4f}")

# Load best model for evaluation
lstm_model.load_state_dict(torch.load('../Outputs/Models/lstm_best.pt'))
print(f"\nLoaded best model from checkpoint")

Epoch 01/10 | Time: 188.9s
  Train Loss: 1.1159 | Train Acc: 51.02%
  Val Loss:   0.9878 | Val Acc:   57.85%
   Saved new best model (val_loss: 0.9878)



In [None]:
# Evaluate model on test set
print("=" * 80)
print("MODEL EVALUATION ON TEST SET")
print("=" * 80)

# Get predictions
print("\nGenerating predictions...")
lstm_preds, y_test_np = predict(lstm_model, test_loader, device)

# Calculate metrics
accuracy = accuracy_score(y_test_np, lstm_preds)
f1_macro = f1_score(y_test_np, lstm_preds, average='macro')
f1_weighted = f1_score(y_test_np, lstm_preds, average='weighted')

print("\n" + "=" * 80)
print("TEST SET PERFORMANCE")
print("=" * 80)

print(f"\nBidirectional LSTM:")
print(f"  Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")
print(f"  F1-Score (macro): {f1_macro:.4f}")
print(f"  F1-Score (weighted): {f1_weighted:.4f}")

results_summary = {
    'Model': 'Bidirectional LSTM',
    'Accuracy': accuracy,
    'F1 (Macro)': f1_macro,
    'F1 (Weighted)': f1_weighted
}
results_df = pd.DataFrame([results_summary])

In [None]:
# Visualize confusion matrix
print("\n" + "=" * 80)
print("CONFUSION MATRIX")
print("=" * 80)

fig, ax = plt.subplots(figsize=(8, 6))

cm = confusion_matrix(y_test_np, lstm_preds)
cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

sns.heatmap(cm_normalized, annot=True, fmt='.2f', cmap='Blues',
           xticklabels=[1, 2, 3, 4, 5], yticklabels=[1, 2, 3, 4, 5],
           ax=ax, cbar_kws={'label': 'Proportion'})
ax.set_xlabel('Predicted Rating', fontweight='bold')
ax.set_ylabel('Actual Rating', fontweight='bold')
ax.set_title('Bidirectional LSTM - Confusion Matrix', fontweight='bold')

plt.tight_layout()
plt.savefig('../Outputs/Plots/12_lstm_confusion_matrix.png', dpi=300, bbox_inches='tight')
plt.show()

print("Plot saved: 12_lstm_confusion_matrix.png")

In [None]:
# Save all results
print("=" * 80)
print("SAVING RESULTS")
print("=" * 80)

# Define star names for classification report
star_names = ['1 Star', '2 Stars', '3 Stars', '4 Stars', '5 Stars']

# Save results CSV
results_df.to_csv('../Outputs/lstm_results.csv', index=False)
print("Saved: lstm_results.csv")

# Save vocabulary
with open('../Outputs/Models/vocab.pkl', 'wb') as f:
    pickle.dump({'word2idx': word2idx, 'idx2word': idx2word}, f)
print("Saved: vocab.pkl")

# Save hyperparameters
import json
with open('../Outputs/lstm_hyperparameters.json', 'w') as f:
    json.dump(hyperparams, f, indent=2)
print("Saved: lstm_hyperparameters.json")

# Save detailed results
detailed_results = {
    'model': 'Bidirectional LSTM',
    'accuracy': float(accuracy),
    'f1_macro': float(f1_macro),
    'f1_weighted': float(f1_weighted),
    'per_class_report': classification_report(y_test_np, lstm_preds,
                                              target_names=star_names,
                                              output_dict=True)
}

with open('../Outputs/lstm_detailed_results.json', 'w') as f:
    json.dump(detailed_results, f, indent=2)
print("Saved: lstm_detailed_results.json")

print("\n" + "="*80)
print("ALL RESULTS SAVED SUCCESSFULLY")
print("="*80)

In [None]:
# Detailed classification report
print("\n" + "=" * 80)
print("DETAILED CLASSIFICATION REPORT")
print("=" * 80)

star_names = ['1 Star', '2 Stars', '3 Stars', '4 Stars', '5 Stars']

print(f"\nBidirectional LSTM")
print('='*80)
print(classification_report(y_test_np, lstm_preds, target_names=star_names))

## 7. Save Results

# Plot training curves
print("\n" + "=" * 80)
print("TRAINING CURVES")
print("=" * 80)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

epochs = range(1, len(lstm_train_losses) + 1)

# Loss
axes[0].plot(epochs, lstm_train_losses, 'b-', label='Training')
axes[0].plot(epochs, lstm_val_losses, 'r-', label='Validation')
axes[0].set_xlabel('Epoch', fontweight='bold')
axes[0].set_ylabel('Loss', fontweight='bold')
axes[0].set_title('Bidirectional LSTM - Loss', fontweight='bold')
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# Accuracy
axes[1].plot(epochs, [acc*100 for acc in lstm_train_accs], 'b-', label='Training')
axes[1].plot(epochs, [acc*100 for acc in lstm_val_accs], 'r-', label='Validation')
axes[1].set_xlabel('Epoch', fontweight='bold')
axes[1].set_ylabel('Accuracy (%)', fontweight='bold')
axes[1].set_title('Bidirectional LSTM - Accuracy', fontweight='bold')
axes[1].legend()
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('../Outputs/Plots/13_lstm_training_curves.png', dpi=300, bbox_inches='tight')
plt.show()

print("Plot saved: 13_lstm_training_curves.png")

In [None]:
# Plot training curves
print("\n" + "=" * 80)
print("TRAINING CURVES")
print("=" * 80)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

epochs = range(1, len(lstm_train_losses) + 1)

# Loss
axes[0].plot(epochs, lstm_train_losses, 'b-', label='Training')
axes[0].plot(epochs, lstm_val_losses, 'r-', label='Validation')
axes[0].set_xlabel('Epoch', fontweight='bold')
axes[0].set_ylabel('Loss', fontweight='bold')
axes[0].set_title('Bidirectional LSTM - Loss', fontweight='bold')
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# Accuracy
axes[1].plot(epochs, [acc*100 for acc in lstm_train_accs], 'b-', label='Training')
axes[1].plot(epochs, [acc*100 for acc in lstm_val_accs], 'r-', label='Validation')
axes[1].set_xlabel('Epoch', fontweight='bold')
axes[1].set_ylabel('Accuracy (%)', fontweight='bold')
axes[1].set_title('Bidirectional LSTM - Accuracy', fontweight='bold')
axes[1].legend()
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('../Outputs/Plots/13_lstm_training_curves.png', dpi=300, bbox_inches='tight')
plt.show()

print("Plot saved: 13_lstm_training_curves.png")