# LSTM Model Training for Amazon Sentiment Analysis

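This notebook demonstrates the complete process of building, training, and evaluating an LSTM model for sentiment analysis of Amazon product reviews. All paths are relative (`../src`, `../data`, `../models`), so run the notebook from the directory it lives in.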

## Objectives:
1. Prepare data for LSTM training
2. Build and configure LSTM architecture
3. Train the model with proper validation
4. Evaluate model performance
5. Compare LSTM predictions with VADER sentiment
6. Save trained model for production use


In [None]:
# Import required libraries
# Setup cell: third-party imports, global warning/seed/plot configuration, and
# the project-local modules that the rest of the notebook relies on.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
from sklearn.model_selection import train_test_split
import warnings
# Suppresses every warning category for the rest of the session, including any
# raised while the libraries below are imported.
warnings.filterwarnings('ignore')

# TensorFlow and Keras
import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.utils import plot_model

# Set random seeds for reproducibility
# Seeds NumPy's global generator and TensorFlow's global seed with the same value.
# NOTE(review): Python's own `random` module is not seeded here - confirm that
# the project modules do not depend on it.
np.random.seed(42)
tf.random.set_seed(42)

# Set style
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Add src to path
# The entry is relative, so it resolves against the current working directory;
# it must be appended before the project imports below can succeed.
import sys
sys.path.append('../src')

# Project-local modules found through the '../src' path entry added above.
from data_processor import ReviewDataProcessor, get_data_stats
from model_trainer import LSTMSentimentTrainer, evaluate_model
from analysis_engine import SentimentAnalysisEngine

# Environment report: TensorFlow version and whether at least one GPU is visible.
print("Libraries imported successfully!")
print(f"TensorFlow version: {tf.__version__}")
print(f"Using GPU: {len(tf.config.experimental.list_physical_devices('GPU')) > 0}")

## 1. Data Loading and Preprocessing

In [None]:
# Read the raw review export into a DataFrame
print("Loading dataset...")
df = pd.read_csv('../data/raw_reviews.csv')

# Report the frame's dimensions and its column names
column_names = list(df.columns)
print(f"Dataset shape: {df.shape}")
print(f"Columns: {column_names}")

# Summary statistics from the project helper, printed one "name: value" per line
print("\nDataset Statistics:")
stats = get_data_stats(df)
for stat_name, stat_value in stats.items():
    print(f"  {stat_name}: {stat_value}")

# Number of reviews per rating, ordered by rating value
print("\nRating Distribution:")
rating_counts = df['rating'].value_counts().sort_index()
print(rating_counts)

In [None]:
# Processor that tokenizes and pads the reviews (10,000-word cap, length 200)
processor = ReviewDataProcessor(max_vocab_size=10000, max_sequence_length=200)

# Split the frame into train/test features and labels
print("Preparing data for training...")
X_train, X_test, y_train, y_test = processor.prepare_data(
    df,
    test_size=0.2,
    random_state=42,
)

print(f"\nTraining set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")
print(f"Vocabulary size: {len(processor.word_index)}")
print(f"Max sequence length: {processor.max_sequence_length}")

# Per-class sample counts in the training labels; label 1 is reported as
# Positive and every other label as Negative.
print("\nClass distribution in training set:")
unique, counts = np.unique(y_train, return_counts=True)
n_train = len(y_train)
for class_id, class_count in zip(unique, counts):
    sentiment = 'Positive' if class_id == 1 else 'Negative'
    share = class_count / n_train * 100
    print(f"  Class {int(class_id)} ({sentiment}): {class_count} ({share:.1f}%)")

In [None]:
# Four-panel overview of the prepared training data
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
ax1, ax2, ax3, ax4 = axes.ravel()
title_style = dict(fontsize=14, fontweight='bold')

# Panel 1: class balance, using the counts computed in the preparation cell
ax1.bar(['Negative', 'Positive'], counts, color=['red', 'green'], alpha=0.7)
ax1.set_title('Training Set Class Distribution', **title_style)
ax1.set_ylabel('Number of Samples')
ax1.grid(True, alpha=0.3)

# Panel 2: number of positive (non-padding) token ids per training sequence
sequence_lengths = (X_train > 0).sum(axis=1)
mean_length = sequence_lengths.mean()
ax2.hist(sequence_lengths, bins=50, alpha=0.7, color='skyblue', edgecolor='black')
ax2.set_title('Sequence Length Distribution', **title_style)
ax2.set_xlabel('Sequence Length')
ax2.set_ylabel('Frequency')
ax2.axvline(mean_length, color='red', linestyle='--',
            label=f'Mean: {mean_length:.0f}')
ax2.legend()
ax2.grid(True, alpha=0.3)

# Panel 3: histogram of the first 1000 values stored in the word index.
# These are the index values themselves, not word frequencies.
word_counts = list(processor.word_index.values())
ax3.hist(word_counts[:1000], bins=50, alpha=0.7, color='orange', edgecolor='black')
ax3.set_title('Vocabulary Index Distribution (Top 1000)', **title_style)
ax3.set_xlabel('Word Index')
ax3.set_ylabel('Frequency')
ax3.grid(True, alpha=0.3)

# Panel 4: lengths of the first 100 sequences against the configured maximum
max_len = processor.max_sequence_length
sample_lengths = sequence_lengths[:100]
ax4.plot(sample_lengths, 'b-', alpha=0.7)
ax4.set_title('Sample Sequence Lengths (First 100)', **title_style)
ax4.set_xlabel('Sample Index')
ax4.set_ylabel('Sequence Length')
ax4.axhline(max_len, color='red', linestyle='--',
            label=f'Max Length: {max_len}')
ax4.legend()
ax4.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Text summary of the length distribution
full_length_count = np.sum(sequence_lengths == processor.max_sequence_length)
full_length_share = full_length_count / len(sequence_lengths) * 100
print(f"Average sequence length: {sequence_lengths.mean():.1f} tokens")
print(f"95th percentile: {np.percentile(sequence_lengths, 95):.0f} tokens")
print(f"Sequences using full length: {full_length_count} ({full_length_share:.1f}%)")

## 2. LSTM Model Architecture Design

In [None]:
# Hyperparameters for the LSTM trainer
# NOTE(review): vocab_size is the raw length of the word index. Confirm that
# LSTMSentimentTrainer sizes its embedding so the largest token id (and the
# padding id 0) fit - this cannot be verified from the notebook alone.
vocab_size = len(processor.word_index)
embedding_dim = 100
lstm_units = 128
max_sequence_length = processor.max_sequence_length

trainer_config = {
    'vocab_size': vocab_size,
    'embedding_dim': embedding_dim,
    'lstm_units': lstm_units,
    'max_sequence_length': max_sequence_length,
}
trainer = LSTMSentimentTrainer(**trainer_config)

# Echo the configuration that was just passed to the trainer
print("LSTM Trainer initialized with:")
print(f"  Vocabulary size: {vocab_size:,}")
print(f"  Embedding dimension: {embedding_dim}")
print(f"  LSTM units: {lstm_units}")
print(f"  Max sequence length: {max_sequence_length}")

In [None]:
# Build the LSTM model
print("Building LSTM model architecture...")

# Regularisation settings: dropout on layer inputs and recurrent state, an L2
# penalty, and a bidirectional recurrent layer.
model = trainer.build_model(
    dropout_rate=0.3,
    recurrent_dropout=0.3,
    l2_reg=0.01,
    bidirectional=True
)

# count_params() returns the total number of weights (trainable and
# non-trainable), so the figure is labelled as a total rather than "trainable".
print(f"\nModel has {model.count_params():,} total parameters")

# Display model architecture
print("\nModel Summary:")
model.summary()

In [None]:
# Visualize model architecture
# Best-effort: rendering needs optional plotting dependencies, so a failure is
# reported instead of stopping the notebook.
import os  # local import: only needed for the existence check in the handler

architecture_path = '../models/model_architecture.png'
diagram_rendered = False
try:
    # Imported explicitly so the cell does not rely on IPython injecting
    # `display` into the global namespace.
    from IPython.display import Image, display
    plot_model(model, to_file=architecture_path,
               show_shapes=True, show_layer_names=True, dpi=150)
    diagram_rendered = True
    display(Image(architecture_path))
except Exception as e:
    print(f"Could not display model plot: {e}")
    # Only claim the file was saved when rendering completed and the file is
    # actually on disk; previously this was printed even when plot_model failed.
    if diagram_rendered and os.path.isfile(architecture_path):
        print(f"Model architecture visualization saved to {architecture_path}")
    else:
        print(f"Model architecture visualization was NOT saved to {architecture_path}")

## 3. Model Training

In [None]:
# Kick off training
print("Starting model training...")
print("This may take several minutes depending on your hardware.")

# Training hyperparameters
epochs = 25  # Reduced for demo, increase to 50+ for production
batch_size = 32
validation_split = 0.1

# NOTE(review): the test split is handed to train_model. Confirm it is used
# only for the final 'test_*' metrics and not for early stopping or checkpoint
# selection, otherwise the reported test scores are optimistic.
history = trainer.train_model(
    X_train,
    y_train,
    X_test,
    y_test,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=validation_split,
)

# Headline numbers: best per-epoch validation accuracy and final test accuracy
best_val_accuracy = max(history['val_accuracy'])
final_test_accuracy = history['test_accuracy']
print("\nTraining completed!")
print(f"Best validation accuracy: {best_val_accuracy:.4f}")
print(f"Final test accuracy: {final_test_accuracy:.4f}")

## 4. Training History Analysis

In [None]:
# Plot training history
# Four panels: loss, accuracy, precision/recall, and either the learning-rate
# schedule or (when no schedule was logged) a bar chart of final test metrics.
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 12))

# Training & Validation Loss
epochs_range = range(1, len(history['loss']) + 1)
ax1.plot(epochs_range, history['loss'], 'b-', label='Training Loss', alpha=0.8)
ax1.plot(epochs_range, history['val_loss'], 'r-', label='Validation Loss', alpha=0.8)
ax1.set_title('Model Loss', fontsize=14, fontweight='bold')
ax1.set_xlabel('Epoch')
ax1.set_ylabel('Loss')
ax1.legend()
ax1.grid(True, alpha=0.3)

# Training & Validation Accuracy
ax2.plot(epochs_range, history['accuracy'], 'b-', label='Training Accuracy', alpha=0.8)
ax2.plot(epochs_range, history['val_accuracy'], 'r-', label='Validation Accuracy', alpha=0.8)
ax2.set_title('Model Accuracy', fontsize=14, fontweight='bold')
ax2.set_xlabel('Epoch')
ax2.set_ylabel('Accuracy')
ax2.legend()
ax2.grid(True, alpha=0.3)

# Precision and Recall
ax3.plot(epochs_range, history['precision'], 'g-', label='Training Precision', alpha=0.8)
ax3.plot(epochs_range, history['val_precision'], 'orange', label='Validation Precision', alpha=0.8)
ax3.plot(epochs_range, history['recall'], 'purple', label='Training Recall', alpha=0.8)
ax3.plot(epochs_range, history['val_recall'], 'brown', label='Validation Recall', alpha=0.8)
ax3.set_title('Precision and Recall', fontsize=14, fontweight='bold')
ax3.set_xlabel('Epoch')
ax3.set_ylabel('Score')
ax3.legend()
ax3.grid(True, alpha=0.3)

# Learning rate (if available)
# The logged key depends on the Keras version: 'lr' in TF 2.x Keras,
# 'learning_rate' in Keras 3. Accept either so the panel is not silently skipped.
lr_key = next((key for key in ('lr', 'learning_rate') if key in history), None)
if lr_key is not None:
    ax4.plot(epochs_range, history[lr_key], 'g-', alpha=0.8)
    ax4.set_title('Learning Rate Schedule', fontsize=14, fontweight='bold')
    ax4.set_xlabel('Epoch')
    ax4.set_ylabel('Learning Rate')
    ax4.set_yscale('log')
    ax4.grid(True, alpha=0.3)
else:
    # Show final metrics instead
    # Named metric_names (not `metrics`) so this list never shadows the
    # evaluation dict that later cells store under `metrics`.
    metric_names = ['Test Loss', 'Test Accuracy', 'Test Precision', 'Test Recall', 'Test F1']
    values = [history['test_loss'], history['test_accuracy'],
              history['test_precision'], history['test_recall'], history['test_f1']]

    bars = ax4.bar(metric_names, values, color=['red', 'green', 'blue', 'orange', 'purple'], alpha=0.7)
    ax4.set_title('Final Test Metrics', fontsize=14, fontweight='bold')
    ax4.set_ylabel('Score')
    # The loss is not bounded by 1, so grow the axis when needed instead of
    # clipping the loss bar; the limit stays at 1 when every value fits.
    ax4.set_ylim(0, max(1.0, max(values) * 1.1))
    ax4.grid(True, alpha=0.3)

    # Add value labels on bars
    for bar, value in zip(bars, values):
        height = bar.get_height()
        ax4.text(bar.get_x() + bar.get_width()/2., height,
                f'{value:.3f}', ha='center', va='bottom')

plt.tight_layout()
plt.show()

# Print final metrics
print("\nFinal Model Performance:")
print(f"  Test Loss: {history['test_loss']:.4f}")
print(f"  Test Accuracy: {history['test_accuracy']:.4f}")
print(f"  Test Precision: {history['test_precision']:.4f}")
print(f"  Test Recall: {history['test_recall']:.4f}")
print(f"  Test F1-Score: {history['test_f1']:.4f}")

## 5. Model Evaluation and Analysis

In [None]:
# Score the held-out test set and threshold the probabilities at 0.5
print("Evaluating model on test set...")
decision_threshold = 0.5
y_pred_prob = trainer.predict(X_test)
y_pred = (y_pred_prob >= decision_threshold).astype(int)

# Metrics dictionary produced by the project helper
metrics = evaluate_model(trainer.model, X_test, y_test)

# Headline scores, four decimals each
print("\nComprehensive Model Evaluation:")
for score_label, score_key in (
    ("Accuracy", 'accuracy'),
    ("Precision", 'precision'),
    ("Recall", 'recall'),
    ("F1-Score", 'f1_score'),
):
    print(f"  {score_label}: {metrics[score_key]:.4f}")

# Raw confusion-matrix cells
print("\nConfusion Matrix:")
for cell_label, cell_key in (
    ("True Negatives", 'true_negatives'),
    ("False Positives", 'false_positives'),
    ("False Negatives", 'false_negatives'),
    ("True Positives", 'true_positives'),
):
    print(f"  {cell_label}: {metrics[cell_key]}")

In [None]:
# Four-panel evaluation figure: confusion matrix, ROC, score histograms, confidence
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
ax1, ax2, ax3, ax4 = axes.ravel()
title_style = dict(fontsize=14, fontweight='bold')
class_names = ['Negative', 'Positive']

# Panel 1: confusion matrix heatmap
cm = np.array(metrics['confusion_matrix'])
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax1,
            xticklabels=class_names,
            yticklabels=class_names)
ax1.set_title('Confusion Matrix', **title_style)
ax1.set_xlabel('Predicted Label')
ax1.set_ylabel('True Label')

# Panel 2: ROC curve with the chance diagonal for reference
fpr, tpr, _ = roc_curve(y_test, y_pred_prob)
roc_auc = auc(fpr, tpr)
ax2.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.3f})')
ax2.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', alpha=0.8)
ax2.set_xlim([0.0, 1.0])
ax2.set_ylim([0.0, 1.05])
ax2.set_xlabel('False Positive Rate')
ax2.set_ylabel('True Positive Rate')
ax2.set_title('ROC Curve', **title_style)
ax2.legend(loc="lower right")
ax2.grid(True, alpha=0.3)

# Panel 3: predicted-probability histograms split by true class
for class_value, class_name, class_color in ((0, 'Negative', 'red'), (1, 'Positive', 'green')):
    ax3.hist(y_pred_prob[y_test == class_value], bins=50, alpha=0.5,
             label=class_name, color=class_color, density=True)
ax3.axvline(0.5, color='black', linestyle='--', alpha=0.8, label='Decision Threshold')
ax3.set_title('Prediction Probability Distribution', **title_style)
ax3.set_xlabel('Prediction Probability')
ax3.set_ylabel('Density')
ax3.legend()
ax3.grid(True, alpha=0.3)

# Panel 4: confidence is the distance from the 0.5 threshold rescaled to 0-1
confidence = np.abs(y_pred_prob - 0.5) * 2
correct_predictions = (y_pred == y_test)

for outcome_mask, outcome_color, outcome_label in (
    (correct_predictions, 'green', 'Correct'),
    (~correct_predictions, 'red', 'Incorrect'),
):
    ax4.scatter(confidence[outcome_mask], y_pred_prob[outcome_mask],
                alpha=0.6, color=outcome_color, label=outcome_label, s=20)
ax4.set_xlabel('Prediction Confidence')
ax4.set_ylabel('Prediction Probability')
ax4.set_title('Prediction Confidence vs Accuracy', **title_style)
ax4.legend()
ax4.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Numeric companions to the figure
mean_conf_correct = confidence[correct_predictions].mean()
mean_conf_incorrect = confidence[~correct_predictions].mean()
print(f"\nROC AUC Score: {roc_auc:.4f}")
print(f"Average Prediction Confidence: {confidence.mean():.4f}")
print(f"Confidence for Correct Predictions: {mean_conf_correct:.4f}")
print(f"Confidence for Incorrect Predictions: {mean_conf_incorrect:.4f}")

## 6. Detailed Classification Analysis

In [None]:
# Per-class precision/recall/F1 table from scikit-learn
print("Detailed Classification Report:")
print("=" * 50)
target_names = ['Negative', 'Positive']
print(classification_report(y_test, y_pred, target_names=target_names))


def _count_with_share(mask):
    """Format the number of True entries in mask with its share of the test set."""
    hits = np.sum(mask)
    return f"{hits} ({hits/len(y_test)*100:.1f}%)"


# Breakdown of the mistakes, plus the correct-but-unsure predictions
print("\nError Analysis:")
print("=" * 30)

# Predicted positive while the true label is negative
false_positives = (y_pred == 1) & (y_test == 0)
print(f"False Positives: {_count_with_share(false_positives)}")

# Predicted negative while the true label is positive
false_negatives = (y_pred == 0) & (y_test == 1)
print(f"False Negatives: {_count_with_share(false_negatives)}")

# Wrong predictions made with confidence above 0.7
high_conf_errors = ~correct_predictions & (confidence > 0.7)
print(f"High Confidence Errors: {_count_with_share(high_conf_errors)}")

# Right predictions made with confidence below 0.3
low_conf_correct = correct_predictions & (confidence < 0.3)
print(f"Low Confidence Correct: {_count_with_share(low_conf_correct)}")

## 7. Save Trained Model

In [None]:
# Destination files for the trained network and its tokenizer
model_path = '../models/lstm_sentiment_model.h5'
tokenizer_path = '../models/tokenizer.pkl'

print("Saving trained model and tokenizer...")

# Persist both artifacts through the project helpers
trainer.save_model(model_path)
processor.save_tokenizer(tokenizer_path)

# Confirm where each artifact was written
for artifact_label, artifact_path in (("Model", model_path), ("Tokenizer", tokenizer_path)):
    print(f"{artifact_label} saved to: {artifact_path}")
print("\nModel and tokenizer are ready for production use!")

## 8. Model Testing with Sample Predictions

In [None]:
# Test the model with sample reviews
# Smoke test: run a handful of hand-written reviews through the same
# preprocessing and model used above and compare against the expected label.
print("Testing model with sample reviews:")
print("=" * 50)

# Sample reviews for testing
test_reviews = [
    "This product is absolutely amazing! Great quality and fast shipping. Highly recommend!",
    "Terrible product. Broke after one day. Complete waste of money. Very disappointed.",
    "It's okay. Does what it's supposed to do but nothing special. Average quality.",
    "Outstanding service and excellent product quality. Will definitely buy again!",
    "Poor quality control and awful customer service. Avoid at all costs!"
]

expected_sentiments = ['Positive', 'Negative', 'Neutral', 'Positive', 'Negative']

for i, (review, expected) in enumerate(zip(test_reviews, expected_sentiments), start=1):
    # Preprocess the review
    X_sample = processor.preprocess_single_text(review)

    # Get prediction
    # Flatten before indexing so a (1,) and a (1, 1) model output both yield a
    # plain float that the format specs below accept.
    prob = float(np.ravel(trainer.predict(X_sample))[0])
    prediction = 'Positive' if prob >= 0.5 else 'Negative'
    # Deliberately NOT named `confidence`: that name holds the per-sample
    # confidence array for the whole test set, which the error-analysis cell
    # reads. Rebinding it to a scalar here would break re-running that cell.
    sample_confidence = abs(prob - 0.5) * 2

    # The model is binary, so a 'Neutral' expectation counts as met when the
    # probability lands in the undecided band around the threshold.
    is_correct = prediction == expected or (expected == 'Neutral' and 0.3 <= prob <= 0.7)

    print(f"\nSample {i}:")
    print(f"Text: {review}")
    print(f"Expected: {expected}")
    print(f"Predicted: {prediction} (probability: {prob:.3f}, confidence: {sample_confidence:.3f})")
    print(f"Correct: {'✓' if is_correct else '✗'}")

## 9. Compare LSTM with VADER Sentiment Analysis

In [None]:
# Run the saved model side by side with VADER on the first three sample reviews
print("Comparing LSTM with VADER sentiment analysis...")

# NOTE(review): the handler below wraps both loading the engine and running the
# analysis, so an analysis failure is also reported as a load failure.
try:
    engine = SentimentAnalysisEngine(model_path, tokenizer_path)
    print("Analysis engine loaded successfully!")

    print("\nLSTM vs VADER Comparison:")
    print("=" * 50)

    for sample_no, review in enumerate(test_reviews[:3], start=1):
        # Combined analysis result for this review
        analysis = engine.comprehensive_analysis(review)

        # Show at most the first 80 characters of long reviews
        shown_text = f"{review[:80]}..." if len(review) > 80 else review
        agreement_mark = '✓' if analysis['sentiment_agreement'] else '✗'

        print(f"\nSample {sample_no}:")
        print(f"Text: {shown_text}")
        print(f"VADER: {analysis['vader_sentiment']} (score: {analysis['vader_compound']:.3f})")
        print(f"LSTM: {analysis['lstm_sentiment']} (prob: {analysis['lstm_probability']:.3f})")
        print(f"Agreement: {agreement_mark}")

except Exception as e:
    print(f"Could not load analysis engine: {e}")
    print("This is normal if running in a limited environment.")

## 10. Summary and Next Steps

In [None]:
# Text report that pulls together the numbers computed earlier in the notebook
banner = "=" * 50


def _print_section(heading, entries):
    """Print a blank line, the section heading, then each entry indented by two spaces."""
    print(f"\n{heading}")
    for entry in entries:
        print(f"  {entry}")


print("LSTM MODEL TRAINING SUMMARY")
print(banner)

_print_section("📊 DATASET:", [
    f"• Total samples: {len(X_train) + len(X_test):,}",
    f"• Training samples: {len(X_train):,}",
    f"• Test samples: {len(X_test):,}",
    f"• Vocabulary size: {vocab_size:,} words",
    f"• Average sequence length: {sequence_lengths.mean():.1f} tokens",
])

# The architecture description is static text apart from the three values read
# from notebook variables.
_print_section("🏗️ MODEL ARCHITECTURE:", [
    "• Model type: Bidirectional LSTM",
    f"• Embedding dimension: {embedding_dim}",
    f"• LSTM units: {lstm_units}",
    f"• Total parameters: {model.count_params():,}",
    "• Dropout rate: 30%",
    "• L2 regularization: 0.01",
])

_print_section("📈 TRAINING RESULTS:", [
    f"• Training epochs: {len(history['loss'])}",
    f"• Final training accuracy: {history['accuracy'][-1]:.4f}",
    f"• Best validation accuracy: {max(history['val_accuracy']):.4f}",
    f"• Final test accuracy: {history['test_accuracy']:.4f}",
])

_print_section("🎯 MODEL PERFORMANCE:", [
    f"• Test Accuracy: {metrics['accuracy']:.4f}",
    f"• Test Precision: {metrics['precision']:.4f}",
    f"• Test Recall: {metrics['recall']:.4f}",
    f"• Test F1-Score: {metrics['f1_score']:.4f}",
    f"• ROC AUC: {roc_auc:.4f}",
])

_print_section("📁 SAVED ARTIFACTS:", [
    f"• Model: {model_path}",
    f"• Tokenizer: {tokenizer_path}",
    "• Architecture diagram: ../models/model_architecture.png",
])

_print_section("🚀 NEXT STEPS:", [
    "1. Use the trained model for production sentiment analysis",
    "2. Run comprehensive analysis with: python main.py analyze",
    "3. Compare LSTM predictions with VADER sentiment",
    "4. Generate business insights from sentiment-rating discrepancies",
    "5. Consider model improvements: hyperparameter tuning, ensemble methods",
])

# Closing banner
print("\n" + banner)
print("🎉 MODEL TRAINING COMPLETED SUCCESSFULLY!")
print("The model is ready for production use and analysis.")

In [None]:
# Closing checks: artifacts on disk and a coarse accuracy verdict
print("\nFinal Model Validation:")
print("-" * 30)

# Report the size of each saved artifact, or flag it as missing
import os

files_to_check = [model_path, tokenizer_path]
bytes_per_mb = 1024 * 1024
for artifact in files_to_check:
    if not os.path.exists(artifact):
        print(f"✗ {artifact} - Not found")
        continue
    size_mb = os.path.getsize(artifact) / bytes_per_mb
    print(f"✓ {artifact} ({size_mb:.2f} MB)")

# Accuracy tiers: above 0.85 is excellent, above 0.80 is good, otherwise flagged
accuracy = metrics['accuracy']
if accuracy > 0.85:
    verdict = "✓ Model performance is excellent"
elif accuracy > 0.80:
    verdict = "✓ Model performance is good"
else:
    verdict = "⚠ Model performance could be improved"
print(f"\n{verdict} (Accuracy: {accuracy:.3f})")

print("\n🔄 Ready for deployment and business analysis!")