# Model Training and Evaluation

This notebook covers training and evaluating sentiment analysis models:
- Loading preprocessed features
- Training Logistic Regression classifier
- Training Naive Bayes classifier
- Model evaluation and comparison
- Performance metrics analysis

## Import Required Libraries

In [None]:
# Import essential libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import os

# Machine learning libraries
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    classification_report, confusion_matrix, roc_auc_score, roc_curve
)
from sklearn.model_selection import cross_val_score

# Plot appearance: start from matplotlib's default style and switch the
# colour palette to seaborn's "husl" palette.
plt.style.use('default')
sns.set_palette("husl")

# Seed NumPy's global random generator so NumPy-driven randomness repeats
# between runs (the estimators below also pass their own random_state).
np.random.seed(42)

print("Libraries imported successfully!")

## Load Preprocessed Data and Features

In [None]:
# Load the preprocessed reviews. This frame supplies the text column that is
# vectorized and the label column used as the target in the next cell.
df = pd.read_csv('../data/preprocessed_reviews.csv')
print(f"Original dataset shape: {df.shape}")

# Fail fast with a readable message if the columns used downstream are absent,
# instead of surfacing a bare KeyError later in the notebook.
required_columns = ['cleaned_review', 'sentiment']
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    raise KeyError(
        f"preprocessed_reviews.csv is missing required columns: {missing_columns}"
    )

# Load the TF-IDF vectorizer (it is used with .transform() only, so it is
# presumably already fitted by an earlier notebook - confirm if in doubt).
# SECURITY: pickle.load can execute arbitrary code embedded in the file; only
# unpickle artifacts that were produced by this project's own pipeline.
with open('../results/tfidf_vectorizer.pkl', 'rb') as f:
    tfidf_vectorizer = pickle.load(f)

print("✓ TF-IDF vectorizer loaded")
print(f"✓ Vocabulary size: {len(tfidf_vectorizer.get_feature_names_out())}")

In [None]:
# Kept at the top of this cell so the cell is self-contained; it would ideally
# live in the notebook's imports cell.
from sklearn.model_selection import train_test_split

# Reviews that were empty after cleaning are written to CSV as empty fields and
# read back by pandas as NaN. The vectorizer rejects NaN documents, so replace
# them with empty strings (fillna must come before the str cast, otherwise NaN
# would become the literal text 'nan').
review_texts = df['cleaned_review'].fillna('').astype(str)

# Create features and labels
X = tfidf_vectorizer.transform(review_texts)
y = df['sentiment'].values

# Every review must keep its label; a mismatch would silently misalign rows.
assert X.shape[0] == y.shape[0], "feature matrix and label vector differ in length"

print(f"Feature matrix shape: {X.shape}")
print(f"Labels shape: {y.shape}")
# np.bincount needs non-negative integer labels.
# NOTE(review): assumes 'sentiment' is encoded as 0/1 - confirm against the
# preprocessing notebook.
print(f"Class distribution: {np.bincount(y)}")

# Split into train and test sets: 20% held out, stratified on the label so both
# splits keep the same class balance, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"\nTrain set: {X_train.shape}")
print(f"Test set: {X_test.shape}")
print(f"Train labels distribution: {np.bincount(y_train)}")
print(f"Test labels distribution: {np.bincount(y_test)}")

## Train Logistic Regression Model

In [None]:
# Fit a logistic-regression classifier on the training split.
print("Training Logistic Regression model...")

# C sets the regularization, max_iter caps the solver at 1000 iterations and
# random_state fixes the seed. fit() returns the estimator itself, so the
# construction and training can be chained.
lr_model = LogisticRegression(C=1.0, max_iter=1000, random_state=42).fit(
    X_train, y_train
)

print("✓ Logistic Regression model trained successfully!")
print(f"✓ Model coefficients shape: {lr_model.coef_.shape}")
print(f"✓ Model intercept: {lr_model.intercept_[0]:.4f}")

In [None]:
# Score the fitted logistic-regression model on both splits.
# Hard class predictions for train and test, in that order.
lr_train_pred, lr_test_pred = (
    lr_model.predict(split) for split in (X_train, X_test)
)
# Column 1 of predict_proba holds the probability of the second class.
lr_test_proba = lr_model.predict_proba(X_test)[:, 1]

# Accuracy on both splits; the remaining metrics are computed on the test split.
lr_train_acc = accuracy_score(y_train, lr_train_pred)
lr_test_acc = accuracy_score(y_test, lr_test_pred)
lr_precision, lr_recall, lr_f1 = (
    scorer(y_test, lr_test_pred)
    for scorer in (precision_score, recall_score, f1_score)
)
lr_auc = roc_auc_score(y_test, lr_test_proba)

# One "<name>: <value>" line per metric, four decimal places each.
lr_report = [
    ("Training Accuracy", lr_train_acc),
    ("Test Accuracy", lr_test_acc),
    ("Precision", lr_precision),
    ("Recall", lr_recall),
    ("F1-Score", lr_f1),
    ("AUC-ROC", lr_auc),
]
print("Logistic Regression Performance:")
print("\n".join(f"{label}: {score:.4f}" for label, score in lr_report))

## Train Naive Bayes Model

In [None]:
# Fit a multinomial Naive Bayes classifier on the training split.
print("Training Naive Bayes model...")

# alpha=1.0 is the additive (Laplace) smoothing parameter. fit() returns the
# estimator itself, so the construction and training can be chained.
nb_model = MultinomialNB(alpha=1.0).fit(X_train, y_train)

print("✓ Naive Bayes model trained successfully!")
print(f"✓ Model classes: {nb_model.classes_}")
print(f"✓ Class log priors: {nb_model.class_log_prior_}")

In [None]:
# Score the fitted Naive Bayes model on both splits.
# Hard class predictions for train and test, in that order.
nb_train_pred, nb_test_pred = (
    nb_model.predict(split) for split in (X_train, X_test)
)
# Column 1 of predict_proba holds the probability of the second class.
nb_test_proba = nb_model.predict_proba(X_test)[:, 1]

# Accuracy on both splits; the remaining metrics are computed on the test split.
nb_train_acc = accuracy_score(y_train, nb_train_pred)
nb_test_acc = accuracy_score(y_test, nb_test_pred)
nb_precision, nb_recall, nb_f1 = (
    scorer(y_test, nb_test_pred)
    for scorer in (precision_score, recall_score, f1_score)
)
nb_auc = roc_auc_score(y_test, nb_test_proba)

# One "<name>: <value>" line per metric, four decimal places each.
nb_report = [
    ("Training Accuracy", nb_train_acc),
    ("Test Accuracy", nb_test_acc),
    ("Precision", nb_precision),
    ("Recall", nb_recall),
    ("F1-Score", nb_f1),
    ("AUC-ROC", nb_auc),
]
print("Naive Bayes Performance:")
print("\n".join(f"{label}: {score:.4f}" for label, score in nb_report))

## Model Comparison

In [None]:
# Side-by-side metric table: one row per model, one column per metric.
comparison_columns = [
    'Model', 'Train Accuracy', 'Test Accuracy',
    'Precision', 'Recall', 'F1-Score', 'AUC-ROC',
]
comparison_rows = [
    ['Logistic Regression', lr_train_acc, lr_test_acc,
     lr_precision, lr_recall, lr_f1, lr_auc],
    ['Naive Bayes', nb_train_acc, nb_test_acc,
     nb_precision, nb_recall, nb_f1, nb_auc],
]
comparison_df = pd.DataFrame(comparison_rows, columns=comparison_columns)

print("Model Comparison:")
print("=" * 80)
print(comparison_df.round(4))

# The model with the highest test F1-score is reported as the best one.
best_model_idx = comparison_df['F1-Score'].idxmax()
best_model = comparison_df.at[best_model_idx, 'Model']
print(f"\nBest performing model: {best_model}")

In [None]:
# 2x2 grid of bar charts, one panel per test metric, one bar per model.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

metrics = ['Test Accuracy', 'Precision', 'Recall', 'F1-Score']
x_pos = np.arange(len(comparison_df))
width = 0.35

# axes.flat walks the grid row by row, matching the order of `metrics`.
for ax, metric in zip(axes.flat, metrics):
    bars = ax.bar(x_pos, comparison_df[metric], width)
    ax.set(
        title=f'{metric} Comparison',
        xlabel='Model',
        ylabel=metric,
        ylim=(0, 1.1),  # headroom above 1.0 so the value labels fit
    )
    ax.set_xticks(x_pos)
    ax.set_xticklabels(comparison_df['Model'], rotation=45)

    # Write each bar's value just above its top edge, centred on the bar.
    for bar in bars:
        height = bar.get_height()
        bar_centre = bar.get_x() + bar.get_width()/2.
        ax.text(bar_centre, height + 0.01, f'{height:.3f}',
                ha='center', va='bottom')

plt.tight_layout()
plt.show()

## Confusion Matrix and Classification Reports

In [None]:
# Confusion matrices of both models on the test split, drawn side by side.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

lr_cm = confusion_matrix(y_test, lr_test_pred)
nb_cm = confusion_matrix(y_test, nb_test_pred)

class_labels = ['Negative', 'Positive']
cm_panels = [
    (lr_cm, 'Blues', 'Logistic Regression\nConfusion Matrix'),
    (nb_cm, 'Greens', 'Naive Bayes\nConfusion Matrix'),
]

# Same layout for every panel; only the matrix, colour map and title differ.
for ax, (matrix, colormap, panel_title) in zip(axes, cm_panels):
    sns.heatmap(matrix, annot=True, fmt='d', cmap=colormap, ax=ax)
    ax.set_title(panel_title)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    ax.set_xticklabels(class_labels)
    ax.set_yticklabels(class_labels)

plt.tight_layout()
plt.show()

In [None]:
# Per-class precision / recall / F1 report for each model on the test split.
class_names = ['Negative', 'Positive']

for heading, predictions in (
    ("Logistic Regression - Classification Report:", lr_test_pred),
    ("\nNaive Bayes - Classification Report:", nb_test_pred),
):
    print(heading)
    print("=" * 50)
    print(classification_report(y_test, predictions, target_names=class_names))

## ROC Curves

In [None]:
# ROC curves of both models on the test split, drawn on one set of axes.
fig, ax = plt.subplots(figsize=(10, 8))

# roc_curve returns (fpr, tpr, thresholds); the thresholds are not needed here.
lr_fpr, lr_tpr, _ = roc_curve(y_test, lr_test_proba)
nb_fpr, nb_tpr, _ = roc_curve(y_test, nb_test_proba)

ax.plot(lr_fpr, lr_tpr, linewidth=2,
        label=f'Logistic Regression (AUC = {lr_auc:.3f})')
ax.plot(nb_fpr, nb_tpr, linewidth=2,
        label=f'Naive Bayes (AUC = {nb_auc:.3f})')

# Diagonal reference line labelled as the random classifier.
ax.plot([0, 1], [0, 1], 'k--', label='Random Classifier', alpha=0.5)

ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curves - Model Comparison')
ax.legend()
ax.grid(True, alpha=0.3)
plt.show()

## Cross-Validation

In [None]:
# 5-fold cross-validated accuracy for both models.
print("Performing 5-fold cross-validation...")

# Cross-validation uses the complete feature matrix and label vector (train and
# test rows together); these names are aliases of X and y, not copies.
X_full, y_full = X, y


def _summarize_cv(model_label, fold_scores):
    """Print the per-fold accuracies, their mean, and twice their standard deviation."""
    print(f"\n{model_label} CV Scores: {fold_scores}")
    print(f"Mean CV Accuracy: {fold_scores.mean():.4f} (+/- {fold_scores.std() * 2:.4f})")


lr_cv_scores = cross_val_score(lr_model, X_full, y_full, cv=5, scoring='accuracy')
_summarize_cv("Logistic Regression", lr_cv_scores)

nb_cv_scores = cross_val_score(nb_model, X_full, y_full, cv=5, scoring='accuracy')
_summarize_cv("Naive Bayes", nb_cv_scores)

## Model Interpretation - Feature Importance

In [None]:
# Rank vocabulary terms by their logistic-regression coefficient.
feature_names = tfidf_vectorizer.get_feature_names_out()

# First (only, for a two-class problem) row of the coefficient matrix.
lr_coef = lr_model.coef_[0]

# Sort once in ascending order: the tail holds the largest coefficients and the
# head the smallest (most negative) ones.
coef_order = np.argsort(lr_coef)
top_positive_idx = coef_order[-10:]  # ascending, so the largest is last
top_negative_idx = coef_order[:10]   # ascending, so the most negative is first

print("Top 10 Positive Sentiment Features (Logistic Regression):")
for idx in top_positive_idx[::-1]:  # largest coefficient first
    print(f"{feature_names[idx]}: {lr_coef[idx]:.4f}")

print("\nTop 10 Negative Sentiment Features (Logistic Regression):")
for idx in top_negative_idx:
    print(f"{feature_names[idx]}: {lr_coef[idx]:.4f}")

In [None]:
# Horizontal bar charts of the strongest positive and negative coefficients.
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# Positive terms are listed from the largest coefficient down; negative terms
# from the most negative up.
pos_order = top_positive_idx[::-1]
pos_features = list(feature_names[pos_order])
pos_coefs = list(lr_coef[pos_order])
neg_features = list(feature_names[top_negative_idx])
neg_coefs = list(lr_coef[top_negative_idx])

feature_panels = [
    (pos_features, pos_coefs, 'green', 'Top Positive Sentiment Features'),
    (neg_features, neg_coefs, 'red', 'Top Negative Sentiment Features'),
]

for ax, (terms, coefs, bar_color, panel_title) in zip(axes, feature_panels):
    positions = range(len(terms))
    ax.barh(positions, coefs, color=bar_color, alpha=0.7)
    ax.set_yticks(positions)
    ax.set_yticklabels(terms)
    ax.set_xlabel('Coefficient Value')
    ax.set_title(panel_title)
    # Flip the y-axis so the first (strongest) term sits at the top.
    ax.invert_yaxis()

plt.tight_layout()
plt.show()

## Save Trained Models

In [None]:
# Persist both fitted models and the collected metrics as pickle files.
models_dir = '../results'
os.makedirs(models_dir, exist_ok=True)


def _pickle_to(filename, obj):
    """Pickle ``obj`` into ``models_dir`` under ``filename``."""
    with open(os.path.join(models_dir, filename), 'wb') as f:
        pickle.dump(obj, f)


# Fitted estimators.
_pickle_to('logistic_regression_model.pkl', lr_model)
_pickle_to('naive_bayes_model.pkl', nb_model)

# Metrics bundle: the comparison table, the winning model's name, and one
# per-model dict sharing the same set of keys.
metric_keys = ('train_acc', 'test_acc', 'precision', 'recall', 'f1', 'auc', 'cv_scores')
results = {
    'comparison_df': comparison_df,
    'best_model': best_model,
    'logistic_regression': dict(zip(metric_keys, (
        lr_train_acc, lr_test_acc, lr_precision, lr_recall,
        lr_f1, lr_auc, lr_cv_scores,
    ))),
    'naive_bayes': dict(zip(metric_keys, (
        nb_train_acc, nb_test_acc, nb_precision, nb_recall,
        nb_f1, nb_auc, nb_cv_scores,
    ))),
}
_pickle_to('model_results.pkl', results)

print("✓ Models and results saved successfully!")
print("✓ Logistic Regression model saved")
print("✓ Naive Bayes model saved")
print("✓ Performance results saved")

In [None]:
# Closing recap: a checklist of the completed steps, then the winning model.
completed_steps = (
    "Logistic Regression model trained and evaluated",
    "Naive Bayes model trained and evaluated",
    "Model comparison performed",
    "Cross-validation completed",
    "Feature importance analysis done",
    "All models and results saved",
)

print("\n=== MODEL TRAINING COMPLETED ===")
for step in completed_steps:
    print(f"✓ {step}")

# The best model and its score both come from the F1-Score column.
print(f"\n🏆 Best performing model: {best_model}")
print(f"🎯 Best F1-Score: {comparison_df['F1-Score'].max():.4f}")
print("\nNext: Visualization and word clouds for bonus features")