In [1]:
"""
Model Training and Evaluation Notebook
Train and evaluate the spam detection model
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
from sklearn.model_selection import cross_val_score, learning_curve
import joblib
import sys
from pathlib import Path

# Add src to path.
# Path().parent is still "." (the parent of a relative "." is "."), so the
# previous expression only ever pointed at ./src. Anchor on the absolute
# working directory so the sibling ../src is found; ./src is kept as a
# fallback for runs started from the directory that contains src.
_here = Path.cwd()
sys.path.extend(str(base / "src") for base in (_here.parent, _here))

from data_preprocessing import SMSPreprocessor
from feature_engineering import FeatureEngineer
from model_training import SpamClassifier
import config

# Load and preprocess the file configured as config.SPAM_DATA_FILE
preprocessor = SMSPreprocessor()
df = preprocessor.load_and_preprocess_data(config.SPAM_DATA_FILE)

# Split data; split_data returns the four partitions in this order
X_train, X_test, y_train, y_test = preprocessor.split_data(df)

# Report partition sizes and label means in one call; sep="\n" keeps one
# statistic per line.
# NOTE(review): the mean is only a "spam ratio" if labels are 0/1 with
# 1 = spam — confirm against SMSPreprocessor.
print(
    f"Training set size: {len(X_train)}",
    f"Test set size: {len(X_test)}",
    f"Training spam ratio: {y_train.mean():.3f}",
    f"Test spam ratio: {y_test.mean():.3f}",
    sep="\n",
)

# Feature engineering: fit on the training texts only, then apply the
# already-fitted transformer to the test texts
feature_engineer = FeatureEngineer()
X_train_vec = feature_engineer.fit_transform(X_train)
X_test_vec = feature_engineer.transform(X_test)

# Summarise the training feature matrix (second shape entry = feature count)
print(
    f"\nFeature matrix shape: {X_train_vec.shape}",
    f"Number of features: {X_train_vec.shape[1]}",
    sep="\n",
)

# Train model on the vectorised training split
classifier = SpamClassifier()
classifier.train(X_train_vec, y_train)

# Evaluate model on the held-out split; the result is read below through
# its 'accuracy' and 'predictions' keys
results = classifier.evaluate(X_test_vec, y_test)

# Headline accuracy followed by the per-class report, one item per line
print(
    "\nModel Performance:",
    f"Accuracy: {results['accuracy']:.4f}",
    "\nClassification Report:",
    classification_report(y_test, results['predictions']),
    sep="\n",
)

# Visualizations: a 2x2 grid of axes shared by the four plots below
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(15, 12))

# 1. Confusion Matrix (top-left panel), annotated with integer counts
cm = results['confusion_matrix']
cm_ax = axes[0, 0]
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=cm_ax)
cm_ax.set(title='Confusion Matrix', xlabel='Predicted', ylabel='Actual')

# 2. ROC Curve (top-right panel)
roc_ax = axes[0, 1]

# Column 1 of predict_proba is used as the positive-class score
# NOTE(review): presumably column 1 is the spam class — confirm label encoding.
y_proba = classifier.predict_proba(X_test_vec)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_proba)
roc_auc = auc(fpr, tpr)

# Model curve plus the dashed diagonal as the chance-level reference
roc_ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
roc_ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
roc_ax.set(
    xlim=(0.0, 1.0),
    ylim=(0.0, 1.05),
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
    title='ROC Curve',
)
roc_ax.legend(loc="lower right")

# 3. Feature Importance (up to the top 20 features, bottom-left panel)
# Importance is the absolute gap between the two classes' per-feature log
# probabilities.
# NOTE(review): assumes classifier.model exposes feature_log_prob_ with
# row 0 / row 1 being the two classes (as naive Bayes estimators do) —
# confirm against SpamClassifier.
feature_names = feature_engineer.get_feature_names()
feature_importance = np.abs(classifier.model.feature_log_prob_[1] - classifier.model.feature_log_prob_[0])

# argsort is ascending, so the tail holds the largest gaps. The slice simply
# yields fewer entries when there are fewer than 20 features, so everything
# below is sized from top_indices instead of a hard-coded 20 (which made
# barh and the tick labels fail on small vocabularies).
top_indices = np.argsort(feature_importance)[-20:]
bar_positions = range(len(top_indices))

importance_ax = axes[1, 0]
importance_ax.barh(bar_positions, feature_importance[top_indices])
importance_ax.set_yticks(bar_positions)
importance_ax.set_yticklabels([feature_names[i] for i in top_indices])
importance_ax.set_xlabel('Feature Importance')
importance_ax.set_title(f'Top {len(top_indices)} Most Important Features')

# 4. Cross-validation scores (bottom-right panel)
cv_scores = cross_val_score(classifier.model, X_train_vec, y_train, cv=5)

# Number the folds from the scores actually returned, so the bar positions
# cannot drift from the cv setting above.
fold_numbers = range(1, len(cv_scores) + 1)

cv_ax = axes[1, 1]
cv_ax.bar(fold_numbers, cv_scores)
cv_ax.set_xlabel('Fold')
cv_ax.set_ylabel('Accuracy')
cv_ax.set_title(f'Cross-Validation Scores (Mean: {cv_scores.mean():.3f})')

# Keep the zoomed 0.9-1.0 window while every fold scores above 0.9;
# otherwise lower the floor so a weak fold is drawn instead of being
# clipped out of view.
lowest_score = float(cv_scores.min())
y_floor = 0.9 if lowest_score > 0.9 else max(0.0, lowest_score - 0.05)
cv_ax.set_ylim([y_floor, 1.0])

# Finalise and render the 2x2 figure
plt.tight_layout()
plt.show()

# Learning curve: 5-fold scores at ten evenly spaced training-set fractions
# between 10% and 100%
train_sizes, train_scores, val_scores = learning_curve(
    classifier.model,
    X_train_vec,
    y_train,
    cv=5,
    train_sizes=np.linspace(0.1, 1.0, 10),
)

# Average across axis 1 to get one value per training size
mean_train_scores = train_scores.mean(axis=1)
mean_val_scores = val_scores.mean(axis=1)

plt.figure(figsize=(10, 6))
for curve, curve_label in (
    (mean_train_scores, 'Training score'),
    (mean_val_scores, 'Validation score'),
):
    plt.plot(train_sizes, curve, 'o-', label=curve_label)
plt.xlabel('Training Set Size')
plt.ylabel('Accuracy')
plt.title('Learning Curve')
plt.legend()
plt.grid(True)
plt.show()

# Error analysis
print("\nError Analysis:")
print("=" * 50)

# Get misclassified examples.
# Predictions are re-wrapped as a Series that shares y_test's index: a bare
# ndarray has no .head() (used below), and a Series carrying a different
# index would not compare element-wise against y_test. Values are matched to
# y_test by position.
y_pred = pd.Series(np.asarray(results['predictions']), index=y_test.index)
is_wrong = y_test != y_pred
misclassified = X_test[is_wrong]
misclassified_labels = y_test[is_wrong]
misclassified_predictions = y_pred[is_wrong]

# NOTE(review): assumes labels are encoded 0 = ham, 1 = spam, as the
# messages below state — confirm against the preprocessor.
false_positives = int(((misclassified_labels == 0) & (misclassified_predictions == 1)).sum())
false_negatives = int(((misclassified_labels == 1) & (misclassified_predictions == 0)).sum())

print(f"Total misclassified: {len(misclassified)}")
print(f"False positives (Ham predicted as Spam): {false_positives}")
print(f"False negatives (Spam predicted as Ham): {false_negatives}")

# Show some misclassified examples (at most the first five)
print("\nSome misclassified examples:")
label_map = {0: 'Ham', 1: 'Spam'}
first_errors = zip(
    misclassified.head(5),
    misclassified_labels.head(5),
    misclassified_predictions.head(5),
)
for position, (text, true_label, pred_label) in enumerate(first_errors, start=1):
    print(f"\n{position}. Text: {text}")
    print(f"   True: {label_map[true_label]}, Predicted: {label_map[pred_label]}")

ModuleNotFoundError: No module named 'data_preprocessing'