In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from pathlib import Path
from sklearn.metrics import (
    confusion_matrix, classification_report,
    roc_auc_score, roc_curve, accuracy_score
)
import warnings
warnings.filterwarnings("ignore")

# Filesystem layout used by the rest of the notebook:
#   data_dir   - parquet files holding the engineered features and labels
#   models_dir - serialized estimators (*.joblib)
models_dir = Path("models")
data_dir = Path("data/processed")
models_dir.mkdir(exist_ok=True)  # no-op when the directory is already present

# Apply one matplotlib style sheet to every figure drawn below
plt.style.use('seaborn-v0_8')


In [None]:
# Load preprocessed feature dataset (from feature_engineering.ipynb)
X = pd.read_parquet(data_dir / "X_features.parquet")
y = pd.read_parquet(data_dir / "y_labels.parquet")

# Features and labels come from two separate files; fail fast here if they
# are out of sync instead of hitting an opaque error inside a metric call
# further down the notebook.
if len(X) != len(y):
    raise ValueError(
        f"Feature/label row mismatch: X has {len(X)} rows, y has {len(y)} rows"
    )

print(f"Features shape: {X.shape}")
print(f"Labels shape: {y.shape}")

# Last expression of the cell: rich preview of the first feature rows
X.head()


In [None]:
# Load trained models from feature_engineering.ipynb
# Display label -> serialized estimator file inside models_dir.
# Insertion order is the order in which the models are loaded and evaluated.
model_files = {
    "Random Forest": "random_forest_model.joblib",
    "XGBoost": "xgboost_model.joblib",
    "Decision Tree": "decision_tree_model.joblib",
    "Linear Regression": "linear_regression_model.joblib",
}

# NOTE: joblib.load unpickles the file, so only load artifacts from a trusted source.
models = {
    label: joblib.load(models_dir / file_name)
    for label, file_name in model_files.items()
}

# Individual handles are kept because later cells refer to them by name
rf_model = models["Random Forest"]
xgb_model = models["XGBoost"]
dt_model = models["Decision Tree"]
lr_model = models["Linear Regression"]

print("Models loaded successfully!")


In [None]:
# Evaluate models and store metrics
def _predict_labels_and_scores(model, features, threshold=0.5):
    """Return ``(labels, scores)`` for a fitted binary model.

    Parameters
    ----------
    model : fitted estimator exposing ``predict`` and optionally ``predict_proba``.
    features : feature matrix passed unchanged to the estimator.
    threshold : cut-off used to turn continuous outputs into 0/1 labels when
        the estimator has no ``predict_proba``.

    Returns
    -------
    labels : hard class predictions, usable by accuracy / confusion matrix.
    scores : continuous ranking scores, usable by ROC / AUC.

    Estimators with ``predict_proba`` use ``predict`` for the labels and the
    second probability column as the score. Estimators without it (e.g. a
    plain regressor) return continuous values from ``predict``; those values
    are kept as the score and thresholded to obtain labels, because the
    classification metrics reject continuous predictions. Thresholding leaves
    predictions that are already 0/1 unchanged.
    Assumes labels are encoded as 0/1 -- TODO confirm against the label file.
    """
    raw_pred = model.predict(features)
    if hasattr(model, "predict_proba"):
        return raw_pred, model.predict_proba(features)[:, 1]
    scores = np.asarray(raw_pred, dtype=float).ravel()
    labels = (scores >= threshold).astype(int)
    return labels, scores


# y is read from parquet as a DataFrame; flatten it once so every metric
# receives a plain 1-D label vector.
y_true = np.ravel(y)

# NOTE(review): the metrics below are computed on the whole of X. If the
# models were fitted on any of these rows the numbers are optimistic --
# confirm that X is a held-out set.
results = []

for name, model in models.items():
    y_pred, y_prob = _predict_labels_and_scores(model, X)

    acc = accuracy_score(y_true, y_pred)
    auc = roc_auc_score(y_true, y_prob)

    print(f"\n{name} Performance:")
    print(classification_report(y_true, y_pred, target_names=["Dismiss", "Report"]))
    print(f"Accuracy: {acc:.3f}")
    print(f"ROC AUC: {auc:.3f}")

    # Confusion Matrix
    cm = confusion_matrix(y_true, y_pred)
    fig, ax = plt.subplots(figsize=(5, 4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False,
                xticklabels=["Pred Dismiss", "Pred Report"],
                yticklabels=["True Dismiss", "True Report"],
                ax=ax)
    ax.set_title(f"{name} Confusion Matrix")
    plt.show()
    plt.close(fig)  # release the figure; several are created per model

    # ROC Curve
    fpr, tpr, _ = roc_curve(y_true, y_prob)
    fig, ax = plt.subplots(figsize=(6, 5))
    ax.plot(fpr, tpr, label=f"AUC = {auc:.2f}")
    ax.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal
    ax.set_xlabel("False Positive Rate")
    ax.set_ylabel("True Positive Rate")
    ax.set_title(f"{name} ROC Curve")
    ax.legend()
    plt.show()
    plt.close(fig)

    # One record per model; turned into a DataFrame in the summary cell
    results.append({
        "Model": name,
        "Accuracy": acc,
        "ROC_AUC": auc
    })


In [None]:
# Summarize all model metrics
# One row per evaluated model, best ROC AUC first, with a fresh 0..n-1 index
results_df = (
    pd.DataFrame(results)
    .sort_values(by="ROC_AUC", ascending=False)
    .reset_index(drop=True)
)
print("Model Performance Summary:")
results_df


In [None]:
# Feature importance for Random Forest and XGBoost
tree_ensembles = {"Random Forest": rf_model, "XGBoost": xgb_model}

for name, model in tree_ensembles.items():
    # Nothing to plot for an estimator that does not expose importances
    if not hasattr(model, "feature_importances_"):
        continue

    # Pair each feature column with its importance, largest first
    fi = pd.DataFrame({
        "Feature": X.columns,
        "Importance": model.feature_importances_,
    })
    fi = fi.sort_values(by="Importance", ascending=False)

    # Horizontal bar chart, one bar per feature
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.barplot(x="Importance", y="Feature", data=fi, ax=ax)
    ax.set_title(f"{name} Feature Importance")
    plt.show()
