In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from pathlib import Path
from sklearn.metrics import (
    confusion_matrix, classification_report,
    roc_auc_score, roc_curve, accuracy_score
)
import warnings

# Paths
# data_dir is where the processed feature/label parquet files are read from;
# models_dir holds the persisted model artifacts and is created if missing.
data_dir = Path("data/processed")
models_dir = Path("models")
models_dir.mkdir(exist_ok=True)

# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the ML libraries.
warnings.filterwarnings("ignore")

# Visualization style
plt.style.use('seaborn-v0_8')

# Machine learning
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_recall_curve, auc as auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    confusion_matrix, classification_report, 
    roc_auc_score, roc_curve, accuracy_score, mean_squared_error, r2_score
)
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LinearRegression
import xgboost as xgb



In [None]:
# CELL Feature Importance & SHAP
#
# Fits a Random Forest on X_train / y_train (both must already exist in the
# kernel), plots its 20 largest impurity-based feature importances, then
# explains the same model with SHAP (bar summary + beeswarm).
import shap
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import seaborn as sns

# Train a quick Random Forest (for feature importance)
rf_model = RandomForestClassifier(n_estimators=200, random_state=42)
rf_model.fit(X_train, y_train)

# Feature importance (tree-based), sorted from most to least important
importances = rf_model.feature_importances_
feature_names = X_train.columns
feat_imp = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
feat_imp = feat_imp.sort_values(by='Importance', ascending=False)

# Plot top 20 features
plt.figure(figsize=(10,6))
sns.barplot(x='Importance', y='Feature', data=feat_imp.head(20), palette='viridis')
plt.title("Top 20 Feature Importances (Random Forest)")
plt.tight_layout()
plt.show()

# SHAP Analysis (explain individual predictions)
# Use TreeExplainer for RandomForest or XGBoost
explainer = shap.TreeExplainer(rf_model)
shap_values = explainer.shap_values(X_train)

# Select the SHAP values for class index 1 regardless of the SHAP version:
#   * older releases return a list with one (n_samples, n_features) array per class;
#   * newer releases return a single (n_samples, n_features, n_classes) array,
#     where a plain `shap_values[1]` would pick the second *sample* instead.
# Assumes a binary problem in which class index 1 is the positive class - TODO confirm.
if isinstance(shap_values, list):
    shap_values_pos = shap_values[1]
elif np.ndim(shap_values) == 3:
    shap_values_pos = shap_values[:, :, 1]
else:
    # Single-output explanation: already (n_samples, n_features).
    shap_values_pos = shap_values

# Summary plot (global feature importance)
shap.summary_plot(shap_values_pos, X_train, plot_type="bar", max_display=20)

# Beeswarm plot (impact of each feature on model output)
shap.summary_plot(shap_values_pos, X_train, plot_type="dot")

print("Cell complete: Feature importance and SHAP analysis generated.")


In [None]:
#Threshold Tuning & Model Calibration
#
# Sweeps the decision threshold of an already-fitted classifier over the
# precision-recall curve, keeps the threshold with the highest F1, and
# re-scores the test predictions at that threshold.
#
# NOTE(review): the threshold is selected on the same test split that is then
# used to report the metrics, so the reported numbers are likely optimistic.
# Consider selecting it on a separate validation split.

from sklearn.metrics import precision_recall_curve, f1_score
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score
import matplotlib.pyplot as plt
import numpy as np

# Use trained Random Forest or XGBoost model
model_to_tune = rf_model  # or xgb_model if trained
# Score of class index 1 for every test row.
y_probs = model_to_tune.predict_proba(X_test)[:, 1]

# Precision/recall at every candidate threshold. Both arrays carry one extra
# trailing point that has no matching entry in `thresholds`.
precision, recall, thresholds = precision_recall_curve(y_test, y_probs)

# F1 per curve point; the small epsilon avoids a 0/0 division.
f1_scores = 2 * (precision * recall) / (precision + recall + 1e-6)

# Threshold that maximises F1
best_idx = np.argmax(f1_scores)
best_threshold = thresholds[best_idx]
print(f"Optimal threshold for max F1: {best_threshold:.3f} (F1={f1_scores[best_idx]:.3f})")

# Precision, recall and F1 as a function of the threshold (trailing point dropped
# so every series lines up with `thresholds`).
fig, ax = plt.subplots(figsize=(8, 5))
curves = (
    (precision, "Precision", 'b'),
    (recall, "Recall", 'r'),
    (f1_scores, "F1 Score", 'g'),
)
for values, label, color in curves:
    ax.plot(thresholds, values[:-1], label=label, color=color)
ax.axvline(x=best_threshold, color='k', linestyle='--', label=f"Best Threshold={best_threshold:.3f}")
ax.set_xlabel("Probability Threshold")
ax.set_ylabel("Score")
ax.set_title("Precision, Recall & F1 vs Threshold")
ax.legend()
ax.grid(True)
plt.show()

# Hard labels at the tuned threshold
y_pred_calibrated = (y_probs >= best_threshold).astype(int)

# Evaluate the re-thresholded predictions
print("Classification Report (Calibrated Threshold):")
print(classification_report(y_test, y_pred_calibrated, target_names=["Dismiss", "Report"]))

cm = confusion_matrix(y_test, y_pred_calibrated)
fig_cm, ax_cm = plt.subplots(figsize=(6, 5))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    cbar=False,
    xticklabels=["Pred Dismiss", "Pred Report"],
    yticklabels=["True Dismiss", "True Report"],
    ax=ax_cm,
)
ax_cm.set_title("Confusion Matrix (Calibrated Threshold)")
plt.show()

print(f"Accuracy: {accuracy_score(y_test, y_pred_calibrated):.3f}")
# ROC AUC is threshold-independent, so it is computed from the raw scores.
print(f"ROC AUC: {roc_auc_score(y_test, y_probs):.3f}")

print("Cell complete: Model calibrated with optimal threshold for AML use case.")


In [None]:
# Load the preprocessed feature matrix and labels from data_dir
# (per the original note, these come from feature_engineering.ipynb).
# Both are read with pd.read_parquet, so `y` is a DataFrame, not a Series.
X = pd.read_parquet(data_dir / "X_features.parquet")
y = pd.read_parquet(data_dir / "y_labels.parquet")

# Report the dimensions of both frames.
for label, frame in (("Features", X), ("Labels", y)):
    print(f"{label} shape: {frame.shape}")

# Preview the first rows of the feature matrix.
X.head()


In [None]:
# Load the persisted models from models_dir
# (per the original note, trained in feature_engineering.ipynb).
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code - only load artifacts from a trusted source.
model_files = {
    "Random Forest": "random_forest_model.joblib",
    "XGBoost": "xgboost_model.joblib",
    "Decision Tree": "decision_tree_model.joblib",
    "Linear Regression": "linear_regression_model.joblib",
}

# Display name -> fitted estimator, loaded in the order listed above.
models = {
    display_name: joblib.load(models_dir / file_name)
    for display_name, file_name in model_files.items()
}

# Individual handles used by other cells.
rf_model = models["Random Forest"]
xgb_model = models["XGBoost"]
dt_model = models["Decision Tree"]
lr_model = models["Linear Regression"]

print("Models loaded successfully!")


In [None]:
# Evaluate models and store metrics
#
# For every entry in `models`, predicts on the full feature matrix `X`, prints
# a classification report, accuracy and ROC AUC against `y`, draws a confusion
# matrix and a ROC curve, and appends the headline metrics to `results`.
#
# NOTE(review): `X` is the full dataset loaded from parquet, so any rows the
# models were trained on are scored too - confirm this is intended.
# NOTE(review): other cells fit Linear Regression on X_train_scaled; if the
# persisted model was trained on scaled features, it needs scaled input here.

# Cut-off used to turn a continuous regression score into a 0/1 label
# (same value the Linear Regression cell applies to its predictions).
LABEL_THRESHOLD = 0.5

results = []

for name, model in models.items():
    raw_pred = model.predict(X)
    if hasattr(model, "predict_proba"):  # Classifiers: predict() already returns labels
        y_pred = raw_pred
        y_prob = model.predict_proba(X)[:, 1]
    else:  # Linear Regression or models without predict_proba
        # predict() returns a continuous score here. The classification metrics
        # below reject continuous targets, so keep the score for ROC AUC and
        # threshold it to obtain hard labels.
        y_prob = np.ravel(raw_pred)
        y_pred = (y_prob > LABEL_THRESHOLD).astype(int)

    acc = accuracy_score(y, y_pred)
    auc = roc_auc_score(y, y_prob)

    print(f"\n{name} Performance:")
    print(classification_report(y, y_pred, target_names=["Dismiss", "Report"]))
    print(f"Accuracy: {acc:.3f}")
    print(f"ROC AUC: {auc:.3f}")

    # Confusion Matrix
    cm = confusion_matrix(y, y_pred)
    plt.figure(figsize=(5,4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False,
                xticklabels=["Pred Dismiss", "Pred Report"],
                yticklabels=["True Dismiss", "True Report"])
    plt.title(f"{name} Confusion Matrix")
    plt.show()

    # ROC Curve (dashed diagonal = chance level)
    fpr, tpr, _ = roc_curve(y, y_prob)
    plt.figure(figsize=(6,5))
    plt.plot(fpr, tpr, label=f"AUC = {auc:.2f}")
    plt.plot([0,1], [0,1], 'k--')
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title(f"{name} ROC Curve")
    plt.legend()
    plt.show()

    results.append({
        "Model": name,
        "Accuracy": acc,
        "ROC_AUC": auc
    })


In [None]:
# Collect the per-model metrics into one table, best ROC AUC first,
# with a fresh 0..n-1 index.
results_df = (
    pd.DataFrame(results)
    .sort_values(by="ROC_AUC", ascending=False)
    .reset_index(drop=True)
)
print("Model Performance Summary:")
results_df


In [None]:
# Bar chart of feature importances for the Random Forest and XGBoost models.
# A model that does not expose `feature_importances_` is skipped.
tree_models = {"Random Forest": rf_model, "XGBoost": xgb_model}

for name, model in tree_models.items():
    if not hasattr(model, "feature_importances_"):
        continue

    # One row per column of X, most important first.
    fi = pd.DataFrame(
        {"Feature": X.columns, "Importance": model.feature_importances_}
    )
    fi = fi.sort_values(by="Importance", ascending=False)

    plt.figure(figsize=(8,6))
    sns.barplot(x="Importance", y="Feature", data=fi)
    plt.title(f"{name} Feature Importance")
    plt.show()


In [None]:
# Random Forest
# Fits a 100-tree forest on the training split and scores it on the test split.
# Note: this rebinds `rf_model`, replacing any model previously held under that name.
rf_model = RandomForestClassifier(random_state=42, n_estimators=100)
rf_model.fit(X_train, y_train)

# Hard labels and class-index-1 scores for the test rows.
y_pred_rf = rf_model.predict(X_test)
y_prob_rf = rf_model.predict_proba(X_test)[:, 1]

print("Random Forest Classification Report:")
print(classification_report(y_test, y_pred_rf))
rf_accuracy = accuracy_score(y_test, y_pred_rf)
print("Accuracy:", rf_accuracy)
rf_roc_auc = roc_auc_score(y_test, y_prob_rf)
print("ROC AUC:", rf_roc_auc)

# Confusion Matrix (drawn on the current axes)
cm_rf = confusion_matrix(y_test, y_pred_rf)
rf_heatmap_ax = sns.heatmap(
    cm_rf,
    annot=True,
    fmt='d',
    cmap='Blues',
    cbar=False,
    xticklabels=["Pred Dismiss", "Pred Report"],
    yticklabels=["True Dismiss", "True Report"],
)
rf_heatmap_ax.set_title("Random Forest Confusion Matrix")
plt.show()


In [None]:
# --- XGBoost Classifier ---
#
# Fits an XGBoost classifier on the training split and reports the
# classification report, accuracy, ROC AUC and confusion matrix on the test split.
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score
import matplotlib.pyplot as plt

# Use the scaled matrices only when BOTH exist in the kernel; otherwise fall
# back to the unscaled pair. Deciding the two together guarantees the model is
# never trained on scaled features and evaluated on unscaled ones (or vice versa).
_kernel_names = globals()
if 'X_train_scaled' in _kernel_names and 'X_test_scaled' in _kernel_names:
    X_train_final, X_test_final = X_train_scaled, X_test_scaled
else:
    X_train_final, X_test_final = X_train, X_test

# `use_label_encoder` is intentionally not passed: it is deprecated and no
# longer an XGBClassifier parameter in current XGBoost releases.
xgb_model = XGBClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=8,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    eval_metric="logloss"
)

xgb_model.fit(X_train_final, y_train)

# Predictions: hard labels and class-index-1 probabilities
y_pred_xgb = xgb_model.predict(X_test_final)
y_prob_xgb = xgb_model.predict_proba(X_test_final)[:, 1]

# Evaluation
print("XGBoost Classification Report:")
print(classification_report(y_test, y_pred_xgb, target_names=["Dismiss", "Report"]))
print(f"Accuracy: {accuracy_score(y_test, y_pred_xgb):.3f}")
print(f"ROC AUC: {roc_auc_score(y_test, y_prob_xgb):.3f}")

# Confusion Matrix
cm_xgb = confusion_matrix(y_test, y_pred_xgb)
plt.figure(figsize=(6, 5))
sns.heatmap(cm_xgb, annot=True, fmt='d', cmap='Greens', cbar=False,
            xticklabels=["Pred Dismiss", "Pred Report"],
            yticklabels=["True Dismiss", "True Report"])
plt.title("XGBoost Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.show()


In [None]:
# Decision Tree
# Fits a single decision tree on the training split and scores it on the test split.
# Note: this rebinds `dt_model`, replacing any model previously held under that name.
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)

# Hard labels and class-index-1 scores for the test rows.
y_pred_dt = dt_model.predict(X_test)
y_prob_dt = dt_model.predict_proba(X_test)[:, 1]

print("Decision Tree Classification Report:")
print(classification_report(y_test, y_pred_dt))
dt_accuracy = accuracy_score(y_test, y_pred_dt)
print("Accuracy:", dt_accuracy)
dt_roc_auc = roc_auc_score(y_test, y_prob_dt)
print("ROC AUC:", dt_roc_auc)

# Confusion Matrix (drawn on the current axes)
cm_dt = confusion_matrix(y_test, y_pred_dt)
dt_heatmap_ax = sns.heatmap(
    cm_dt,
    annot=True,
    fmt='d',
    cmap='Oranges',
    cbar=False,
    xticklabels=["Pred Dismiss", "Pred Report"],
    yticklabels=["True Dismiss", "True Report"],
)
dt_heatmap_ax.set_title("Decision Tree Confusion Matrix")
plt.show()


In [None]:
# Linear Regression used as a scorer for the "Report" class.
# Fits ordinary least squares on the scaled training features. The output is a
# continuous regression score, not a probability, so it is not limited to [0, 1].
# Note: this rebinds `lr_model`, replacing any model previously held under that name.
lr_model = LinearRegression()
lr_model.fit(X_train_scaled, y_train)

# Continuous scores for the test rows, then hard labels via a 0.5 cut-off.
y_pred_lr = lr_model.predict(X_test_scaled)
y_pred_lr_label = (y_pred_lr > 0.5).astype(int)

print("Linear Regression Evaluation:")
print(classification_report(y_test, y_pred_lr_label))
lr_accuracy = accuracy_score(y_test, y_pred_lr_label)
print("Accuracy:", lr_accuracy)
# ROC AUC only needs a ranking, so it is computed from the raw scores.
lr_roc_auc = roc_auc_score(y_test, y_pred_lr)
print("ROC AUC:", lr_roc_auc)

# Confusion Matrix (drawn on the current axes)
cm_lr = confusion_matrix(y_test, y_pred_lr_label)
lr_heatmap_ax = sns.heatmap(
    cm_lr,
    annot=True,
    fmt='d',
    cmap='Purples',
    cbar=False,
    xticklabels=["Pred Dismiss", "Pred Report"],
    yticklabels=["True Dismiss", "True Report"],
)
lr_heatmap_ax.set_title("Linear Regression Confusion Matrix")
plt.show()
