# 12 — Model Evaluation & Testing
**Author:** Ebenezer Adjartey

Covers: train/test split, k-fold CV, confusion matrix metrics, ROC/AUC, regression metrics, hyperparameter tuning, SHAP values, learning curves.

In [None]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.datasets import make_classification, make_regression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.metrics import (confusion_matrix, ConfusionMatrixDisplay,
                               classification_report, accuracy_score,
                               precision_score, recall_score, f1_score,
                               roc_curve, roc_auc_score,
                               mean_squared_error, mean_absolute_error, r2_score)
from sklearn.model_selection import (train_test_split, KFold, StratifiedKFold,
                                     cross_val_score, GridSearchCV, RandomizedSearchCV,
                                     learning_curve)
from sklearn.preprocessing import StandardScaler
# Notebook-wide configuration.
# NOTE: the blanket 'ignore' filter hides every warning, deprecation notices included.
warnings.filterwarnings('ignore')
sns.set_theme(style='whitegrid')  # seaborn whitegrid styling for the figures
np.random.seed(42)                # seed NumPy's global RNG
print('Libraries loaded.')

## 1. Train/Test Split & Cross-Validation

In [None]:
# Synthetic binary-classification data: 800 samples, 15 features, 8 of them informative.
X, y = make_classification(
    n_samples=800,
    n_features=15,
    n_informative=8,
    random_state=42,
)

# Hold out 20% of the rows for testing; stratify=y keeps the class ratio in both splits.
X_tr, X_te, y_tr, y_te = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
print(f'Train: {X_tr.shape[0]}  Test: {X_te.shape[0]}')
print(f'Class balance (train): {np.bincount(y_tr)}')

# 5-fold stratified cross-validation over the full dataset, scored with ROC AUC.
clf = RandomForestClassifier(n_estimators=100, random_state=42)
cv_scores = cross_val_score(
    clf, X, y,
    cv=StratifiedKFold(n_splits=5),
    scoring='roc_auc',
)
print(f'\n5-fold CV AUC scores: {cv_scores.round(4)}')
# The reported spread is two standard deviations of the fold scores.
print(f'Mean AUC: {cv_scores.mean():.4f} (+/- {cv_scores.std()*2:.4f})')

## 2. Confusion Matrix & Classification Metrics

In [None]:
# Fit on the training split, then predict labels and positive-class probabilities
# for the held-out test split.
clf.fit(X_tr, y_tr)
y_pred = clf.predict(X_te)
y_prob = clf.predict_proba(X_te)[:, 1]

# Unpack the flattened 2x2 confusion matrix into its four cells.
cm = confusion_matrix(y_te, y_pred)
tn, fp, fn, tp = cm.ravel()

print('Confusion Matrix:')
print(cm)
print(f'\nTN={tn}  FP={fp}  FN={fn}  TP={tp}')

# Build the metric report as a list of lines and emit it in one go.
metric_lines = [
    f'Accuracy:  {accuracy_score(y_te,y_pred):.4f}',
    f'Precision: {precision_score(y_te,y_pred):.4f}  (TP/(TP+FP))',
    f'Recall:    {recall_score(y_te,y_pred):.4f}   (TP/(TP+FN))',
    f'F1-Score:  {f1_score(y_te,y_pred):.4f}   (harmonic mean of P and R)',
    f'Specificity: {tn/(tn+fp):.4f}  (TN/(TN+FP))',
    f'AUC:       {roc_auc_score(y_te,y_prob):.4f}',
]
print('\n'.join(metric_lines))

print('\nFull Report:')
print(classification_report(y_te, y_pred))

## 3. ROC Curve

In [None]:
# Compare several classifiers on the same train/test split by their ROC curves.
models_eval = {
    'Random Forest':   RandomForestClassifier(n_estimators=100, random_state=42),
    'Gradient Boost':  GradientBoostingClassifier(n_estimators=100, random_state=42),
    'Logistic Reg':    LogisticRegression(max_iter=1000),
}

fig, ax = plt.subplots(figsize=(7, 6))
for name, m in models_eval.items():
    m.fit(X_tr, y_tr)
    probs = m.predict_proba(X_te)[:, 1]  # positive-class probability
    fpr, tpr, _ = roc_curve(y_te, probs)
    auc = roc_auc_score(y_te, probs)
    ax.plot(fpr, tpr, lw=2, label=f'{name} (AUC={auc:.3f})')

# Diagonal reference line: a classifier that guesses at random.
ax.plot([0, 1], [0, 1], 'k--', label='Random (AUC=0.5)')
ax.set_title('ROC Curves')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.legend(fontsize=9)
plt.tight_layout()

# Create the output folder before saving. This needs `os`, which must be
# imported in the notebook's import cell (it was missing, causing a NameError).
out_dir = '12_model_evaluation'
os.makedirs(out_dir, exist_ok=True)
plt.savefig(os.path.join(out_dir, 'roc_curves.png'), dpi=100, bbox_inches='tight')
plt.show(); print('Saved.')

## 4. Regression Metrics

In [None]:
# Synthetic regression problem: 500 samples, 10 features, Gaussian noise (std 30).
X_reg, y_reg = make_regression(n_samples=500, n_features=10, noise=30, random_state=42)
Xr_tr, Xr_te, yr_tr, yr_te = train_test_split(X_reg, y_reg, test_size=0.2, random_state=42)

ridge = Ridge(alpha=1.0)
ridge.fit(Xr_tr, yr_tr)
yr_pred = ridge.predict(Xr_te)

mae  = mean_absolute_error(yr_te, yr_pred)
# Take the square root explicitly instead of passing `squared=False`:
# that keyword was deprecated and later removed from mean_squared_error,
# so the explicit form runs on both old and new scikit-learn releases.
rmse = np.sqrt(mean_squared_error(yr_te, yr_pred))
r2   = r2_score(yr_te, yr_pred)
# Guard the denominator by magnitude. Adding epsilon to the signed target
# (yr_te + 1e-8) does not prevent division by zero for negative targets.
# MAPE stays unstable whenever true values are close to zero, so read it with care.
mape = np.mean(np.abs(yr_te - yr_pred) / np.maximum(np.abs(yr_te), 1e-8)) * 100

print(f'MAE:  {mae:.3f}   (mean absolute error)')
print(f'RMSE: {rmse:.3f}  (root mean squared error; penalizes large errors)')
print(f'R2:   {r2:.4f}  (proportion of variance explained)')
print(f'MAPE: {mape:.2f}%  (mean absolute percentage error)')

## 5. Hyperparameter Tuning

In [None]:
from scipy.stats import randint

# --- Exhaustive search: every combination in the grid is cross-validated ---
param_grid = dict(
    n_estimators=[50, 100],
    max_depth=[3, 5, 10],
    min_samples_leaf=[1, 5],
)
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    scoring='roc_auc',
    cv=3,
    n_jobs=-1,
)
grid_search.fit(X_tr, y_tr)
print('GridSearchCV best params:', grid_search.best_params_)
print(f'Best CV AUC: {grid_search.best_score_:.4f}')

# --- Randomized search: draws n_iter candidates from the distributions ---
# (faster than a full grid when the search space is large)
param_dist = dict(
    n_estimators=randint(50, 200),
    max_depth=randint(3, 15),
)
rand_search = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_distributions=param_dist,
    n_iter=10,
    scoring='roc_auc',
    cv=3,
    random_state=42,
)
rand_search.fit(X_tr, y_tr)
print('\nRandomizedSearchCV best params:', rand_search.best_params_)
print(f'Best CV AUC: {rand_search.best_score_:.4f}')

## 6. Feature Importance & SHAP Values

In [None]:
# Impurity-based feature importances from the best random forest found by the grid search.
best_rf = grid_search.best_estimator_
feature_names = [f'X{i}' for i in range(X.shape[1])]
fi_df = pd.DataFrame({'Feature': feature_names,
                      'Importance': best_rf.feature_importances_})
fi_df = fi_df.sort_values('Importance', ascending=False)
print('Feature Importances:')
print(fi_df.round(4).to_string(index=False))

# SHAP values (optional dependency; skipped with a message if not installed)
try:
    import shap
    X_shap = X_te[:100]  # explain only the first 100 test rows
    explainer = shap.TreeExplainer(best_rf)
    shap_values = explainer.shap_values(X_shap)
    print('\nSHAP values computed (shape):', np.array(shap_values).shape)

    # The return layout depends on the SHAP version:
    #   - older releases: a list with one (n_samples, n_features) array per class
    #   - newer releases: one (n_samples, n_features, n_classes) array
    # Indexing the newer layout with [1] would select the second *sample*,
    # so pick the positive class explicitly for each layout.
    if isinstance(shap_values, list):
        shap_pos = shap_values[1]
    else:
        shap_arr = np.asarray(shap_values)
        shap_pos = shap_arr[:, :, 1] if shap_arr.ndim == 3 else shap_arr

    # SHAP summary plot for the positive class
    shap.summary_plot(shap_pos, X_shap,
                      feature_names=feature_names,
                      show=False)
    plt.tight_layout()
    plt.savefig('12_model_evaluation/shap_summary.png', dpi=100, bbox_inches='tight')
    plt.show(); print('SHAP plot saved.')
except ImportError:
    print('SHAP not installed (pip install shap)')

## 7. Learning Curves

In [None]:
# Cross-validated AUC as a function of training-set size (10 sizes from 10% to 100%).
train_sizes, train_scores, val_scores = learning_curve(
    RandomForestClassifier(n_estimators=50, random_state=42),
    X, y, cv=5, scoring='roc_auc',
    train_sizes=np.linspace(0.1, 1.0, 10), n_jobs=-1
)

# Aggregate across the CV folds (axis 1) for each training size.
train_mean = train_scores.mean(axis=1)
train_std  = train_scores.std(axis=1)
val_mean   = val_scores.mean(axis=1)
val_std    = val_scores.std(axis=1)

fig, axes = plt.subplots(1, 2, figsize=(13, 5))

# Left panel: learning curve with +/- one standard deviation bands
axes[0].plot(train_sizes, train_mean, 'b-o', label='Training AUC')
axes[0].fill_between(train_sizes, train_mean-train_std, train_mean+train_std, alpha=.1, color='b')
axes[0].plot(train_sizes, val_mean, 'r-o', label='Validation AUC')
axes[0].fill_between(train_sizes, val_mean-val_std, val_mean+val_std, alpha=.1, color='r')
axes[0].set_title('Learning Curve')
axes[0].set_xlabel('Training Size'); axes[0].set_ylabel('AUC')
axes[0].legend()

# Right panel: confusion matrix heatmap for the tuned forest.
# best_rf is GridSearchCV's best_estimator_, which the search already fitted on
# X_tr/y_tr, so it is used as-is rather than refitted (and mutated) here.
# A separate name keeps the section-2 matrix `cm` intact.
cm_best = confusion_matrix(y_te, best_rf.predict(X_te))
auc_best = roc_auc_score(y_te, best_rf.predict_proba(X_te)[:, 1])
sns.heatmap(cm_best, annot=True, fmt='d', cmap='Blues', ax=axes[1])
axes[1].set_title(f'Confusion Matrix (AUC={auc_best:.3f})')
axes[1].set_xlabel('Predicted'); axes[1].set_ylabel('Actual')

plt.tight_layout()
plt.savefig('12_model_evaluation/learning_curves.png', dpi=100, bbox_inches='tight')
plt.show(); print('Saved.')

## Key Takeaways

- **Cross-validation** gives a more honest estimate of generalization than a single train/test split
- **Precision vs Recall trade-off**: choose threshold based on the cost of FP vs FN
- **AUC**: threshold-independent measure of discrimination ability
- **GridSearchCV**: exhaustive search over every parameter combination; **RandomizedSearchCV** samples a fixed number of candidates and is faster for large spaces
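- **SHAP**: interpretability framework that shows feature impact per prediction; the framework is model-agnostic, though the `TreeExplainer` used here is specific to tree-based models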
- **Learning curves**: diagnose overfitting (high train, low val) vs underfitting (both low)
