# Cross-Validation and Overfitting Mitigation Techniques

## Objectives
1. Implement K-fold cross-validation for robust performance estimates
2. Apply regularization techniques to prevent overfitting
3. Use learning curves to diagnose model behavior
4. Implement early stopping for gradient boosting models
5. Compare performance across different mitigation strategies

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import (
    train_test_split, cross_val_score, StratifiedKFold,
    learning_curve, validation_curve
)
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import xgboost as xgb
from sklearn.metrics import (
    roc_auc_score, average_precision_score, make_scorer,
    precision_score, recall_score, f1_score
)
import warnings
# Reproducibility: a single seed shared by NumPy and every estimator below.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# NOTE(review): this silences every warning category in this process,
# including estimator convergence warnings - consider narrowing the filter.
warnings.filterwarnings(action='ignore')

# Plot defaults used by all figures in the notebook.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

## 1. Load and Prepare Data (Corrected Features - No Leakage)

In [3]:
# Read the raw table from disk.
df = pd.read_csv('data/clinical_genotype_HGB.csv')

# Columns that must never reach the feature matrix (the target itself is
# listed here too, so it cannot end up in X).
exclude_features = [
    'wihsid', 'bsdate', 'bsvisit', 'dob', 'date',
    'lnegdate', 'fposdate', 'frstartd', 'frstaidd', 'frstdthd',
    'undetectable', 'HIV', 'r',
    'vload', 'logvl', 'vla', 'cd8a',  # Leaking features removed
    'status', 'n', 'N', 'visit'
]

# Preserve the original column order; the set only speeds up the lookup.
_excluded = set(exclude_features)
feature_cols = [col for col in df.columns if col not in _excluded]

# Keep only the rows whose target value is present, then build X / y from
# exactly those rows.
mask = df['undetectable'].notna()
X = df.loc[mask, feature_cols].copy()
y = df.loc[mask, 'undetectable'].astype(int)

print(f"Dataset shape: {X.shape}")
print(f"Class distribution: {np.bincount(y)}")

Dataset shape: (33011, 27)
Class distribution: [21947 11064]


In [4]:
# Integer-encode every object-typed column. Missing values are stringified to
# 'nan' by astype(str) and then mapped to an explicit 'MISSING' category.
categorical_features = list(X.select_dtypes(include=['object']).columns)
for col in categorical_features:
    as_text = X[col].astype(str).replace('nan', 'MISSING')
    le = LabelEncoder()
    X[col] = le.fit_transform(as_text)

# NOTE(review): the imputer and scaler below are fitted on the full dataset,
# before any train/test split or CV fold is drawn, so validation rows
# contribute to the medians/means/variances. Consider moving both steps into
# a Pipeline that is passed to the CV helpers.

# Fill remaining gaps with the per-column median.
imputer = SimpleImputer(strategy='median').fit(X)
X_imputed = imputer.transform(X)

# Standardize every column to zero mean / unit variance.
scaler = StandardScaler().fit(X_imputed)
X_scaled = scaler.transform(X_imputed)

print(f"Data preprocessed: {X_scaled.shape}")

Data preprocessed: (33011, 27)


## 2. K-Fold Cross-Validation

Cross-validation provides more robust performance estimates by:
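- Using every sample for both training and validation (each sample is held out exactly once)
- Reducing the variance of the performance estimate by averaging over several folds
- Exposing overfitting through the gap between training and validation scores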

In [5]:
# Shared splitter: 5 stratified folds, shuffled with the notebook seed so the
# fold assignment is reproducible.
cv = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=RANDOM_STATE,
)

banner = "=" * 60
print("Performing 5-Fold Stratified Cross-Validation...")
print(banner)

Performing 5-Fold Stratified Cross-Validation...


In [6]:
# Candidate models, built family by family. Within a family the entries
# differ only in the regularization knob that appears in the dictionary key
# (C for logistic regression / SVM, max_depth for the tree ensembles).

# Negative-to-positive class ratio, passed to XGBoost as scale_pos_weight.
pos_weight = (y == 0).sum() / (y == 1).sum()

models = {}

for c_value in (1.0, 0.1, 0.01):
    models[f'Logistic Regression (C={c_value})'] = LogisticRegression(
        C=c_value, max_iter=1000, class_weight='balanced', random_state=RANDOM_STATE
    )

for depth in (None, 10, 5):
    models[f'Random Forest (depth={depth})'] = RandomForestClassifier(
        n_estimators=100, max_depth=depth, class_weight='balanced',
        random_state=RANDOM_STATE, n_jobs=-1
    )

for depth in (6, 3):
    models[f'XGBoost (depth={depth})'] = xgb.XGBClassifier(
        n_estimators=100, max_depth=depth, learning_rate=0.1,
        scale_pos_weight=pos_weight,
        random_state=RANDOM_STATE, eval_metric='logloss'
    )

for c_value in (1.0, 0.1):
    # probability=True makes predict_proba available on the fitted SVC.
    models[f'SVM (C={c_value})'] = SVC(
        C=c_value, kernel='rbf', class_weight='balanced',
        probability=True, random_state=RANDOM_STATE
    )

In [7]:
# Score every candidate with the shared splitter and keep one summary record
# (plus the raw per-fold scores) per model.
cv_results = []

for name, model in models.items():
    print(f"\nEvaluating {name}...")

    # One AUROC value per fold.
    scores = cross_val_score(model, X_scaled, y, cv=cv, scoring='roc_auc', n_jobs=-1)
    mean_auc, std_auc = scores.mean(), scores.std()

    record = {
        'Model': name,
        'Mean AUROC': mean_auc,
        'Std AUROC': std_auc,
        'Min AUROC': scores.min(),
        'Max AUROC': scores.max(),
        'CV Scores': scores,
    }
    cv_results.append(record)

    # The printed spread is two standard deviations.
    print(f"  AUROC: {mean_auc:.4f} (+/- {std_auc*2:.4f})")

cv_df = pd.DataFrame(cv_results)


Evaluating Logistic Regression (C=1.0)...


  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + 

  AUROC: 0.8368 (+/- 0.0059)

Evaluating Logistic Regression (C=0.1)...


  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction 

  AUROC: 0.8370 (+/- 0.0057)

Evaluating Logistic Regression (C=0.01)...


  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  ret = a @ b
  ret = a @ b
  ret = a @ b


  AUROC: 0.8365 (+/- 0.0052)

Evaluating Random Forest (depth=None)...
  AUROC: 0.8687 (+/- 0.0043)

Evaluating Random Forest (depth=10)...
  AUROC: 0.8640 (+/- 0.0048)

Evaluating Random Forest (depth=5)...
  AUROC: 0.8477 (+/- 0.0064)

Evaluating XGBoost (depth=6)...
  AUROC: 0.8680 (+/- 0.0061)

Evaluating XGBoost (depth=3)...
  AUROC: 0.8579 (+/- 0.0072)

Evaluating SVM (C=1.0)...
  AUROC: 0.8507 (+/- 0.0050)

Evaluating SVM (C=0.1)...
  AUROC: 0.8472 (+/- 0.0039)


In [None]:
# Print the per-model summary table (raw fold scores are left out).
rule = "=" * 80
summary_columns = ['Model', 'Mean AUROC', 'Std AUROC', 'Min AUROC', 'Max AUROC']

print("\n" + rule)
print("CROSS-VALIDATION RESULTS")
print(rule)
print(cv_df[summary_columns].to_string(index=False))

In [None]:
# Visualize cross-validation results: per-fold score distribution (left) and
# mean +/- one standard deviation (right).
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

cv_scores_list = [r['CV Scores'] for r in cv_results]
model_names = [r['Model'] for r in cv_results]

# Box plot of CV scores.
# The full model name (including its hyperparameter suffix) is used as the
# tick label: stripping the suffix would give several boxes the same label and
# make the regularization variants indistinguishable. The deprecated `labels`
# keyword is no longer passed, since the tick labels are set explicitly below.
axes[0].boxplot(cv_scores_list)
axes[0].set_xticklabels(model_names, rotation=45, ha='right')
axes[0].set_ylabel('AUROC')
axes[0].set_title('Cross-Validation Score Distribution', fontsize=12, fontweight='bold')
axes[0].grid(True, alpha=0.3)

# Bar plot with error bars (one standard deviation across folds).
means = [r['Mean AUROC'] for r in cv_results]
stds = [r['Std AUROC'] for r in cv_results]
x_pos = np.arange(len(model_names))

bars = axes[1].bar(x_pos, means, yerr=stds, capsize=5, alpha=0.7, color='steelblue')
axes[1].set_xticks(x_pos)
axes[1].set_xticklabels(model_names, rotation=45, ha='right')
axes[1].set_ylabel('Mean AUROC')
axes[1].set_title('Mean CV Score with Standard Deviation', fontsize=12, fontweight='bold')
# Zoomed y-range so differences between models are visible.
axes[1].set_ylim([0.75, 0.90])
axes[1].grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.savefig('cross_validation_results.png', dpi=300, bbox_inches='tight')
plt.show()

## 3. Learning Curves: Diagnosing Overfitting

Learning curves show how training and validation scores change with training set size:
- **High bias (underfitting)**: Both scores are low and converge
- **High variance (overfitting)**: Large gap between training and validation scores

In [None]:
def plot_learning_curve(estimator, title, X, y, cv=5, n_jobs=-1, 
                        train_sizes=np.linspace(0.1, 1.0, 10)):
    """Compute AUROC learning-curve statistics for ``estimator``.

    Despite its name this function draws nothing: it only runs
    ``learning_curve`` and aggregates the per-fold scores. The caller is
    responsible for plotting. ``title`` is accepted but not used.

    Parameters
    ----------
    estimator : estimator passed through to ``learning_curve``.
    title : unused.
    X, y : feature matrix and target passed through to ``learning_curve``.
    cv : cross-validation specification forwarded unchanged.
    n_jobs : parallelism setting forwarded unchanged.
    train_sizes : training-set sizes forwarded unchanged.

    Returns
    -------
    tuple
        ``(train_sizes, train_mean, train_std, val_mean, val_std)`` where the
        sizes are those returned by ``learning_curve`` and each statistic is
        taken across folds (axis 1) for every training size.
    """
    sizes, fold_train, fold_val = learning_curve(
        estimator, X, y,
        train_sizes=train_sizes, cv=cv,
        scoring='roc_auc', n_jobs=n_jobs,
    )

    return (
        sizes,
        fold_train.mean(axis=1),
        fold_train.std(axis=1),
        fold_val.mean(axis=1),
        fold_val.std(axis=1),
    )

In [None]:
# Plot learning curves for key models on a 2x2 grid (one panel per model).
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

models_for_lc = [
    ('Logistic Regression', LogisticRegression(C=0.1, max_iter=1000, class_weight='balanced', random_state=RANDOM_STATE)),
    ('Random Forest (depth=10)', RandomForestClassifier(n_estimators=100, max_depth=10, class_weight='balanced', random_state=RANDOM_STATE, n_jobs=-1)),
    ('Random Forest (depth=None)', RandomForestClassifier(n_estimators=100, max_depth=None, class_weight='balanced', random_state=RANDOM_STATE, n_jobs=-1)),
    ('XGBoost', xgb.XGBClassifier(n_estimators=100, max_depth=3, learning_rate=0.1, scale_pos_weight=(y==0).sum()/(y==1).sum(), random_state=RANDOM_STATE, eval_metric='logloss'))
]

for idx, (name, model) in enumerate(models_for_lc):
    ax = axes[idx // 2, idx % 2]

    print(f"Computing learning curve for {name}...")
    # Use the notebook-wide shuffled, seeded StratifiedKFold (`cv`) instead of
    # a bare integer: an integer would fall back to unshuffled folds, so these
    # curves would be computed on different splits than the cross-validation
    # scores reported earlier.
    train_sizes, train_mean, train_std, val_mean, val_std = plot_learning_curve(
        model, name, X_scaled, y, cv=cv
    )

    # Mean curves with +/- one standard deviation bands across folds.
    ax.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, alpha=0.1, color='blue')
    ax.fill_between(train_sizes, val_mean - val_std, val_mean + val_std, alpha=0.1, color='orange')
    ax.plot(train_sizes, train_mean, 'o-', color='blue', label='Training score')
    ax.plot(train_sizes, val_mean, 'o-', color='orange', label='Validation score')

    ax.set_xlabel('Training Set Size')
    ax.set_ylabel('AUROC')
    ax.set_title(f'Learning Curve: {name}', fontsize=11, fontweight='bold')
    ax.legend(loc='lower right')
    ax.grid(True, alpha=0.3)
    ax.set_ylim([0.75, 1.0])

    # Train/validation gap at the largest training size, shown in the panel.
    gap = train_mean[-1] - val_mean[-1]
    ax.text(0.05, 0.95, f'Gap: {gap:.3f}', transform=ax.transAxes,
            fontsize=10, verticalalignment='top',
            bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

plt.tight_layout()
plt.savefig('learning_curves.png', dpi=300, bbox_inches='tight')
plt.show()

## 4. Validation Curves: Finding Optimal Regularization

Validation curves show how training and validation scores change as a single hyperparameter is varied while everything else is held fixed.

In [None]:
# Validation curve for Logistic Regression (C parameter)
print("Computing validation curve for Logistic Regression (C parameter)...")

# Ten values of C, log-spaced from 1e-3 to 1e2.
C_range = np.logspace(-3, 2, 10)

# Use the notebook-wide shuffled, seeded StratifiedKFold (`cv`) so this curve
# is computed on the same kind of splits as the cross-validation scores
# reported earlier; a bare integer would use unshuffled folds.
train_scores, val_scores = validation_curve(
    LogisticRegression(max_iter=1000, class_weight='balanced', random_state=RANDOM_STATE),
    X_scaled, y, param_name='C', param_range=C_range,
    cv=cv, scoring='roc_auc', n_jobs=-1
)

# Aggregate across folds (axis 1): one mean/std per value of C.
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
val_mean = np.mean(val_scores, axis=1)
val_std = np.std(val_scores, axis=1)

In [None]:
# Validation curve for Random Forest (max_depth)
print("Computing validation curve for Random Forest (max_depth)...")

# The trailing None (unlimited depth) is kept in the list but sliced off
# wherever the depths are used, because it cannot be placed on a numeric axis.
depth_range = [2, 3, 5, 7, 10, 15, 20, None]

# Use the notebook-wide shuffled, seeded StratifiedKFold (`cv`) so this curve
# is computed on the same kind of splits as the cross-validation scores
# reported earlier; a bare integer would use unshuffled folds.
rf_train_scores, rf_val_scores = validation_curve(
    RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=RANDOM_STATE, n_jobs=-1),
    X_scaled, y, param_name='max_depth', param_range=depth_range[:-1],  # Exclude None
    cv=cv, scoring='roc_auc', n_jobs=-1
)

# Aggregate across folds (axis 1): one mean/std per depth.
rf_train_mean = np.mean(rf_train_scores, axis=1)
rf_train_std = np.std(rf_train_scores, axis=1)
rf_val_mean = np.mean(rf_val_scores, axis=1)
rf_val_std = np.std(rf_val_scores, axis=1)

In [None]:
# Plot validation curves: logistic regression over C (left, log x-axis) and
# random forest over max_depth (right).
def _draw_train_val(ax, xs, tr_mean, tr_std, va_mean, va_std, log_x=False):
    """Draw training/validation mean lines with +/- one std bands on ``ax``."""
    line = ax.semilogx if log_x else ax.plot
    line(xs, tr_mean, 'o-', color='blue', label='Training')
    ax.fill_between(xs, tr_mean - tr_std, tr_mean + tr_std, alpha=0.1, color='blue')
    line(xs, va_mean, 'o-', color='orange', label='Validation')
    ax.fill_between(xs, va_mean - va_std, va_mean + va_std, alpha=0.1, color='orange')


def _finish_panel(ax, xlabel, title):
    """Apply the axis labels, title, legend and grid shared by both panels."""
    ax.set_xlabel(xlabel)
    ax.set_ylabel('AUROC')
    ax.set_title(title, fontsize=12, fontweight='bold')
    ax.legend(loc='lower right')
    ax.grid(True, alpha=0.3)


fig, axes = plt.subplots(1, 2, figsize=(16, 6))
lr_ax, rf_ax = axes[0], axes[1]

# Logistic Regression: mark the C with the highest mean validation AUROC.
_draw_train_val(lr_ax, C_range, train_mean, train_std, val_mean, val_std, log_x=True)
best_C = C_range[np.argmax(val_mean)]
lr_ax.axvline(x=best_C, color='green', linestyle='--', label=f'Best C={best_C:.3f}')
_finish_panel(lr_ax, 'C (Regularization Parameter)', 'Validation Curve: Logistic Regression')

# Random Forest: the trailing None entry of depth_range is not plotted.
finite_depths = depth_range[:-1]
_draw_train_val(rf_ax, finite_depths, rf_train_mean, rf_train_std, rf_val_mean, rf_val_std)
best_depth = finite_depths[np.argmax(rf_val_mean)]
rf_ax.axvline(x=best_depth, color='green', linestyle='--', label=f'Best depth={best_depth}')
_finish_panel(rf_ax, 'Max Depth', 'Validation Curve: Random Forest')

plt.tight_layout()
plt.savefig('validation_curves.png', dpi=300, bbox_inches='tight')
plt.show()

## 5. Early Stopping for XGBoost

Early stopping prevents overfitting by stopping training when validation performance stops improving.

In [None]:
# Two nested stratified 80/20 splits, both driven by the notebook seed:
#   1) full data      -> train / test
#   2) train portion  -> early-stopping train / early-stopping validation
split_kwargs = dict(test_size=0.2, random_state=RANDOM_STATE)

X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, stratify=y, **split_kwargs
)

X_train_es, X_val_es, y_train_es, y_val_es = train_test_split(
    X_train, y_train, stratify=y_train, **split_kwargs
)

print(f"Training: {X_train_es.shape}, Validation: {X_val_es.shape}, Test: {X_test.shape}")

In [None]:
# Train XGBoost with early stopping
print("Training XGBoost with early stopping...")

# Up to 500 boosting rounds; training stops once the AUC on the last eval set
# has not improved for 20 consecutive rounds.
xgb_es = xgb.XGBClassifier(
    n_estimators=500,
    max_depth=6,
    learning_rate=0.1,
    # Class ratio is taken from the labels the model is actually fitted on
    # (y_train_es), not from the larger y_train it was split from.
    scale_pos_weight=(y_train_es==0).sum()/(y_train_es==1).sum(),
    random_state=RANDOM_STATE,
    eval_metric='auc',
    early_stopping_rounds=20
)

# Both sets are tracked so the training history can be plotted; the second
# (validation) set is the last one in the list.
xgb_es.fit(
    X_train_es, y_train_es,
    eval_set=[(X_train_es, y_train_es), (X_val_es, y_val_es)],
    verbose=False
)

print(f"Best iteration: {xgb_es.best_iteration}")
print(f"Best score: {xgb_es.best_score:.4f}")

In [None]:
# Plot the per-round AUC recorded during fitting for both eval sets
# ('validation_0' is the first eval_set entry, 'validation_1' the second).
results = xgb_es.evals_result()
train_auc = results['validation_0']['auc']
val_auc = results['validation_1']['auc']
rounds = range(len(train_auc))

fig, ax = plt.subplots(figsize=(12, 5))

ax.plot(rounds, train_auc, label='Training')
ax.plot(rounds, val_auc, label='Validation')
# Vertical marker at the round selected by early stopping.
ax.axvline(x=xgb_es.best_iteration, color='red', linestyle='--',
           label=f'Early Stop (iter={xgb_es.best_iteration})')

ax.set_xlabel('Boosting Iterations')
ax.set_ylabel('AUROC')
ax.set_title('XGBoost Training with Early Stopping', fontsize=12, fontweight='bold')
ax.legend(loc='lower right')
ax.grid(True, alpha=0.3)

fig.tight_layout()
fig.savefig('early_stopping_xgboost.png', dpi=300, bbox_inches='tight')
plt.show()

## 6. Compare Train vs Test Performance

Large gaps between training and test performance indicate overfitting.

In [None]:
# Fit one model per family on the training split and compare its AUROC on the
# training rows against its AUROC on the held-out test rows.
final_models = {
    'Logistic Regression': LogisticRegression(
        C=0.1, max_iter=1000, class_weight='balanced', random_state=RANDOM_STATE
    ),
    'Random Forest': RandomForestClassifier(
        n_estimators=100, max_depth=10, class_weight='balanced',
        random_state=RANDOM_STATE, n_jobs=-1
    ),
    'XGBoost': xgb.XGBClassifier(
        n_estimators=100, max_depth=3, learning_rate=0.1,
        scale_pos_weight=(y_train==0).sum()/(y_train==1).sum(),
        random_state=RANDOM_STATE, eval_metric='logloss'
    ),
    'SVM': SVC(
        C=0.1, kernel='rbf', class_weight='balanced',
        probability=True, random_state=RANDOM_STATE
    ),
}

train_test_comparison = []

for name, model in final_models.items():
    model.fit(X_train, y_train)

    # Positive-class probability when available, raw decision score otherwise.
    if hasattr(model, 'predict_proba'):
        train_proba, test_proba = (
            model.predict_proba(split)[:, 1] for split in (X_train, X_test)
        )
    else:
        train_proba, test_proba = (
            model.decision_function(split) for split in (X_train, X_test)
        )

    train_auroc = roc_auc_score(y_train, train_proba)
    test_auroc = roc_auc_score(y_test, test_proba)
    gap = train_auroc - test_auroc

    # Bucket the gap: < 0.02 Low, < 0.05 Medium, otherwise High.
    if gap < 0.02:
        risk = 'Low'
    elif gap < 0.05:
        risk = 'Medium'
    else:
        risk = 'High'

    train_test_comparison.append({
        'Model': name,
        'Train AUROC': train_auroc,
        'Test AUROC': test_auroc,
        'Gap': gap,
        'Overfit Risk': risk,
    })

comparison_df = pd.DataFrame(train_test_comparison)

rule = "=" * 80
print("\n" + rule)
print("TRAIN VS TEST PERFORMANCE (Overfitting Detection)")
print(rule)
print(comparison_df.to_string(index=False))

In [None]:
# Grouped bar chart: train vs test AUROC per model, with the gap written
# above each pair and colored by the same thresholds as 'Overfit Risk'.
fig, ax = plt.subplots(figsize=(12, 6))

x = np.arange(len(comparison_df))
width = 0.35
train_col = comparison_df['Train AUROC']
test_col = comparison_df['Test AUROC']

bars1 = ax.bar(x - width/2, train_col, width, label='Train', color='steelblue', alpha=0.8)
bars2 = ax.bar(x + width/2, test_col, width, label='Test', color='darkorange', alpha=0.8)

for i, (train_auc, test_auc, gap) in enumerate(zip(train_col, test_col, comparison_df['Gap'])):
    if gap < 0.02:
        color = 'green'
    elif gap < 0.05:
        color = 'orange'
    else:
        color = 'red'
    # Place the label just above the taller of the two bars.
    ax.annotate(f'Gap: {gap:.3f}', xy=(i, max(train_auc, test_auc) + 0.01),
                ha='center', fontsize=9, color=color, fontweight='bold')

ax.set_xlabel('Model')
ax.set_ylabel('AUROC')
ax.set_title('Training vs Test Performance (Overfitting Check)', fontsize=12, fontweight='bold')
ax.set_xticks(x)
ax.set_xticklabels(comparison_df['Model'])
ax.legend()
ax.set_ylim([0.80, 0.95])
ax.grid(True, alpha=0.3, axis='y')

fig.tight_layout()
fig.savefig('train_test_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

## 7. Final Model Selection with Best Practices

In [None]:
# Train optimized models with all mitigation techniques
print("\n" + "="*80)
print("FINAL MODELS WITH OVERFITTING MITIGATION")
print("="*80)

# Best configurations based on cross-validation
best_models = {
    'Logistic Regression': LogisticRegression(
        C=0.1,  # Regularization
        max_iter=1000,
        class_weight='balanced',
        random_state=RANDOM_STATE
    ),
    'Random Forest': RandomForestClassifier(
        n_estimators=100,
        max_depth=10,  # Limited depth
        min_samples_split=10,  # Prevent small splits
        min_samples_leaf=5,  # Minimum leaf size
        class_weight='balanced',
        random_state=RANDOM_STATE,
        n_jobs=-1
    ),
    'XGBoost': xgb.XGBClassifier(
        n_estimators=100,
        max_depth=3,  # Shallow trees
        learning_rate=0.1,
        reg_alpha=0.1,  # L1 regularization
        reg_lambda=1.0,  # L2 regularization
        scale_pos_weight=(y_train==0).sum()/(y_train==1).sum(),
        random_state=RANDOM_STATE,
        eval_metric='logloss'
    ),
    'SVM': SVC(
        C=0.1,  # Regularization
        kernel='rbf',
        class_weight='balanced',
        probability=True,
        random_state=RANDOM_STATE
    )
}

final_results = []

for name, model in best_models.items():
    # Cross-validate on the training split only. Running CV on the full
    # dataset would let the held-out test rows contribute to the CV score
    # that is later used to pick the "best" model, so the reported test AUROC
    # would no longer be an independent estimate.
    cv_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='roc_auc', n_jobs=-1)

    # Refit on the whole training split, then score the untouched test split.
    model.fit(X_train, y_train)

    # Positive-class probability when available, raw decision score otherwise.
    if hasattr(model, 'predict_proba'):
        test_proba = model.predict_proba(X_test)[:, 1]
    else:
        test_proba = model.decision_function(X_test)

    test_auroc = roc_auc_score(y_test, test_proba)
    test_pr_auc = average_precision_score(y_test, test_proba)

    final_results.append({
        'Model': name,
        'CV AUROC (mean)': cv_scores.mean(),
        'CV AUROC (std)': cv_scores.std(),
        'Test AUROC': test_auroc,
        'Test PR AUC': test_pr_auc
    })

    # The printed CV spread is two standard deviations.
    print(f"\n{name}:")
    print(f"  CV AUROC: {cv_scores.mean():.4f} (+/- {cv_scores.std()*2:.4f})")
    print(f"  Test AUROC: {test_auroc:.4f}")
    print(f"  Test PR AUC: {test_pr_auc:.4f}")

final_df = pd.DataFrame(final_results)

In [None]:
# Persist the result tables (CSV) and the fitted models (pickle) under a
# single output directory, created on demand.
import os
import pickle

out_dir = 'cv_mitigation_results'
os.makedirs(out_dir, exist_ok=True)

# Result tables; only the summary columns of cv_df are written.
tables = [
    (final_df, 'final_cv_results.csv'),
    (cv_df[['Model', 'Mean AUROC', 'Std AUROC']], 'all_cv_results.csv'),
    (comparison_df, 'train_test_comparison.csv'),
]
for frame, csv_name in tables:
    frame.to_csv(f"{out_dir}/{csv_name}", index=False)

# One pickle per model; the file stem is the lower-cased model name with
# spaces replaced by underscores.
# NOTE(review): pickles should only ever be loaded from trusted sources.
for name, model in best_models.items():
    stem = name.lower().replace(' ', '_')
    filename = f"{out_dir}/{stem}_optimized.pkl"
    with open(filename, 'wb') as f:
        pickle.dump(model, f)

print("\nResults saved to 'cv_mitigation_results/' directory")

## 8. Summary

In [None]:
# Text summary of the notebook, built from final_df (CV / test scores of the
# tuned models) and comparison_df (train vs test gaps).
print("="*80)
print("CROSS-VALIDATION & MITIGATION SUMMARY")
print("="*80)

print("\n1. TECHNIQUES APPLIED:")
print("   - 5-Fold Stratified Cross-Validation")
print("   - Learning Curves (bias-variance diagnosis)")
print("   - Validation Curves (optimal hyperparameters)")
print("   - Early Stopping (XGBoost)")
print("   - Regularization (L1/L2, max_depth, C parameter)")

print("\n2. CROSS-VALIDATION RESULTS:")
for _, row in final_df.iterrows():
    # Spread is reported as two standard deviations.
    print(f"   {row['Model']}: {row['CV AUROC (mean)']:.4f} (+/- {row['CV AUROC (std)']*2:.4f})")

print("\n3. OVERFITTING STATUS:")
for _, row in comparison_df.iterrows():
    print(f"   {row['Model']}: Gap = {row['Gap']:.4f} ({row['Overfit Risk']} risk)")

print("\n4. KEY FINDINGS:")
# Best model = highest mean CV AUROC.
best_model = final_df.loc[final_df['CV AUROC (mean)'].idxmax()]
print(f"   - Best model: {best_model['Model']}")
print(f"   - CV AUROC: {best_model['CV AUROC (mean)']:.4f}")
print(f"   - Test AUROC: {best_model['Test AUROC']:.4f}")

# Derive the overfitting conclusion from the measured gaps rather than
# asserting it unconditionally.
flagged = comparison_df.loc[comparison_df['Overfit Risk'] != 'Low', 'Model'].tolist()
if flagged:
    print(f"   - Train/test gap above the 'Low' threshold for: {', '.join(flagged)}")
else:
    print("   - All models show minimal overfitting with proper regularization")

print("\n" + "="*80)