In [1]:
# ============================================
# Nested Cross-Validation with XGBoost
# + Mean / Median Feature Selection Summary
# ============================================

import numpy as np
import pandas as pd

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

from xgboost import XGBClassifier


# ============================================
# 1. Load Dataset
# ============================================

data = load_breast_cancer()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.Series(data.target)

print("======================================")
print("Dataset Loaded")
print("Samples:", X.shape[0])
print("Features:", X.shape[1])
print("======================================\n")


# ============================================
# 2. Define Outer and Inner CV
# ============================================

outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
inner_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

outer_results = []
outer_best_k = []

min_k = 5


# ============================================
# 3. Outer Loop (Model Evaluation)
# ============================================

for fold_idx, (train_idx, test_idx) in enumerate(outer_cv.split(X, y)):

    print(f"===== Outer Fold {fold_idx+1} =====")

    X_train_outer = X.iloc[train_idx]
    y_train_outer = y.iloc[train_idx]

    X_test_outer = X.iloc[test_idx]
    y_test_outer = y.iloc[test_idx]

    # ----------------------------------------
    # Step A: Train model for tree importance
    # ----------------------------------------
    model_full = XGBClassifier(
        n_estimators=300,
        max_depth=4,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        eval_metric="logloss",
        n_jobs=-1
    )

    model_full.fit(X_train_outer, y_train_outer)

    booster = model_full.get_booster()
    score_dict = booster.get_score(importance_type="gain")

    importance_values = [
        score_dict.get(col, 0) for col in X.columns
    ]

    sorted_features = pd.DataFrame({
        "feature": X.columns,
        "importance": importance_values
    }).sort_values(
        by="importance",
        ascending=False
    )["feature"].tolist()

    # ----------------------------------------
    # Step B: Inner CV to find best_k
    # ----------------------------------------
    performance_records = []

    for k in range(min_k, len(sorted_features) + 1):

        selected_features = sorted_features[:k]
        fold_scores = []

        for inner_train_idx, inner_val_idx in inner_cv.split(
            X_train_outer, y_train_outer
        ):

            X_tr = X_train_outer.iloc[inner_train_idx][selected_features]
            y_tr = y_train_outer.iloc[inner_train_idx]

            X_va = X_train_outer.iloc[inner_val_idx][selected_features]
            y_va = y_train_outer.iloc[inner_val_idx]

            model = XGBClassifier(
                n_estimators=300,
                max_depth=4,
                learning_rate=0.05,
                subsample=0.8,
                colsample_bytree=0.8,
                random_state=42,
                eval_metric="logloss",
                n_jobs=-1
            )

            model.fit(X_tr, y_tr)
            preds = model.predict(X_va)

            fold_scores.append(accuracy_score(y_va, preds))

        mean_score = np.mean(fold_scores)
        performance_records.append((k, mean_score))

    performance_df = pd.DataFrame(
        performance_records,
        columns=["num_features", "cv_accuracy"]
    )

    best_k = performance_df.loc[
        performance_df["cv_accuracy"].idxmax(),
        "num_features"
    ]

    print("Best k from Inner CV:", best_k)

    outer_best_k.append(best_k)

    # ----------------------------------------
    # Step C: Retrain with best_k on full outer train
    # ----------------------------------------
    best_features = sorted_features[:best_k]

    final_model = XGBClassifier(
        n_estimators=300,
        max_depth=4,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        eval_metric="logloss",
        n_jobs=-1
    )

    final_model.fit(
        X_train_outer[best_features],
        y_train_outer
    )

    outer_preds = final_model.predict(
        X_test_outer[best_features]
    )

    outer_acc = accuracy_score(y_test_outer, outer_preds)

    print("Outer Fold Accuracy:", round(outer_acc, 4), "\n")

    outer_results.append(outer_acc)


# ============================================
# 4. Final Nested CV Summary
# ============================================

print("======================================")
print("Nested CV Results")
print("======================================")

outer_results = np.array(outer_results)
outer_best_k = np.array(outer_best_k)

print("Outer Fold Accuracies:", np.round(outer_results, 4))
print("Mean Nested CV Accuracy:", round(np.mean(outer_results), 4))
print("Std Nested CV Accuracy:", round(np.std(outer_results), 4))

print("\nBest k chosen per fold:", outer_best_k.tolist())

mean_k = np.mean(outer_best_k)
median_k = np.median(outer_best_k)
std_k = np.std(outer_best_k)
min_k_val = np.min(outer_best_k)
max_k_val = np.max(outer_best_k)

print("\n--- Feature Count Statistics ---")
print("Mean best k:", round(mean_k, 2))
print("Median best k:", int(median_k))
print("Std of best k:", round(std_k, 2))
print("Min best k:", min_k_val)
print("Max best k:", max_k_val)

recommended_k = int(median_k)
print("\nRecommended number of features (Median-based):", recommended_k)

print("\nNested CV Completed Successfully")


Dataset Loaded
Samples: 569
Features: 30

===== Outer Fold 1 =====
Best k from Inner CV: 12
Outer Fold Accuracy: 0.9649 

===== Outer Fold 2 =====
Best k from Inner CV: 11
Outer Fold Accuracy: 0.9649 

===== Outer Fold 3 =====
Best k from Inner CV: 16
Outer Fold Accuracy: 0.9561 

===== Outer Fold 4 =====
Best k from Inner CV: 18
Outer Fold Accuracy: 0.9649 

===== Outer Fold 5 =====
Best k from Inner CV: 10
Outer Fold Accuracy: 0.9735 

Nested CV Results
Outer Fold Accuracies: [0.9649 0.9649 0.9561 0.9649 0.9735]
Mean Nested CV Accuracy: 0.9649
Std Nested CV Accuracy: 0.0055

Best k chosen per fold: [12, 11, 16, 18, 10]

--- Feature Count Statistics ---
Mean best k: 13.4
Median best k: 12
Std of best k: 3.07
Min best k: 10
Max best k: 18

Recommended number of features (Median-based): 12

Nested CV Completed Successfully
