# **1. Import Modul**

In [1]:
import pickle
import pandas as pd
import numpy as np
import plotly.graph_objects as go

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.feature_selection import RFE
from sklearn.compose import ColumnTransformer
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, confusion_matrix, roc_curve
)
from sklearn.model_selection import (
    train_test_split,
    RepeatedKFold,
    StratifiedKFold,
    cross_val_score,
    KFold
)

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

# **2. Load Dataset**

In [2]:
df = pd.read_csv("breast-cancer.csv")
X = df.drop(columns=['id', 'diagnosis'])
y = df['diagnosis'].map({'M': 1, 'B': 0})

# **3. Pre-procesing**

In [3]:
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
num_pipeline = Pipeline(steps=[
    ("skew_corrector", PowerTransformer(method="yeo-johnson", standardize=False)),
    ("scaler", StandardScaler())
])
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, num_cols)
    ],
    remainder="passthrough",
    verbose_feature_names_out=False 
)

# **4. Train Model**

In [4]:
models = {
    "LogisticRegression": LogisticRegression(random_state=42, max_iter=1000),
    "RandomForest": RandomForestClassifier(n_estimators=200, random_state=42),
    "XGBoost": XGBClassifier(random_state=42)
}

In [5]:
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None

    result = {
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred, zero_division=0),
        "recall": recall_score(y_test, y_pred, zero_division=0),
        "f1": f1_score(y_test, y_pred, zero_division=0),
        "confusion": confusion_matrix(y_test, y_pred)
    }

    if y_proba is not None and len(np.unique(y_test)) == 2:
        result["auc"] = roc_auc_score(y_test, y_proba)
        result["roc"] = roc_curve(y_test, y_proba)
    else:
        result["auc"] = None
        result["roc"] = (None, None, None)

    return result


In [6]:
def train_with_validation(X, y, preprocessor, model, model_name,
                          n_repeats=5, k_folds=5):

    print(f"\n==================== {model_name} ====================")

    # ------------------------------------------------------
    # Pipeline: Preprocess → RFE → Model
    # ------------------------------------------------------
    rfe = RFE(estimator=model, n_features_to_select=None)

    pipe = Pipeline([
        ("preprocessor", preprocessor),
        ("rfe", rfe),
        ("model", model)
    ])

    # ------------------------------------------------------
    # REPEATED HOLDOUT
    # ------------------------------------------------------
    rh_metrics = {
        "accuracy": [],
        "precision": [],
        "recall": [],
        "f1": [],
        "auc": [],
        "confusion": [],
        "roc": []
    }

    for i in range(n_repeats):
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, stratify=y, random_state=i
        )

        pipe.fit(X_train, y_train)
        metrics = evaluate_model(pipe, X_test, y_test)
        for k in rh_metrics:
            rh_metrics[k].append(metrics[k])

    rh_mean = {}
    for m in ["accuracy", "precision", "recall", "f1", "auc"]:
        vals = [v for v in rh_metrics[m] if v is not None]
        rh_mean[m] = np.mean(vals) if len(vals) > 0 else None
    
    rh_extra ={
        "confusion": rh_metrics["confusion"][0],
        "roc": rh_metrics["roc"][0]
    }

    transformed_cols = pipe.named_steps["preprocessor"].get_feature_names_out()
    boolean_mask = pipe.named_steps["rfe"].support_
    selected_features = transformed_cols[boolean_mask].tolist()

    print("\nSelected Features (after preprocessing):")
    print(selected_features)

    # Save RFE result
    with open(f"rfe_{model_name}.pkl", "wb") as f:
        pickle.dump({
            "selected_features": selected_features,
            "mask": boolean_mask
        }, f)

    # ------------------------------------------------------
    # K-FOLD CV
    # ------------------------------------------------------
    print("\n--- K-Fold CV ---")

    kf = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=42)
    kf_metrics = {
        "accuracy": [],
        "precision": [],
        "recall": [],
        "f1": [],
        "auc": [],
        "confusion": [],
        "roc": []
    }

    for train_idx, test_idx in kf.split(X, y):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        pipe.fit(X_train, y_train)
        metrics = evaluate_model(pipe, X_test, y_test)

        for k in kf_metrics:
            kf_metrics[k].append(metrics[k])

    kf_mean = {}
    for m in ["accuracy", "precision", "recall", "f1", "auc"]:
        vals = [v for v in kf_metrics[m] if v is not None]
        kf_mean[m] = np.mean(vals) if len(vals) > 0 else None
    kf_extra = {
        "confusion": kf_metrics["confusion"][0],
        "roc": kf_metrics["roc"][0]
    }

    print("K-Fold Mean:", kf_mean)

    # ------------------------------------------------------
    # SAVE MODEL + METRICS
    # ------------------------------------------------------
    with open(f"model_{model_name}.pkl", "wb") as f:
        pickle.dump({
            "model_pipeline": pipe,
            "rfe_features": selected_features,
            "repeated_holdout": rh_mean,
            "repeated_holdout_extra": rh_extra,
            "kfold_mean": kf_mean,
            "kfold_extra": kf_extra
        }, f)

    print(f"Model saved as model_{model_name}.pkl")
    return selected_features, rh_mean, kf_mean

In [7]:
for name, model in models.items():
    train_with_validation(X, y, preprocessor, model, name)



Selected Features (after preprocessing):
['texture_mean', 'area_mean', 'concavity_mean', 'concave points_mean', 'radius_se', 'perimeter_se', 'area_se', 'compactness_se', 'symmetry_se', 'radius_worst', 'perimeter_worst', 'area_worst', 'smoothness_worst', 'concavity_worst', 'symmetry_worst']

--- K-Fold CV ---
K-Fold Mean: {'accuracy': np.float64(0.9718987734823784), 'precision': np.float64(0.9720155038759689), 'recall': np.float64(0.9529346622369879), 'f1': np.float64(0.9618738106200727), 'auc': np.float64(0.9968335854922818)}
Model saved as model_LogisticRegression.pkl


Selected Features (after preprocessing):
['radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'concavity_mean', 'concave points_mean', 'radius_se', 'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst', 'smoothness_worst', 'concavity_worst', 'concave points_worst', 'symmetry_worst']

--- K-Fold CV ---
K-Fold Mean: {'accuracy': np.float64(0.95960254618848), 'precision': np.float64(0.95414090328194

# **5. Evaluasi Model**

In [8]:
for model_name in models:
    try:
        with open(f"model_{model_name}.pkl", "rb") as f:
            pack = pickle.load(f)
    except FileNotFoundError:
        print(f"File model_{model_name}.pkl tidak ditemukan!")
        continue

    print(f"\n==================== Model: {model_name} ====================")

    # Ambil metrik
    rh_mean = pack["repeated_holdout"]
    kf_mean = pack["kfold_mean"]

    # Buat DataFrame untuk tampilkan rapi
    df_metrics = pd.DataFrame([rh_mean, kf_mean], index=["Repeated Holdout", "K-Fold"])
    print("\nMatriks Evaluasi (Accuracy, Precision, Recall, F1, AUC):")
    display(df_metrics)



Matriks Evaluasi (Accuracy, Precision, Recall, F1, AUC):


Unnamed: 0,accuracy,precision,recall,f1,auc
Repeated Holdout,0.968421,0.975641,0.938095,0.955938,0.995106
K-Fold,0.971899,0.972016,0.952935,0.961874,0.996834




Matriks Evaluasi (Accuracy, Precision, Recall, F1, AUC):


Unnamed: 0,accuracy,precision,recall,f1,auc
Repeated Holdout,0.952632,0.942729,0.928571,0.934952,0.986078
K-Fold,0.959603,0.954141,0.93887,0.945077,0.988931




Matriks Evaluasi (Accuracy, Precision, Recall, F1, AUC):


Unnamed: 0,accuracy,precision,recall,f1,auc
Repeated Holdout,0.959649,0.947432,0.942857,0.944792,0.991071
K-Fold,0.952554,0.937319,0.938649,0.936403,0.991385


In [9]:
PLOTLY_TEMPLATE = "plotly_white"

for model_name in models:
    try:
        with open(f"model_{model_name}.pkl", "rb") as f:
            pack = pickle.load(f)
    except FileNotFoundError:
        print(f"File model_{model_name}.pkl tidak ditemukan!")
        continue

    print(f"\n==================== Model: {model_name} ====================")

    # Ambil confusion matrix & ROC dari first K-Fold
    metrics = {
        "confusion": pack["kfold_extra"]["confusion"],
        "roc": pack["kfold_extra"]["roc"]
    }

    # -----------------------------
    # Confusion Matrix
    # -----------------------------
    cm = metrics["confusion"]
    cm_fig = go.Figure(data=go.Heatmap(
        z=cm,
        x=["Pred 0", "Pred 1"],
        y=["True 0", "True 1"],
        colorscale="Blues",
        texttemplate="%{z}",
        textfont=dict(size=20)
    ))
    cm_fig.update_layout(
        template=PLOTLY_TEMPLATE,
        width=400,
        height=400,
        xaxis=dict(
            scaleanchor="y",
            side="top"
        ),
        yaxis=dict(
            scaleanchor="x",
            autorange='reversed'
        ),
        margin=dict(l=50, r=50, t=50, b=50),
        title="Confusion Matrix"
    )
    cm_fig.show()

    # -----------------------------
    # ROC Curve
    # -----------------------------
    if metrics.get("roc") is not None and metrics["roc"][0] is not None:
        fpr, tpr, _ = metrics["roc"]
        roc_fig = go.Figure()
        roc_fig.add_trace(
            go.Scatter(
                x=fpr, y=tpr,
                mode='lines',
                name='ROC Curve',
                line=dict(color='#1091b9', width=2)
            )
        )
        # Random Guess line
        roc_fig.add_trace(
            go.Scatter(
                x=[0, 1],
                y=[0, 1],
                mode='lines',
                name='Random Guess',
                line=dict(color='gray', width=2, dash='dash')
            )
        )
        roc_fig.update_layout(
            title="ROC Curve",
            xaxis_title="False Positive Rate",
            yaxis_title="True Positive Rate",
            template=PLOTLY_TEMPLATE,
            width=600,
            height=500
        )
        roc_fig.show()









