In [1]:
import numpy as np
import pandas as pd

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import (
    train_test_split,
    KFold,
    StratifiedKFold,
    RepeatedStratifiedKFold,
    LeaveOneOut,
    cross_val_score
)

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline


In [2]:
# Load scikit-learn's bundled breast-cancer dataset.
data = load_breast_cancer()

# Feature matrix and binary target (0 = malignant, 1 = benign).
X, y = data.data, data.target

# Quick sanity check: sample/feature counts and per-class sample counts.
print("Dataset shape:", X.shape)
print("Class distribution:", np.bincount(y))


Dataset shape: (569, 30)
Class distribution: [212 357]


In [3]:
# Shared estimator used by every evaluation strategy below:
# standardise the features, then fit a logistic-regression classifier.
# Wrapping both steps in a Pipeline means the scaler is fit only on whatever
# data the pipeline itself is fit on.
model = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("clf", LogisticRegression(max_iter=500, random_state=42)),
    ]
)


In [4]:
# Method 1: single hold-out split (20 % of the samples reserved for testing),
# stratified on the labels so both parts keep the class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

# Pipeline.fit returns the fitted pipeline itself, so scoring can be chained.
accuracy = model.fit(X_train, y_train).score(X_test, y_test)

print("Train-Test Accuracy:", accuracy)


Train-Test Accuracy: 0.9824561403508771


In [5]:
# Method 2: plain 5-fold cross-validation on shuffled data (no stratification).
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# One accuracy value per fold.
kf_scores = cross_val_score(estimator=model, X=X, y=y, scoring="accuracy", cv=kf)

print("K-Fold Accuracy:", np.mean(kf_scores))


K-Fold Accuracy: 0.9771464058376029


In [6]:
# Method 3: 5-fold cross-validation with shuffled, label-stratified folds.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# One accuracy value per fold.
skf_scores = cross_val_score(estimator=model, X=X, y=y, scoring="accuracy", cv=skf)

print("Stratified K-Fold Accuracy:", np.mean(skf_scores))


Stratified K-Fold Accuracy: 0.9736686849868033


In [7]:
# Method 4: stratified 5-fold cross-validation repeated 5 times,
# giving 5 x 5 = 25 accuracy values in total.
rskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=5, random_state=42)

rskf_scores = cross_val_score(estimator=model, X=X, y=y, scoring="accuracy", cv=rskf)

print("Repeated Stratified K-Fold Accuracy:", np.mean(rskf_scores))


Repeated Stratified K-Fold Accuracy: 0.9778574755472753


In [8]:
# Method 5: leave-one-out cross-validation, where each fold holds out
# exactly one sample, so the pipeline is refit once per sample.
loo = LeaveOneOut()

# Each entry is the accuracy on a single held-out sample.
loo_scores = cross_val_score(estimator=model, X=X, y=y, scoring="accuracy", cv=loo)

print("LOOCV Accuracy:", np.mean(loo_scores))


LOOCV Accuracy: 0.9789103690685413


In [9]:
# Collect the headline accuracy of every evaluation strategy, keyed by its
# display label. The cross-validated entries are means over their folds.
results = {
    "Train-Test": accuracy,
    "K-Fold": kf_scores.mean(),
    "Stratified K-Fold": skf_scores.mean(),
    "Repeated Stratified K-Fold": rskf_scores.mean(),
    "LOOCV": loo_scores.mean(),
}

# One row per strategy, single "Accuracy" column; left as the cell's last
# expression so the notebook renders it as a table.
pd.Series(results, name="Accuracy").to_frame()


Unnamed: 0,Accuracy
Train-Test,0.982456
K-Fold,0.977146
Stratified K-Fold,0.973669
Repeated Stratified K-Fold,0.977857
LOOCV,0.97891
