In [1]:
import numpy as np
from sklearn.datasets import load_breast_cancer, load_diabetes
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, KFold, RepeatedKFold

In [2]:
# Load the breast-cancer dataset and separate the feature matrix from the labels.
data = load_breast_cancer()
X = data.data
y = data.target

In [3]:
# Logistic-regression classifier; max_iter caps the lbfgs solver at 5000 iterations.
model = LogisticRegression(
    solver="lbfgs",
    max_iter=5000,
)

In [4]:
# No validation: the model is scored on the very same samples it was fitted on,
# so this number says nothing about performance on unseen data.
acc_all = model.fit(X, y).score(X, y)  # fit() returns the fitted estimator itself
print(f"[No validation] Accuracy = {acc_all:.4f}")

[No validation] Accuracy = 0.9578


In [5]:
# Holdout validation: set aside 20% of the samples as a test set.
# random_state pins the shuffle so the split is reproducible.
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0, test_size=0.2)

In [6]:
# Fit on the training portion only, then measure accuracy on the held-out portion.
acc_holdout = model.fit(X_tr, y_tr).score(X_te, y_te)
print(f"[Holdout] Accuracy = {acc_holdout:.4f}")

[Holdout] Accuracy = 0.9474


In [7]:
# KFold: 5 shuffled folds with a fixed seed; cross_val_score returns one score per fold.
kf = KFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)
scores_kfold = cross_val_score(estimator=model, X=X, y=y, cv=kf)
print(f"[KFold] Mean = {scores_kfold.mean():.4f}, Std = {scores_kfold.std():.4f}")

[KFold] Mean = 0.9507, Std = 0.0191


In [8]:
# Iterated (Repeated) KFold: 5-fold cross-validation repeated 3 times,
# giving 5 * 3 = 15 scores in total. random_state makes the repeats reproducible.
rkf = RepeatedKFold(n_splits=5, n_repeats=3, random_state=42)
scores_rkf = cross_val_score(model, X, y, cv=rkf)
# Label the result "[RepeatedKFold]" so it is not mistaken for the plain
# KFold result, which is printed with the "[KFold]" tag.
print(f"[RepeatedKFold] Mean = {scores_rkf.mean():.4f}, Std = {scores_rkf.std():.4f}")

[KFold] Mean = 0.9537, Std = 0.0215


In [11]:
# Linear Regression on the Diabetes dataset.
# load_diabetes and LinearRegression are imported in the notebook's import cell
# at the top, so every dependency is declared in one place.
data_reg = load_diabetes()
X_reg, y_reg = data_reg.data, data_reg.target

# Ordinary least-squares regressor used by the validation cells that follow.
linreg = LinearRegression()


In [12]:
# No validation: R^2 is measured on the same samples used for fitting.
r2_all = linreg.fit(X_reg, y_reg).score(X_reg, y_reg)  # fit() returns the estimator
print(f"[No validation][LinearRegression] R^2 = {r2_all:.4f}")


[No validation][LinearRegression] R^2 = 0.5177


In [13]:
# Holdout validation: fit on 80% of the samples, report R^2 on the remaining 20%.
Xr_tr, Xr_te, yr_tr, yr_te = train_test_split(X_reg, y_reg, random_state=0, test_size=0.2)
r2_holdout = linreg.fit(Xr_tr, yr_tr).score(Xr_te, yr_te)
print(f"[Holdout][LinearRegression] R^2 = {r2_holdout:.4f}")


[Holdout][LinearRegression] R^2 = 0.3322


In [14]:
# KFold and RepeatedKFold for the regressor.
# Build both splitters first; each carries its own seed, so they are independent.
kf_reg = KFold(shuffle=True, n_splits=5, random_state=42)
rkf_reg = RepeatedKFold(n_repeats=3, n_splits=5, random_state=42)

# One R^2 score per fold (5 for KFold, 5 * 3 = 15 for RepeatedKFold).
scores_kfold_reg = cross_val_score(estimator=linreg, X=X_reg, y=y_reg, scoring='r2', cv=kf_reg)
scores_rkf_reg = cross_val_score(estimator=linreg, X=X_reg, y=y_reg, scoring='r2', cv=rkf_reg)

# Report mean and spread of the per-fold scores for each scheme.
print(f"[KFold][LinearRegression] Mean = {scores_kfold_reg.mean():.4f}, Std = {scores_kfold_reg.std():.4f}")
print(f"[RepeatedKFold][LinearRegression] Mean = {scores_rkf_reg.mean():.4f}, Std = {scores_rkf_reg.std():.4f}")


[KFold][LinearRegression] Mean = 0.4785, Std = 0.0850
[RepeatedKFold][LinearRegression] Mean = 0.4800, Std = 0.0760
