In [1]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

from sklearn.datasets import load_breast_cancer

# Load data
data = load_breast_cancer()
X = data.data
y = data.target

# Split the dataset into a training set and a testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the model
model = RandomForestClassifier(n_estimators=100, random_state=4223134)
model.fit(X_train, y_train)

# Test the model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print('Holdout Validation Accuracy:', accuracy)


Holdout Validation Accuracy: 0.9649122807017544


In [2]:

from sklearn.model_selection import KFold, cross_val_score

# Setup the K-Fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Perform K-Fold CV
scores = cross_val_score(model, X, y, cv=kf)
print(f'K-Fold Cross Validation Accuracy scores for each fold: {scores}')
print(f'Mean accuracy: {scores.mean()}')


K-Fold Cross Validation Accuracy scores for each fold: [0.96491228 0.98245614 0.93859649 0.96491228 0.95575221]
Mean accuracy: 0.9613258810743673


In [3]:

from sklearn.model_selection import LeaveOneOut

# Setup Leave One Out Cross-Validation
loo = LeaveOneOut()

# Perform LOOCV
scores = cross_val_score(model, X, y, cv=loo)
print(f'LOOCV Accuracy score: {scores.mean()}')


LOOCV Accuracy score: 0.9630931458699473


In [4]:
from sklearn.model_selection import StratifiedKFold

# Setup Stratified K-Fold cross-validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Perform Stratified K-Fold CV
scores = cross_val_score(model, X, y, cv=skf)
print(f'Stratified K-Fold Cross Validation Accuracy scores: {scores}')
print(f'Mean accuracy: {scores.mean()}')


Stratified K-Fold Cross Validation Accuracy scores: [0.97368421 0.94736842 0.95614035 0.94736842 0.95575221]
Mean accuracy: 0.9560627231796305


In [5]:
import numpy as np

# Perform Random Subsampling
random_scores = []
for _ in range(10):  # Repeat the process 10 times
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=None)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    random_scores.append(accuracy_score(y_test, y_pred))

print(f'Random Subsampling Accuracy scores: {random_scores}')
print(f'Mean accuracy: {np.mean(random_scores)}')


Random Subsampling Accuracy scores: [0.9385964912280702, 0.9649122807017544, 0.956140350877193, 0.9824561403508771, 0.9824561403508771, 0.9824561403508771, 0.9736842105263158, 0.9385964912280702, 0.9473684210526315, 0.9736842105263158]
Mean accuracy: 0.9640350877192981


In [6]:
from sklearn.utils import resample

# Perform Bootstrapping
bootstrap_scores = []
for _ in range(1000):  # Number of bootstrapping samples
    X_boot, y_boot = resample(X_train, y_train)
    model.fit(X_boot, y_boot)
    y_pred = model.predict(X_test)
    bootstrap_scores.append(accuracy_score(y_test, y_pred))

print('Bootstrapping Mean Accuracy:', np.mean(bootstrap_scores))


Bootstrapping Mean Accuracy: 0.9649298245614036


In [7]:
# Train and test the model on the same dataset
model.fit(X, y)
y_pred = model.predict(X)
resubstitution_accuracy = accuracy_score(y, y_pred)
print('Resubstitution Accuracy:', resubstitution_accuracy)


Resubstitution Accuracy: 1.0
