In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Synthetic binary classification problem: 1000 samples, 10 features
# (8 informative + 2 redundant) and a deliberate ~90/10 class imbalance.
X, y = make_classification(
    n_features=10,
    n_samples=1000,
    n_informative=8,
    n_redundant=2,
    n_repeated=0,
    n_classes=2,
    weights=[0.9, 0.1],
    random_state=42,
)

# Hold out 30% of the samples for testing. `stratify=y` keeps the minority-class
# share the same in both partitions; a purely random split of imbalanced labels
# can leave the test set with noticeably more or fewer positives than the data
# as a whole.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

In [3]:
from collections import Counter

# Tally how many samples carry each class label, to confirm the imbalance
# requested via `weights` when the dataset was generated.
Counter(y)

Counter({0: 897, 1: 103})

In [19]:
from sklearn.model_selection import KFold

# Toy illustration of KFold: five values split into five shuffled folds, so
# each iteration holds out exactly one index and trains on the other four.
toy_values = [50, 40, 30, 70, 60]
kf = KFold(n_splits=5, shuffle=True, random_state=42)
for train_idx, test_idx in kf.split(toy_values):
    print(train_idx, test_idx)

[0 2 3 4] [1]
[0 1 2 3] [4]
[0 1 3 4] [2]
[1 2 3 4] [0]
[0 1 2 4] [3]


In [5]:
from sklearn.model_selection import KFold

# Plain (unstratified) shuffled 5-fold split: print the class counts of every
# held-out fold to see how much the minority share drifts between folds.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
for train_index, test_index in kf.split(X, y):
    # Fold-specific names, so the hold-out split produced earlier by
    # train_test_split (X_train / X_test / y_train / y_test) is not silently
    # overwritten with whatever the last fold happened to be.
    X_fold_train, X_fold_test = X[train_index], X[test_index]
    y_fold_train, y_fold_test = y[train_index], y[test_index]
    print(Counter(y_fold_test))

Counter({0: 177, 1: 23})
Counter({0: 179, 1: 21})
Counter({0: 183, 1: 17})
Counter({0: 181, 1: 19})
Counter({0: 177, 1: 23})


In [7]:
from sklearn.model_selection import StratifiedKFold

# Stratified shuffled 5-fold split: same inspection as with plain KFold, but
# the splitter is given the labels so each held-out fold keeps roughly the
# overall class proportions.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for train_index, test_index in skf.split(X, y):
    # Fold-specific names, so the hold-out split produced earlier by
    # train_test_split (X_train / X_test / y_train / y_test) is not silently
    # overwritten with whatever the last fold happened to be.
    X_fold_train, X_fold_test = X[train_index], X[test_index]
    y_fold_train, y_fold_test = y[train_index], y[test_index]
    print(Counter(y_fold_test))

Counter({0: 180, 1: 20})
Counter({0: 180, 1: 20})
Counter({0: 179, 1: 21})
Counter({0: 179, 1: 21})
Counter({0: 179, 1: 21})


In [23]:
from sklearn.model_selection import cross_val_score

# Per-fold accuracy of a default logistic regression, evaluated on the
# stratified 5-fold splitter `skf`.
# NOTE: roughly 90% of the labels are class 0, so always predicting the
# majority class already scores about 0.90 accuracy; read the numbers
# against that baseline.
cross_val_score(
    estimator=LogisticRegression(),
    X=X,
    y=y,
    scoring="accuracy",
    cv=skf,
)

array([0.915, 0.91 , 0.895, 0.895, 0.895])

In [25]:
from sklearn.tree import DecisionTreeClassifier
# Per-fold accuracy of a decision tree on the stratified splitter `skf`.
# The tree is seeded because its fitting involves randomness (features are
# permuted at each split), so an unseeded tree can give different scores on
# every re-run of this cell.
cross_val_score(DecisionTreeClassifier(random_state=42), X, y, cv=skf, scoring="accuracy")

array([0.895, 0.915, 0.9  , 0.85 , 0.92 ])

In [13]:
from sklearn.ensemble import RandomForestClassifier
# Per-fold accuracy of a small (10-tree) random forest on the stratified
# splitter `skf`. The forest is seeded: bootstrap sampling and feature
# sub-sampling are random, so without `random_state` the scores change on
# every re-run of this cell.
cross_val_score(
    RandomForestClassifier(n_estimators=10, random_state=42),
    X,
    y,
    cv=skf,
    scoring="accuracy",
)

array([0.92 , 0.925, 0.915, 0.91 , 0.915])

In [15]:
# Same 10-tree random forest, but with `cv=5` instead of the explicit `skf`
# splitter. Per the scikit-learn docs, an integer `cv` with a classifier and a
# binary target uses a stratified K-fold without shuffling, so the folds differ
# from the shuffled `skf` ones. The forest is seeded so the scores are
# reproducible across re-runs.
cross_val_score(
    RandomForestClassifier(n_estimators=10, random_state=42),
    X,
    y,
    cv=5,
    scoring="accuracy",
)

array([0.925, 0.895, 0.92 , 0.93 , 0.93 ])