In [42]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from collections import Counter

In [43]:
# Synthetic binary classification problem with class weights 0.9 / 0.1:
# 1000 samples, 10 features (4 informative, 6 redundant, none repeated).
X, y = make_classification(
    n_samples=1000,
    n_classes=2,
    weights=[0.9, 0.1],
    n_features=10,
    n_informative=4,
    n_redundant=6,
    n_repeated=0,
    random_state=42,
)

# Hold out 25% of the rows for evaluation; the fixed seed keeps the split
# reproducible. NOTE(review): no `stratify=` is passed, so the class ratio in
# the test part is not forced to match the full data set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [44]:
# Baseline: logistic regression with default settings, fitted on the training
# part of the hold-out split (`fit` returns the estimator itself).
model = LogisticRegression().fit(X_train, y_train)

# Predict the held-out rows and print per-class precision / recall / F1.
y_pred = model.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))


              precision    recall  f1-score   support

           0       0.89      1.00      0.94       221
           1       0.67      0.07      0.12        29

    accuracy                           0.89       250
   macro avg       0.78      0.53      0.53       250
weighted avg       0.86      0.89      0.85       250



In [45]:
from sklearn.model_selection import KFold

# 10-fold cross-validation with shuffling; the fixed seed keeps the fold
# membership reproducible. `kf` is reused by the cells further down.
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Accuracy of a freshly fitted model on each test fold, in fold order.
scores = []

# Fold-local names are used on purpose: rebinding X_train / X_test / y_train /
# y_test / model here would silently replace the hold-out split and the fitted
# baseline from the earlier cells with whatever the last fold left behind.
for train_index, test_index in kf.split(X, y):
    fold_model = LogisticRegression()
    fold_model.fit(X[train_index], y[train_index])

    y_fold_test = y[test_index]
    scores.append(fold_model.score(X[test_index], y_fold_test))

    # Class counts of this test fold, to see how the minority class is spread
    # across folds.
    print(Counter(y_fold_test))

# Mean accuracy over the 10 folds. A bare `scores` line before this would be
# discarded, since only the last expression of a cell is displayed.
np.average(scores)

Counter({np.int64(0): 85, np.int64(1): 15})
Counter({np.int64(0): 89, np.int64(1): 11})
Counter({np.int64(0): 92, np.int64(1): 8})
Counter({np.int64(0): 89, np.int64(1): 11})
Counter({np.int64(0): 90, np.int64(1): 10})
Counter({np.int64(0): 89, np.int64(1): 11})
Counter({np.int64(0): 92, np.int64(1): 8})
Counter({np.int64(0): 90, np.int64(1): 10})
Counter({np.int64(0): 87, np.int64(1): 13})
Counter({np.int64(0): 89, np.int64(1): 11})


np.float64(0.901)

In [46]:
from sklearn.model_selection import cross_val_score

# Mean accuracy of a default logistic regression over the folds defined by
# `kf`; cross_val_score returns one score per fold.
cross_val_score(estimator=LogisticRegression(), X=X, y=y, cv=kf).mean()

np.float64(0.901)

In [36]:
from sklearn.tree import DecisionTreeClassifier

# Mean cross-validated accuracy of a decision tree over the folds in `kf`.
# The tree is seeded like the data generation, the hold-out split and the
# folds (42), so re-running the cell reproduces the same score.
np.average(cross_val_score(DecisionTreeClassifier(random_state=42), X, y, cv=kf))

np.float64(0.8800000000000001)

In [47]:
from sklearn.ensemble import RandomForestClassifier

# Mean cross-validated accuracy of a random forest over the folds in `kf`.
# The forest is stochastic, so it is seeded like the rest of the notebook
# (42); without a seed the reported score changes on every run.
np.average(cross_val_score(RandomForestClassifier(random_state=42), X, y, cv=kf))

np.float64(0.9549999999999998)

In [40]:
from sklearn.model_selection import cross_validate

# Evaluate a 20-tree random forest on the folds in `kf` with two metrics at
# once. The result is a dict with per-fold arrays: fit_time, score_time,
# test_accuracy and test_roc_auc.
# The forest is seeded (42, as elsewhere in the notebook) so the per-fold
# scores are reproducible between runs.
cross_validate(
    RandomForestClassifier(n_estimators=20, random_state=42),
    X,
    y,
    cv=kf,
    scoring=['accuracy', 'roc_auc'],
)

{'fit_time': array([0.03125286, 0.02243876, 0.01956677, 0.01865411, 0.01935911,
        0.01901412, 0.01878691, 0.0193038 , 0.01897192, 0.01866198]),
 'score_time': array([0.00213242, 0.001616  , 0.00145102, 0.001405  , 0.00148773,
        0.00158477, 0.00148821, 0.00166011, 0.00145102, 0.00146294]),
 'test_accuracy': array([0.91, 0.89, 0.93, 0.88, 0.94, 0.96, 0.9 , 0.94, 0.94, 0.91]),
 'test_roc_auc': array([0.96135266, 0.9348    , 0.9819928 , 0.93186455, 0.98319328,
        0.984     , 0.97354167, 0.98454436, 0.96651786, 0.95118047])}