In [1]:
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Generate a synthetic binary-classification dataset with make_classification
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# Binary problem: 1000 samples, 10 features (8 informative + 2 redundant,
# no repeated ones). The seed makes the generated data repeatable.
X, y = make_classification(
    n_samples=1000,
    n_features=10,
    n_informative=8,
    n_redundant=2,
    n_repeated=0,
    n_classes=2,
    random_state=42,
)

# Hold out 25% of the rows as a test set; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

# Logistic Regression Classifier

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Fit a logistic-regression model (default hyper-parameters) on the training split.
model = LogisticRegression().fit(X_train, y_train)

# Predict labels for the held-out rows.
y_pred = model.predict(X_test)

# Per-class precision / recall / F1 on the test split, printed as a text table.
cr = classification_report(y_test, y_pred)
print(cr)

              precision    recall  f1-score   support

           0       0.73      0.65      0.69       130
           1       0.66      0.74      0.70       120

    accuracy                           0.70       250
   macro avg       0.70      0.70      0.70       250
weighted avg       0.70      0.70      0.70       250



# K-Fold Cross Validation

# Evaluate Logistic Regression

In [27]:
from sklearn.model_selection import KFold, cross_val_score
import numpy as np

# 5-fold splitter with shuffling; the seed fixes which rows land in each fold.
kf = KFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

# Cross-validate logistic regression on the full dataset (no `scoring` given,
# so the estimator's default score is used), then show the mean over folds.
scores_log = cross_val_score(LogisticRegression(), X, y, cv=kf)
np.mean(scores_log)

0.6950000000000001

# Evaluate Decision Tree

In [29]:
from sklearn.tree import DecisionTreeClassifier

# 5-fold shuffled splitter with a fixed seed, so the fold membership is repeatable.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Seed the tree as well: an unseeded DecisionTreeClassifier can produce a
# different score on every run, which breaks "Restart & Run All" reproducibility.
# NOTE: any previously recorded output came from an unseeded run and may differ slightly.
scores_dt = cross_val_score(DecisionTreeClassifier(random_state=42), X, y, cv=kf)

# Mean score across the folds.
np.average(scores_dt)

0.79

# Evaluate Random Forest Classifier

In [35]:
from sklearn.ensemble import RandomForestClassifier

# 5-fold shuffled splitter with a fixed seed, so the fold membership is repeatable.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Random forests are stochastic (bootstrap samples, random feature subsets);
# seeding the estimator makes the cross-validated score reproducible.
# NOTE: any previously recorded output came from an unseeded run and may differ slightly.
scores_rf = cross_val_score(RandomForestClassifier(random_state=42), X, y, cv=kf)

# Mean score across the folds.
np.average(scores_rf)

0.889

In [37]:
from sklearn.ensemble import RandomForestClassifier

# 5-fold shuffled splitter with a fixed seed, so the fold membership is repeatable.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Same experiment with a 45-tree forest. The estimator is seeded so that the
# comparison against other settings is not blurred by run-to-run randomness.
# NOTE: any previously recorded output came from an unseeded run and may differ slightly.
scores_rf = cross_val_score(
    RandomForestClassifier(n_estimators=45, random_state=42),
    X,
    y,
    cv=kf,
)

# Mean score across the folds.
np.average(scores_rf)

0.8809999999999999

In [43]:
from sklearn.ensemble import RandomForestClassifier

# 5-fold shuffled splitter with a fixed seed, so the fold membership is repeatable.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# 45-tree forest scored with ROC AUC instead of the default metric.
# The estimator is seeded so the reported AUC is reproducible.
# NOTE: any previously recorded output came from an unseeded run and may differ slightly.
scores_rf = cross_val_score(
    RandomForestClassifier(n_estimators=45, random_state=42),
    X,
    y,
    cv=kf,
    scoring="roc_auc",
)

# Mean ROC AUC across the folds.
np.average(scores_rf)

0.9512819521983593

In [None]:
# cross_validate: evaluate several metrics in a single cross-validation run

In [39]:
from sklearn.model_selection import cross_validate

# Score a decision tree on accuracy and ROC AUC in one pass; the result is a
# dict with per-fold fit/score times and one `test_<metric>` array per metric.
# Relies on the `kf` splitter defined in an earlier cell.
# The tree is seeded so the per-fold numbers are reproducible across runs
# (any previously recorded output came from an unseeded run and may differ slightly).
cross_validate(
    DecisionTreeClassifier(random_state=42),
    X,
    y,
    cv=kf,
    scoring=["accuracy", "roc_auc"],
)

{'fit_time': array([0.02583098, 0.01780415, 0.02278733, 0.02234364, 0.01723981]),
 'score_time': array([0.00645971, 0.00702357, 0.01196671, 0.00499964, 0.00504994]),
 'test_accuracy': array([0.71 , 0.83 , 0.79 , 0.835, 0.805]),
 'test_roc_auc': array([0.7095544 , 0.82873149, 0.79171669, 0.8369391 , 0.80820955])}

# Stratified K-Fold

In [48]:
# Generate a synthetic, class-imbalanced dataset with make_classification
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# Same generator settings as the first dataset, but with weights=[0.9, 0.1]
# requesting roughly 90% / 10% class proportions.
# NOTE: this rebinds X and y, replacing the dataset created earlier in the notebook.
X, y = make_classification(n_features=10,
                           n_samples=1000,
                           n_informative=8,
                           n_redundant=2,
                           n_repeated=0,
                           n_classes=2,
                           weights=[0.9, 0.1],
                           random_state=42
                          )

# With a small minority class, stratify on y so the train and test parts keep
# the same class ratio; a purely random split can skew the minority share.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

In [50]:
from collections import Counter

# Tally the labels per class to check the imbalance requested via `weights`.
Counter(y)

Counter({0: 897, 1: 103})

In [54]:
from sklearn.model_selection import StratifiedKFold

# Stratified 5-fold splitter, shuffled with a fixed seed so folds are repeatable.
skf = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

In [56]:
# Cross-validate a 45-tree forest on the imbalanced data using the stratified
# splitter `skf`. The forest is seeded so the score is reproducible
# (any previously recorded output came from an unseeded run and may differ slightly).
# NOTE(review): no `scoring` is given, so the estimator's default score
# (accuracy for classifiers) is used. With about 90% of labels in class 0,
# always predicting class 0 already scores about 0.90 — consider also
# reporting scoring="roc_auc" or "f1".
scores_rf = cross_val_score(
    RandomForestClassifier(n_estimators=45, random_state=42),
    X,
    y,
    cv=skf,
)

# Mean score across the folds.
np.average(scores_rf)

0.9200000000000002