<h3 align="center">Codebasics ML Course: K Fold Cross Validation</h3>

We will generate a synthetic binary-classification dataset (1,000 samples, 10 features) and hold out 25% of it as a test set.

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Synthetic two-class problem: 1000 rows and 10 features, configured with
# 8 informative and 2 redundant columns. The fixed seed makes the generated
# data identical on every run.
X, y = make_classification(
    n_samples=1000,
    n_features=10,
    n_informative=8,
    n_redundant=2,
    n_repeated=0,
    n_classes=2,
    random_state=42,
)

# Hold out a quarter of the rows as a test set; the split is seeded too.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [2]:
from sklearn.metrics import classification_report

# Baseline: one logistic-regression fit on the training split, evaluated
# once on the held-out test split.
model = LogisticRegression().fit(X_train, y_train)

y_pred = model.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.73      0.65      0.69       130
           1       0.66      0.74      0.70       120

    accuracy                           0.70       250
   macro avg       0.70      0.70      0.70       250
weighted avg       0.70      0.70      0.70       250



In [3]:
from sklearn.model_selection import KFold

# Seeded, shuffled 5-fold splitter; the manual cross-validation loop further
# down reuses `kf`.
kf = KFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

# Toy run on five values: each printed line shows the training positions
# followed by the held-out position for that fold.
for train_index, test_index in kf.split([50, 60, 70, 80, 90]):
    print(train_index, test_index)

[0 2 3 4] [1]
[0 1 2 3] [4]
[0 1 3 4] [2]
[1 2 3 4] [0]
[0 1 2 4] [3]


In [4]:
model = LogisticRegression()

# Manual K-fold cross-validation: refit the model on each training fold and
# print its accuracy on the matching held-out fold.
#
# Fold-local names are used on purpose. Reusing X_train / X_test / y_train /
# y_test here would overwrite the hold-out split created at the top of the
# notebook, so re-running the baseline cell afterwards would silently score
# on the last fold instead of the original 25% test set.
for train_index, test_index in kf.split(X, y):
    X_fold_train, X_fold_test = X[train_index], X[test_index]
    y_fold_train, y_fold_test = y[train_index], y[test_index]
    model.fit(X_fold_train, y_fold_train)
    print(model.score(X_fold_test, y_fold_test))

0.675
0.715
0.72
0.645
0.72


### Cross validation on Logistic Regression

In [5]:
from sklearn.model_selection import cross_val_score

# Let scikit-learn run the fit/score loop. `kf` is not passed here, so the
# five folds come from the integer `cv` argument instead.
scores = cross_val_score(
    estimator=LogisticRegression(),
    X=X,
    y=y,
    cv=5,
)
scores

array([0.71 , 0.69 , 0.655, 0.685, 0.7  ])

### Cross validation on Decision Tree

In [6]:
from sklearn.tree import DecisionTreeClassifier
# Seed the tree so the per-fold scores are the same on every
# Restart & Run All (an unseeded tree can vary between runs).
scores = cross_val_score(DecisionTreeClassifier(random_state=42), X, y, cv=5)
scores

array([0.825, 0.735, 0.8  , 0.805, 0.815])

### Cross validation on Random Forest

In [7]:
from sklearn.ensemble import RandomForestClassifier
# Random forests are stochastic (bootstrap samples, random feature subsets);
# a fixed random_state keeps the per-fold scores reproducible.
cross_val_score(RandomForestClassifier(random_state=42), X, y, cv=5)

array([0.875, 0.85 , 0.9  , 0.92 , 0.895])

### Cross-validation to evaluate the same model with different parameters

Random Forest with 20 trees (mean accuracy)

In [8]:
# Mean 5-fold accuracy of a 20-tree forest. The forest is seeded so the
# number is reproducible and any difference from other settings reflects
# the parameter change, not random noise.
score_rfc = cross_val_score(
    RandomForestClassifier(n_estimators=20, random_state=42), X, y, cv=5
)
np.mean(score_rfc)

np.float64(0.8580000000000002)

Random Forest with 30 trees (mean ROC AUC)

In [9]:
# Mean 5-fold ROC AUC of a 30-tree forest (seeded for reproducibility).
# NOTE: this cell scores with 'roc_auc' while the 20-tree cell uses the
# default accuracy, so the two means are not directly comparable.
score_rfc = cross_val_score(
    RandomForestClassifier(n_estimators=30, random_state=42),
    X,
    y,
    cv=5,
    scoring='roc_auc',
)
# np.mean matches the 20-tree cell; for an unweighted array it equals np.average.
np.mean(score_rfc)

np.float64(0.94712)

### Using cross_validate to evaluate multiple metrics

In [10]:
from sklearn.model_selection import cross_validate
# cross_validate accepts several scorers at once and returns a dict with
# timing arrays plus one `test_<metric>` array per scorer. The forest is
# seeded so the metric arrays are reproducible (timings will still vary).
cross_validate(
    RandomForestClassifier(random_state=42),
    X,
    y,
    cv=5,
    scoring=['accuracy', 'roc_auc'],
)

{'fit_time': array([0.70404315, 0.72767234, 0.71660256, 0.71657658, 0.75829768]),
 'score_time': array([0.04942751, 0.04768896, 0.04792714, 0.05448508, 0.04779387]),
 'test_accuracy': array([0.88 , 0.86 , 0.9  , 0.915, 0.87 ]),
 'test_roc_auc': array([0.9411 , 0.95115, 0.96515, 0.95695, 0.9417 ])}

In [11]:
from sklearn.model_selection import StratifiedKFold

# Seeded, shuffled 5-fold stratified splitter, passed as `cv` in the cells below.
skf = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

In [12]:
# Mean accuracy of a 20-tree forest over the folds produced by `skf`.
# Both the splitter and the forest are seeded, so the result is reproducible.
score_rfc = cross_val_score(
    RandomForestClassifier(n_estimators=20, random_state=42), X, y, cv=skf
)
np.mean(score_rfc)

np.float64(0.8629999999999999)

In [13]:
# Mean ROC AUC of a 30-tree forest over the folds produced by `skf`
# (forest seeded for reproducibility). This is a different metric from the
# accuracy reported for the 20-tree run, so compare like with like.
score_rfc = cross_val_score(
    RandomForestClassifier(n_estimators=30, random_state=42),
    X,
    y,
    cv=skf,
    scoring='roc_auc',
)
# np.mean matches the sibling cells; for an unweighted array it equals np.average.
np.mean(score_rfc)

np.float64(0.9386199999999999)

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for DecisionTreeClassifier.
#
# Every entry must be a value the estimator actually accepts, and the grid
# must be small enough to finish: GridSearchCV fits one model per combination
# per fold. This grid has 2*2*3*5*2*5 = 600 combinations (3000 fits with the
# 5-fold `skf`).
#
# Changes from the previous grid, which could not run:
#   * max_features: the type objects `int` and `float` are not valid values;
#     only concrete settings are kept.
#   * max_leaf_nodes: must be None or an integer >= 2 (the old range began at 1).
#   * monotonic_cst: must be None or one constraint per feature, not a scalar
#     0/1/-1, so the key is dropped and the default (None) is used.
#   * max_depth: a dense range(50, 150) multiplied the grid by 100; a handful
#     of representative depths plus None (unlimited) is used instead.
#   * criterion: "log_loss" is documented as the same Shannon information gain
#     as "entropy", so only one of the two is searched.
pram_grid_clf = {
    'criterion': ["gini", "entropy"],
    'splitter': ["best", "random"],
    'max_features': ['sqrt', 'log2', None],
    'max_depth': [3, 5, 10, 20, None],
    'class_weight': [None, "balanced"],
    'max_leaf_nodes': [10, 25, 50, 100, None],
}

clf = GridSearchCV(
    # Seeded so that splitter="random" and the trees' tie-breaking are reproducible.
    DecisionTreeClassifier(random_state=42),
    param_grid=pram_grid_clf,
    cv=skf,
)

clf.fit(X, y)

# Show the winning combination and its mean CV score; the full per-combination
# table is still available in clf.cv_results_ but is too large to dump here.
clf.best_params_, clf.best_score_