<h3 align="center">Codebasics ML Course: K Fold Cross Validation</h3>

We will generate a synthetic binary classification dataset with 1,000 samples and 10 features.

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Build a reproducible synthetic two-class problem: 1000 rows and 10 features,
# declared as 8 informative plus 2 redundant columns (no repeated columns).
X, y = make_classification(
    n_samples=1000,
    n_features=10,
    n_informative=8,
    n_redundant=2,
    n_repeated=0,
    n_classes=2,
    random_state=42,
)

# Hold out 25% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [2]:
from sklearn.metrics import classification_report

# Baseline: fit logistic regression on the training split (fit returns the
# estimator itself, so construction and fitting can be chained).
model = LogisticRegression().fit(X_train, y_train)

# Score the hold-out rows and render the per-class precision/recall/F1 table.
y_pred = model.predict(X_test)
report = classification_report(y_true=y_test, y_pred=y_pred)

# The report is a multi-line string, so it is printed rather than displayed.
print(report)

              precision    recall  f1-score   support

           0       0.73      0.65      0.69       130
           1       0.66      0.74      0.70       120

    accuracy                           0.70       250
   macro avg       0.70      0.70      0.70       250
weighted avg       0.70      0.70      0.70       250



In [3]:
from sklearn.model_selection import KFold

# Five shuffled folds; the fixed seed makes the fold assignment repeatable.
kf = KFold(
    n_splits=5,
    random_state=42,
    shuffle=True,
)

# Illustrate the splitter on a five-element list: every iteration yields the
# positional indices used for training followed by the held-out indices.
toy_values = [50, 60, 70, 80, 90]
toy_splits = kf.split(toy_values)
for train_index, test_index in toy_splits:
    print(train_index, test_index)

[0 2 3 4] [1]
[0 1 2 3] [4]
[0 1 3 4] [2]
[1 2 3 4] [0]
[0 1 2 4] [3]


In [4]:
# Manual 5-fold cross-validation of logistic regression using the `kf` splitter.
# Fold-specific names are used on purpose: reusing X_train/X_test/y_train/y_test
# or `model` here would silently overwrite the hold-out split and the fitted
# baseline model created in the earlier cells.
for fold_train_idx, fold_test_idx in kf.split(X, y):
    X_fold_train, X_fold_test = X[fold_train_idx], X[fold_test_idx]
    y_fold_train, y_fold_test = y[fold_train_idx], y[fold_test_idx]

    # A fresh estimator per fold guarantees nothing carries over between folds.
    fold_model = LogisticRegression()
    fold_model.fit(X_fold_train, y_fold_train)

    # Mean accuracy on this fold's held-out rows.
    print(fold_model.score(X_fold_test, y_fold_test))

0.675
0.715
0.72
0.645
0.72


### Cross validation on Logistic Regression

In [5]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validated scores for logistic regression using the estimator's
# default scorer; one score per fold is returned.
scores = cross_val_score(
    estimator=LogisticRegression(),
    X=X,
    y=y,
    cv=5,
)
scores

array([0.71 , 0.69 , 0.655, 0.685, 0.7  ])

### Cross validation on Decision Tree

In [6]:
from sklearn.tree import DecisionTreeClassifier
# 5-fold cross-validated scores for a decision tree.
# The tree permutes features at every split and breaks ties between equally
# good splits at random, so it is seeded to keep the fold scores repeatable
# under Restart & Run All.
scores = cross_val_score(DecisionTreeClassifier(random_state=42), X, y, cv=5)
scores

array([0.8  , 0.715, 0.8  , 0.81 , 0.83 ])

### Cross validation on Random Forest

In [7]:
from sklearn.ensemble import RandomForestClassifier
# 5-fold cross-validated scores for a random forest with default settings.
# Bootstrapping and feature sub-sampling are random, so the forest is seeded to
# make the reported scores reproducible on every run.
cross_val_score(RandomForestClassifier(random_state=42), X, y, cv=5)

array([0.875, 0.86 , 0.915, 0.915, 0.885])

### Cross validation to evaluate same model with different parameters

Random Forest with 20 trees

In [8]:
# Mean 5-fold score (default scorer) for a 20-tree random forest.
# Seeding the forest ensures that differences between hyper-parameter settings
# are not just run-to-run noise.
score_rfc = cross_val_score(
    RandomForestClassifier(n_estimators=20, random_state=42),
    X,
    y,
    cv=5,
)
np.mean(score_rfc)

np.float64(0.867)

Random Forest with 30 trees (scored with ROC AUC instead of the default accuracy)

In [9]:
# Mean 5-fold ROC AUC for a 30-tree random forest.
# NOTE: the metric here is ROC AUC, whereas the 20-tree cell uses the default
# scorer, so the two averages are not directly comparable.
# The forest is seeded so the result is reproducible across re-runs.
score_rfc = cross_val_score(
    RandomForestClassifier(n_estimators=30, random_state=42),
    X,
    y,
    cv=5,
    scoring='roc_auc',
)
# np.mean matches the aggregation used in the sibling cells; for an unweighted
# array it returns the same value as np.average.
np.mean(score_rfc)

np.float64(0.94331)

### Using cross_validate to evaluate multiple metrics

In [10]:
from sklearn.model_selection import cross_validate
# Evaluate accuracy and ROC AUC in one cross-validation pass. The result is a
# dict holding fit/score timings plus one `test_<metric>` array per metric.
# The forest is seeded so the metric arrays are reproducible (timings still vary).
cross_validate(
    RandomForestClassifier(random_state=42),
    X,
    y,
    cv=5,
    scoring=['accuracy', 'roc_auc'],
)

{'fit_time': array([0.69734764, 0.78602624, 0.69230223, 0.70114279, 0.71259642]),
 'score_time': array([0.05581379, 0.04716372, 0.0445435 , 0.0449121 , 0.04580569]),
 'test_accuracy': array([0.875, 0.87 , 0.925, 0.9  , 0.87 ]),
 'test_roc_auc': array([0.9429 , 0.94055, 0.96515, 0.96135, 0.9426 ])}

In [11]:
from sklearn.model_selection import StratifiedKFold

# Stratified 5-fold splitter with shuffling; the fixed seed keeps the fold
# membership identical between runs so later cells can share the same splits.
skf = StratifiedKFold(
    n_splits=5,
    random_state=42,
    shuffle=True,
)

In [12]:
# Mean score (default scorer) of a 20-tree random forest on the stratified,
# shuffled folds defined by `skf`.
# The splitter is already seeded; seeding the forest as well makes the whole
# evaluation reproducible.
score_rfc = cross_val_score(
    RandomForestClassifier(n_estimators=20, random_state=42),
    X,
    y,
    cv=skf,
)
np.mean(score_rfc)

np.float64(0.859)

In [13]:
# Mean ROC AUC of a 30-tree random forest on the stratified folds from `skf`.
# NOTE: this is ROC AUC, not the default scorer used in the 20-tree cell, so
# the two averages measure different things.
# The forest is seeded so the result is reproducible across re-runs.
score_rfc = cross_val_score(
    RandomForestClassifier(n_estimators=30, random_state=42),
    X,
    y,
    cv=skf,
    scoring='roc_auc',
)
# np.mean matches the aggregation used in the sibling cells; for an unweighted
# array it returns the same value as np.average.
np.mean(score_rfc)

np.float64(0.94636)

In [14]:
# Disabled experiment: a much broader decision-tree grid, kept for reference only.
# The active search in the next cell uses a reduced grid instead.
# NOTE(review): this grid should not be re-enabled as written:
#   - 'max_features' lists the Python types `int` and `float` themselves rather
#     than concrete values;
#   - 'max_leaf_nodes' starts at 1 and 'monotonic_cst' lists bare scalars, which
#     scikit-learn presumably rejects -- confirm against the estimator docs;
#   - the full product is 3*2*5*100*2*4*99 = 2,376,000 candidates, each of which
#     would be fitted once per CV fold.
# from sklearn.model_selection import GridSearchCV

# pram_grid_clf = {'criterion' : ["gini", "entropy", "log_loss"],
#              'splitter' : ["best", "random"],
#              'max_features' : [int, float ,'sqrt', "log2",None],
#               'max_depth': list(range(50,150)),
#               'class_weight' :[None, "balanced"],
#               'monotonic_cst':[0,1,-1,None],
#               'max_leaf_nodes':list(range(1,100))
#              }

# clf = GridSearchCV(
#      DecisionTreeClassifier(),
#      param_grid=pram_grid_clf,
#      cv = skf
# )

# clf.fit(X,y)
# clf.cv_results_

In [15]:
from sklearn.model_selection import GridSearchCV

# Exhaustive grid search over decision-tree hyper-parameters, evaluated on the
# stratified splitter `skf`: 3 criteria x 2 splitters x 100 depths = 600 candidates.
pram_grid_clf = {
    'criterion': ["gini", "entropy", "log_loss"],
    'splitter': ["best", "random"],
    # NOTE(review): depths of 50 and above presumably never constrain a tree
    # grown on this 1000-row dataset, so these 100 values are likely equivalent;
    # consider a smaller range (e.g. 2-20) -- confirm before changing.
    'max_depth': list(range(50, 150)),
}

clf = GridSearchCV(
    # Seeded so that score differences between candidates come from the
    # hyper-parameters rather than from the tree's internal randomness, and so
    # that the ranking is reproducible under Restart & Run All.
    DecisionTreeClassifier(random_state=42),
    param_grid=pram_grid_clf,
    cv=skf,
)

clf.fit(X, y)

# Raw per-candidate results (timings, per-fold scores, ranks) as a dict of arrays.
clf.cv_results_

{'mean_fit_time': array([0.01949682, 0.00385742, 0.01878738, 0.00439429, 0.01928806,
        0.00418844, 0.01949677, 0.00432839, 0.01951365, 0.00432224,
        0.0197125 , 0.00425234, 0.01947608, 0.00440722, 0.01967139,
        0.0042315 , 0.01912379, 0.00423617, 0.01946244, 0.00410752,
        0.01924372, 0.00439992, 0.01995616, 0.00444169, 0.01935258,
        0.00445385, 0.02027841, 0.00437908, 0.01947656, 0.0044488 ,
        0.01902733, 0.00438423, 0.01963248, 0.00438976, 0.01925645,
        0.00408053, 0.01961293, 0.00402937, 0.01938586, 0.00437808,
        0.01909518, 0.00423594, 0.01936717, 0.00440798, 0.01967921,
        0.00512633, 0.02030959, 0.00422168, 0.01949749, 0.00436463,
        0.01958923, 0.0042954 , 0.01951828, 0.00433421, 0.01931   ,
        0.00503407, 0.02060165, 0.00436311, 0.019595  , 0.00431285,
        0.0192472 , 0.00428286, 0.01924481, 0.00435619, 0.01946521,
        0.00435238, 0.01907182, 0.00427184, 0.01952887, 0.00435596,
        0.01857677, 0.00433984,

In [16]:
import pandas as pd

# Tabulate the search results (one row per evaluated parameter combination)
# and preview the first five rows.
df = pd.DataFrame(data=clf.cv_results_)
df.head(n=5)

# df = pd.DataFrame(scores, columns=["model", "best_score", "best_params"])
# df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_splitter,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.019497,0.001134,0.00191,0.000542,gini,50,best,"{'criterion': 'gini', 'max_depth': 50, 'splitt...",0.8,0.79,0.775,0.81,0.765,0.788,0.01631,399
1,0.003857,0.000276,0.00132,0.00016,gini,50,random,"{'criterion': 'gini', 'max_depth': 50, 'splitt...",0.755,0.78,0.805,0.71,0.75,0.76,0.03178,584
2,0.018787,0.00179,0.001541,0.000216,gini,51,best,"{'criterion': 'gini', 'max_depth': 51, 'splitt...",0.81,0.815,0.785,0.815,0.775,0.8,0.016733,200
3,0.004394,8.5e-05,0.001583,4.2e-05,gini,51,random,"{'criterion': 'gini', 'max_depth': 51, 'splitt...",0.81,0.75,0.825,0.78,0.75,0.783,0.030594,440
4,0.019288,0.001148,0.001771,6.8e-05,gini,52,best,"{'criterion': 'gini', 'max_depth': 52, 'splitt...",0.795,0.8,0.79,0.815,0.765,0.793,0.01631,337


In [17]:
# Narrow the search results to the columns needed to compare candidates:
# the split criterion, the mean cross-validated score, and the splitter.
selected_columns = ["param_criterion", "mean_test_score", "param_splitter"]
df = pd.DataFrame(clf.cv_results_, columns=selected_columns)
df

Unnamed: 0,param_criterion,mean_test_score,param_splitter
0,gini,0.788,best
1,gini,0.760,random
2,gini,0.800,best
3,gini,0.783,random
4,gini,0.793,best
...,...,...,...
595,log_loss,0.776,random
596,log_loss,0.803,best
597,log_loss,0.779,random
598,log_loss,0.801,best


In [20]:
from sklearn.model_selection import RandomizedSearchCV


# Randomized search: sample 10 of the 3 x 12 = 36 (criterion, max_depth)
# combinations and score each with 5-fold cross-validation.
pram_grid_clf = {
    'criterion': ["gini", "entropy", "log_loss"],
    'max_depth': [5, 10, 15, 20, 30, 40, 50, 60, 70, 80, 90, 100],
}

clf = RandomizedSearchCV(
    # Seeded estimator: score differences then reflect the hyper-parameters,
    # not the tree's internal randomness.
    DecisionTreeClassifier(random_state=42),
    pram_grid_clf,
    n_iter=10,
    cv=5,
    return_train_score=False,
    # Seeded sampler: the same 10 candidates are drawn on every run, so the
    # table below is reproducible under Restart & Run All.
    random_state=42,
)
clf.fit(X, y)

# One row per sampled candidate, with timings, per-fold scores and rank.
df = pd.DataFrame(clf.cv_results_)
df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_criterion,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.017651,0.007533,0.001394,0.000671,10,log_loss,"{'max_depth': 10, 'criterion': 'log_loss'}",0.775,0.795,0.825,0.77,0.805,0.794,0.0201,7
1,0.027401,0.000774,0.001709,8.8e-05,40,entropy,"{'max_depth': 40, 'criterion': 'entropy'}",0.765,0.795,0.85,0.8,0.855,0.813,0.034438,1
2,0.018569,0.001179,0.001672,8.2e-05,80,gini,"{'max_depth': 80, 'criterion': 'gini'}",0.79,0.74,0.805,0.83,0.825,0.798,0.032342,6
3,0.018019,0.000292,0.001589,4.2e-05,5,log_loss,"{'max_depth': 5, 'criterion': 'log_loss'}",0.765,0.785,0.755,0.815,0.78,0.78,0.020494,10
4,0.027345,0.000966,0.001605,4.8e-05,70,log_loss,"{'max_depth': 70, 'criterion': 'log_loss'}",0.765,0.795,0.84,0.8,0.855,0.811,0.032465,3
5,0.018976,0.001088,0.001577,2.5e-05,100,gini,"{'max_depth': 100, 'criterion': 'gini'}",0.79,0.725,0.815,0.81,0.815,0.791,0.034264,9
6,0.027172,0.00155,0.001737,0.000155,90,log_loss,"{'max_depth': 90, 'criterion': 'log_loss'}",0.76,0.795,0.82,0.8,0.86,0.807,0.032802,5
7,0.030468,0.003877,0.002084,0.001026,20,log_loss,"{'max_depth': 20, 'criterion': 'log_loss'}",0.765,0.8,0.85,0.805,0.84,0.812,0.03043,2
8,0.029935,0.002308,0.001771,0.000291,15,log_loss,"{'max_depth': 15, 'criterion': 'log_loss'}",0.76,0.8,0.84,0.785,0.86,0.809,0.036387,4
9,0.027469,0.001542,0.001432,0.000396,10,entropy,"{'max_depth': 10, 'criterion': 'entropy'}",0.78,0.795,0.82,0.77,0.805,0.794,0.01772,7
