In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_digits
import pandas as pd
# Load the digits dataset; digits.data and digits.target are used below as the
# feature matrix and the label vector respectively.
digits = load_digits()

In [19]:
# Number of samples in the dataset.
len(digits.data)

1797

In [33]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the samples for testing. No random_state is passed, so the
# split (and therefore the accuracies below) changes from run to run.
x_train, x_test, y_train, y_test = train_test_split(
  digits.data,
  digits.target,
  test_size=0.25,
)

In [34]:
# Size of the training portion left after holding out test_size = 0.25.
len(x_train)

1347

In [36]:
# Candidate classifiers, all trained and scored on the same hold-out split.
models = [
  LogisticRegression(max_iter=1800),
  SVC(),
  RandomForestClassifier(n_estimators=40),
]

for model in models:
  model.fit(x_train, y_train)
  # score() returns a fraction; scale it to a percentage for display.
  print(model, " ", 100 * model.score(x_test, y_test))

LogisticRegression(max_iter=1800)   95.77777777777777
SVC()   97.77777777777777
RandomForestClassifier(n_estimators=40)   95.55555555555556


Every time the data is split into train and test sets, the accuracy changes, which makes it difficult and time-consuming to identify the algorithm that performs best on the given dataset.

In [39]:
from sklearn.model_selection import KFold
# Plain (non-stratified) 3-fold splitter; the repr below shows its default
# shuffle / random_state settings.
kf = KFold(n_splits = 3)
kf

KFold(n_splits=3, random_state=None, shuffle=False)

In [40]:
# Illustrate KFold on nine toy items: every iteration yields two index arrays,
# the training indices first and the test indices second.
for train_index, test_index in kf.split(list(range(1, 10))):
  print(train_index, test_index)

[3 4 5 6 7 8] [0 1 2]
[0 1 2 6 7 8] [3 4 5]
[0 1 2 3 4 5] [6 7 8]


In [41]:
def get_score(model,x_train,x_test,y_train,y_test):
  """Train ``model`` on the training split and report its score on the test split.

  Args:
    model: Object exposing ``fit(X, y)`` and ``score(X, y)``; it is fitted in place.
    x_train: Training features, passed to ``fit``.
    x_test: Test features, passed to ``score``.
    y_train: Training targets, passed to ``fit``.
    y_test: Test targets, passed to ``score``.

  Returns:
    Whatever ``model.score(x_test, y_test)`` returns.
  """
  model.fit(x_train, y_train)
  test_score = model.score(x_test, y_test)
  return test_score

In [43]:
from sklearn.model_selection import StratifiedKFold
# Stratified 3-fold splitter; used below with digits.target as the labels to
# stratify on.
folds = StratifiedKFold(n_splits = 3)

StratifiedKFold distributes each class uniformly across the folds, so every fold keeps roughly the same class proportions as the full dataset.
Without stratification, one fold might contain only one class while another contains two, and such a non-uniform class distribution can distort both training and the resulting scores.

In [51]:
# Per-fold test accuracy of each classifier, appended in fold order.
scores_lr = []
scores_svm = []
scores_rf = []

for train_index, test_index in folds.split(digits.data, digits.target):
  # Use fold-scoped names so the hold-out split variables (x_train, x_test,
  # y_train, y_test) created by train_test_split are not overwritten;
  # re-running the earlier model-comparison cell then still uses that split
  # instead of silently picking up the last CV fold.
  fold_x_train, fold_x_test = digits.data[train_index], digits.data[test_index]
  fold_y_train, fold_y_test = digits.target[train_index], digits.target[test_index]
  fold_split = (fold_x_train, fold_x_test, fold_y_train, fold_y_test)

  # A fresh, unfitted estimator is built for every fold so nothing learned on
  # one fold carries over to the next.
  scores_lr.append(get_score(LogisticRegression(max_iter=5000), *fold_split))
  scores_svm.append(get_score(SVC(), *fold_split))
  scores_rf.append(get_score(RandomForestClassifier(n_estimators=40), *fold_split))


In [52]:
# Logistic regression: test accuracy on each of the three stratified folds.
scores_lr

[0.9248747913188647, 0.9415692821368948, 0.9232053422370617]

In [53]:
# SVC: test accuracy on each of the three stratified folds.
scores_svm

[0.9649415692821369, 0.9799666110183639, 0.9649415692821369]

In [54]:
# Random forest (n_estimators=40): test accuracy on each of the three stratified folds.
scores_rf

[0.9382303839732888, 0.9515859766277128, 0.9298831385642737]

In [55]:
from sklearn.model_selection import cross_val_score
# cross_val_score does the split/fit/score loop in one call; no cv argument is
# given, and the output below shows one score for each of five folds.
cross_val_score(LogisticRegression(max_iter=5000),digits.data,digits.target)

array([0.925     , 0.87777778, 0.93871866, 0.93314763, 0.89693593])

In [56]:
# Same cross-validation call for SVC with its default parameters.
cross_val_score(SVC(),digits.data,digits.target)

array([0.96111111, 0.94444444, 0.98328691, 0.98885794, 0.93871866])

In [57]:
# Same cross-validation call for a random forest with n_estimators=40.
cross_val_score(RandomForestClassifier(n_estimators=40),digits.data,digits.target)

array([0.93333333, 0.90555556, 0.96100279, 0.95543175, 0.92200557])

So far we have compared different algorithms, each with fixed parameters, to find the best classifier.

We can also use `cross_val_score` to tune a single algorithm: keep the algorithm fixed and compare different parameter values (below, `n_estimators` for the random forest).

In [58]:
# Parameter sweep, step 1: random forest with n_estimators=10.
cross_val_score(RandomForestClassifier(n_estimators=10),digits.data,digits.target)

array([0.91111111, 0.84722222, 0.93036212, 0.93593315, 0.88300836])

In [59]:
# Parameter sweep, step 2: random forest with n_estimators=20.
cross_val_score(RandomForestClassifier(n_estimators=20),digits.data,digits.target)

array([0.91111111, 0.85833333, 0.94428969, 0.94428969, 0.9275766 ])

In [60]:
# Parameter sweep, step 3: random forest with n_estimators=30.
cross_val_score(RandomForestClassifier(n_estimators=30),digits.data,digits.target)

array([0.93333333, 0.89722222, 0.96935933, 0.95821727, 0.9275766 ])

In [61]:
# Parameter sweep, step 4: random forest with n_estimators=40. The forest is
# not seeded, so these scores differ from the earlier n_estimators=40 run.
cross_val_score(RandomForestClassifier(n_estimators=40),digits.data,digits.target)

array([0.91944444, 0.90555556, 0.93871866, 0.95543175, 0.92479109])

In [62]:
# Parameter sweep, step 5: random forest with n_estimators=50.
cross_val_score(RandomForestClassifier(n_estimators=50),digits.data,digits.target)

array([0.91666667, 0.89722222, 0.95543175, 0.9637883 , 0.93036212])

In [63]:
# Parameter sweep, step 6: random forest with n_estimators=60.
cross_val_score(RandomForestClassifier(n_estimators=60),digits.data,digits.target)

array([0.92222222, 0.89722222, 0.96100279, 0.96100279, 0.93314763])