In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from sklearn.datasets import load_digits
# Load the digits dataset via sklearn.datasets; the cells below use
# digits.data as the inputs (X) and digits.target as the labels (y).
digits=load_digits()

In [2]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the samples for testing.
# random_state pins the shuffle so the same split (and therefore comparable
# scores) is produced on every Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    digits.data, digits.target, test_size=0.3, random_state=42
)

In [23]:
# max_iter caps the number of iterations (steps) the optimization algorithm may
# take while searching for the model's coefficients (weights) -- the parameters
# the model uses to make predictions.
lr = LogisticRegression(max_iter=158).fit(X_train, y_train)
lr.score(X_test, y_test)

0.9740740740740741

In [24]:
# Support vector classifier with default settings, scored on the hold-out split.
sv = SVC().fit(X_train, y_train)
sv.score(X_test, y_test)


0.9925925925925926

In [25]:
# Random forests are randomized; fixing random_state makes the fitted forest
# (and the score shown below) reproducible from run to run.
rfc = RandomForestClassifier(random_state=42)
rfc.fit(X_train, y_train)
rfc.score(X_test, y_test)

0.9814814814814815

In [37]:
from sklearn.model_selection import KFold
# In K-Fold cross-validation the dataset is divided into k folds. The model is trained on k-1 folds and tested on the 1 remaining fold; this repeats k times, so every data point gets to be part of the test set.
# Plain KFold splits purely by position (shuffle=False here). When the number of samples is not divisible by k the folds cannot all have the same size: in the 10-sample example in the cell below, the test folds have 4, 3 and 3 samples.
# Stratified K-Fold is a variation of K-Fold that additionally preserves the percentage of samples of each class in every fold.
kf=KFold(n_splits=3)
kf

KFold(n_splits=3, random_state=None, shuffle=False)

In [38]:
# kf.split yields one (train indices, test indices) pair per fold -- 3 pairs here.
for train_index, test_index in kf.split(list(range(1, 11))):
    print(train_index, test_index)

[4 5 6 7 8 9] [0 1 2 3]
[0 1 2 3 7 8 9] [4 5 6]
[0 1 2 3 4 5 6] [7 8 9]


In [39]:
def get_score(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training data and score it on the test data.

    Parameters
    ----------
    model
        Object exposing ``fit(X, y)`` and ``score(X, y)``. It is fitted in
        place by this call.
    X_train, y_train
        Data passed to ``model.fit``.
    X_test, y_test
        Data passed to ``model.score``.

    Returns
    -------
    The value returned by ``model.score(X_test, y_test)``.
    """
    model.fit(X_train, y_train)
    test_score = model.score(X_test, y_test)
    return test_score

In [59]:
from sklearn.model_selection import StratifiedKFold
folds = StratifiedKFold(n_splits=3)
scores_l = []
scores_svm = []
scores_rf = []

# Fold-local names are used on purpose: assigning to X_train/X_test/y_train/y_test
# here would overwrite the hold-out split created earlier with train_test_split,
# and re-running the earlier cells would then silently score on the last CV fold.
for train_index, test_index in folds.split(digits.data, digits.target):
    X_fold_train, X_fold_test = digits.data[train_index], digits.data[test_index]
    y_fold_train, y_fold_test = digits.target[train_index], digits.target[test_index]

    # TODO: find out what solver="liblinear" and multi_class="ovr" do.
    scores_l.append(
        get_score(LogisticRegression(max_iter=200), X_fold_train, X_fold_test, y_fold_train, y_fold_test)
    )
    scores_svm.append(
        get_score(SVC(), X_fold_train, X_fold_test, y_fold_train, y_fold_test)
    )
    # random_state fixed so the random forest's per-fold scores are reproducible.
    scores_rf.append(
        get_score(
            RandomForestClassifier(n_estimators=60, random_state=42),
            X_fold_train, X_fold_test, y_fold_train, y_fold_test,
        )
    )

In [42]:
# Per-fold scores of LogisticRegression collected in the StratifiedKFold loop.
scores_l

[0.9198664440734557, 0.9415692821368948, 0.9165275459098498]

In [43]:
# Per-fold scores of SVC collected in the StratifiedKFold loop.
scores_svm

[0.9649415692821369, 0.9799666110183639, 0.9649415692821369]

In [60]:
# Per-fold scores of RandomForestClassifier collected in the StratifiedKFold loop.
scores_rf

[0.9365609348914858, 0.9549248747913188, 0.9265442404006677]

In [61]:
#When several machine learning models are candidates for a given problem, it is often unclear which one to use. K-Fold cross-validation lets us evaluate and compare the performance of each model by splitting the given dataset into K folds.

In [62]:
from sklearn.model_selection import cross_val_score

In [67]:
# cross_val_score performs the splitting, fitting and scoring itself and
# returns one score per fold (3 folds here).
cross_val_score(
    estimator=LogisticRegression(max_iter=200),
    X=digits.data,
    y=digits.target,
    cv=3,
)

array([0.91986644, 0.94156928, 0.91652755])

In [68]:
# 3-fold cross-validation scores for a default SVC, one score per fold.
cross_val_score(
    estimator=SVC(),
    X=digits.data,
    y=digits.target,
    cv=3,
)

array([0.96494157, 0.97996661, 0.96494157])

In [69]:
# 3-fold cross-validation scores for a random forest with default settings.
# random_state is fixed because an unseeded forest gives different scores on
# every run.
cross_val_score(RandomForestClassifier(random_state=42), digits.data, digits.target, cv=3)

array([0.93656093, 0.95492487, 0.92654424])

In [74]:
# Mean 3-fold cross-validation score for a forest of 40 trees.
# random_state is fixed so the number is reproducible when it is compared
# against the other n_estimators settings.
scores1 = cross_val_score(
    RandomForestClassifier(n_estimators=40, random_state=42),
    digits.data,
    digits.target,
    cv=3,
)
np.average(scores1)

0.9309961046188091

In [75]:
# Mean 3-fold cross-validation score for a forest of 50 trees.
# random_state is fixed so the number is reproducible when it is compared
# against the other n_estimators settings.
scores2 = cross_val_score(
    RandomForestClassifier(n_estimators=50, random_state=42),
    digits.data,
    digits.target,
    cv=3,
)
np.average(scores2)

0.938230383973289

In [86]:
# Mean 3-fold cross-validation score for a forest of 60 trees.
# random_state is fixed so the number is reproducible when it is compared
# against the other n_estimators settings.
scores3 = cross_val_score(
    RandomForestClassifier(n_estimators=60, random_state=42),
    digits.data,
    digits.target,
    cv=3,
)
np.average(scores3)

0.9387868670005565

In [80]:
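#As in the cells above, try other hyperparameter settings (for example different n_estimators values) and compare the mean cross-validation scores to see which tuning makes the model work best.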