In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier 


from sklearn.datasets import load_digits 
import pandas as pd
from sklearn.model_selection import train_test_split 
import numpy as np

In [2]:
digits = load_digits()
print(dir(digits))
df = pd.DataFrame(digits.data)
df['target'] = digits.target 
df.head()

['DESCR', 'data', 'feature_names', 'frame', 'images', 'target', 'target_names']


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,55,56,57,58,59,60,61,62,63,target
0,0.0,0.0,5.0,13.0,9.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,6.0,13.0,10.0,0.0,0.0,0.0,0
1,0.0,0.0,0.0,12.0,13.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,11.0,16.0,10.0,0.0,0.0,1
2,0.0,0.0,0.0,4.0,15.0,12.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,11.0,16.0,9.0,0.0,2
3,0.0,0.0,7.0,15.0,13.0,1.0,0.0,0.0,0.0,8.0,...,0.0,0.0,0.0,7.0,13.0,13.0,9.0,0.0,0.0,3
4,0.0,0.0,0.0,1.0,11.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,16.0,4.0,0.0,0.0,4


In [3]:
x = df.drop(['target'], axis='columns')
y = df.target
train_x, test_x, train_y, test_y = train_test_split(x,y,test_size=0.2)

Logistic Regression Model 

In [4]:
model_lr = LogisticRegression(solver='liblinear', multi_class='ovr')
print(model_lr)
model_lr.fit(train_x, train_y)
print("Logistic Regression Model Score:", model_lr.score(test_x, test_y))
model_lr

LogisticRegression(multi_class='ovr', solver='liblinear')
Logistic Regression Model Score: 0.9361111111111111


Support Vector Model 

In [5]:
model_svm = SVC(C=20 , kernel='rbf' , gamma = 'scale')
print(model_svm)
model_svm.fit(train_x, train_y)
print("Support Vector Model Score:", model_svm.score(test_x, test_y))
model_svm

SVC(C=20)
Support Vector Model Score: 0.9861111111111112


Random Forest Model 

In [6]:
model_rf = RandomForestClassifier(n_estimators=40)
print(model_rf)
model_rf.fit(train_x, train_y)
print("Random Forest Model Score:", model_rf.score(test_x, test_y))
model_rf

RandomForestClassifier(n_estimators=40)
Random Forest Model Score: 0.9638888888888889


K Fold Cross 

In [7]:
from sklearn.model_selection import KFold

#kf = KFold(n_splits=3, random_state=0.2 , shuffle=True)
kf = KFold(n_splits=3)
print(kf)


KFold(n_splits=3, random_state=None, shuffle=False)


Basic Example 

In [8]:
# split(X, y=None, groups=None)
# Generate indices to split data into training and test set.


a = [1,2,3,4,5,6,7,8,9]
for train_index, test_index in kf.split(a):
    print(train_index, test_index)

[3 4 5 6 7 8] [0 1 2]
[0 1 2 6 7 8] [3 4 5]
[0 1 2 3 4 5] [6 7 8]


Applying KFold on digits dataset 

In [9]:
def get_score(model, xtrain, xtest, ytrain, ytest):
    model.fit(xtrain, ytrain)
    return model.score(xtest, ytest)


score_logistic = []
score_svm = []
score_rf = []

model_lr = LogisticRegression(solver='liblinear', multi_class='ovr')
model_svm = SVC(C=20 , kernel='rbf' , gamma = 'scale')
model_rf = RandomForestClassifier(n_estimators=40)

for train_index, test_index in kf.split(digits.data):
    train_x, test_x, train_y, test_y = digits.data[train_index], digits.data[test_index], digits.target[train_index], digits.target[test_index]

    score_logistic.append(get_score(model_lr, train_x, test_x, train_y, test_y))
    score_svm.append(get_score(model_svm, train_x, test_x, train_y, test_y))
    score_rf.append(get_score(model_rf, train_x, test_x, train_y, test_y))

print("Score of Logistic Regression Model using Kfold:",score_logistic)
print("Score of Support Vector Model using KFold",score_svm)
print("Score of Random Forest Model using Kfold",score_rf)

Score of Logistic Regression Model using Kfold: [0.8964941569282137, 0.9515859766277128, 0.9115191986644408]
Score of Support Vector Model using KFold [0.9716193656093489, 0.9799666110183639, 0.9649415692821369]
Score of Random Forest Model using Kfold [0.9248747913188647, 0.9632721202003339, 0.9332220367278798]


Using Cross Validation Score Function 
<br><br>
The cross_val_score function in sklearn evaluates a score by cross-validation. It splits the data set into multiple subsets of training and testing data, trains the model on each training subset, performs predictions on the testing subset, and outputs the prediction accuracy score for each subset. The process repeats depending on the number of cross-validations set.


cv denotes the number of splits being made in the dataset 

In [10]:
from sklearn.model_selection import cross_val_score 

Logistic Regression Model Score using cross val score 

In [12]:
x = digits.data
y=digits.target

In [25]:
print("Cross_Val Scores using Logistic Regression Model:", cross_val_score(LogisticRegression(solver='liblinear', multi_class='ovr'), x, y, cv=3))
avg_lr = np.average(cross_val_score(LogisticRegression(solver='liblinear', multi_class='ovr'), x, y, cv=3))
print("Average Score:",avg_lr)

Cross_Val Scores using Logistic Regression Model: [0.89482471 0.95325543 0.90984975]
Average Score: 0.9193099610461881


In [26]:
print("Cross_Val Score using Support Vector Model:", cross_val_score(SVC(C=20, gamma=10, kernel='rbf'), x, y, cv=3))
avg_svm = np.average(cross_val_score(SVC(C=20, gamma=10, kernel='rbf'), x, y, cv=3))
print("Average Score:",avg_svm)

Cross_Val Score using Support Vector Model: [0.10016694 0.10183639 0.10183639]
Average Score: 0.10127991096271564


In [27]:
print("Cross_Val Score using Random Forest Classifier Model:", cross_val_score(RandomForestClassifier(n_estimators=40), x, y, cv=3))
avg_rf = np.average(cross_val_score(RandomForestClassifier(n_estimators=40), x, y, cv=3))
print("Average Score:", avg_rf)

Cross_Val Score using Random Forest Classifier Model: [0.91986644 0.94991653 0.92821369]
Average Score: 0.9332220367278797
