In [1]:
# Load dataset
import pandas as pd
import numpy as np
from sklearn.datasets import load_digits

digits = load_digits()
df = pd.DataFrame(digits.data, columns=digits.feature_names)
df.head()

Unnamed: 0,pixel_0_0,pixel_0_1,pixel_0_2,pixel_0_3,pixel_0_4,pixel_0_5,pixel_0_6,pixel_0_7,pixel_1_0,pixel_1_1,...,pixel_6_6,pixel_6_7,pixel_7_0,pixel_7_1,pixel_7_2,pixel_7_3,pixel_7_4,pixel_7_5,pixel_7_6,pixel_7_7
0,0.0,0.0,5.0,13.0,9.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,6.0,13.0,10.0,0.0,0.0,0.0
1,0.0,0.0,0.0,12.0,13.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,11.0,16.0,10.0,0.0,0.0
2,0.0,0.0,0.0,4.0,15.0,12.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,3.0,11.0,16.0,9.0,0.0
3,0.0,0.0,7.0,15.0,13.0,1.0,0.0,0.0,0.0,8.0,...,9.0,0.0,0.0,0.0,7.0,13.0,13.0,9.0,0.0,0.0
4,0.0,0.0,0.0,1.0,11.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2.0,16.0,4.0,0.0,0.0


In [2]:
# Adding target column to dataset
df['target'] = digits.target
df.head()

Unnamed: 0,pixel_0_0,pixel_0_1,pixel_0_2,pixel_0_3,pixel_0_4,pixel_0_5,pixel_0_6,pixel_0_7,pixel_1_0,pixel_1_1,...,pixel_6_7,pixel_7_0,pixel_7_1,pixel_7_2,pixel_7_3,pixel_7_4,pixel_7_5,pixel_7_6,pixel_7_7,target
0,0.0,0.0,5.0,13.0,9.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,6.0,13.0,10.0,0.0,0.0,0.0,0
1,0.0,0.0,0.0,12.0,13.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,11.0,16.0,10.0,0.0,0.0,1
2,0.0,0.0,0.0,4.0,15.0,12.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,11.0,16.0,9.0,0.0,2
3,0.0,0.0,7.0,15.0,13.0,1.0,0.0,0.0,0.0,8.0,...,0.0,0.0,0.0,7.0,13.0,13.0,9.0,0.0,0.0,3
4,0.0,0.0,0.0,1.0,11.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,16.0,4.0,0.0,0.0,4


In [3]:
# Train Test Split
from sklearn.model_selection import train_test_split
X = df.drop(['target'], axis=1)
Y = df['target']

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

In [5]:
# Importing classification algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Checking accuracy on each model individually

In [24]:
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, Y_train)
lr.score(X_test, Y_test)

0.9165275459098498

In [7]:
svm = SVC()
svm.fit(X_train, Y_train)
svm.score(X_test, Y_test)

0.9888888888888889

In [8]:
tree = DecisionTreeClassifier()
tree.fit(X_train, Y_train)
tree.score(X_test, Y_test)

0.8472222222222222

In [9]:
rf = RandomForestClassifier()
rf.fit(X_train, Y_train)
rf.score(X_test, Y_test)

0.9638888888888889

# Applying KFold Cross Validation

In [10]:
def get_score(model, X_train, X_test, Y_train, Y_test):
    model.fit(X_train, Y_train)
    return model.score(X_test, Y_test)

In [19]:
# Applying KFold using sklearn StratifiedKFold
from sklearn.model_selection import StratifiedKFold
folds = StratifiedKFold(n_splits=3)

scores_lr = []
scores_svm = []
scores_tree = []
scores_rf = []

for train_index, test_index in folds.split(digits.data, digits.target):
  X_train, X_test, Y_train, Y_test = digits.data[train_index], digits.data[test_index], digits.target[train_index], digits.target[test_index]

  scores_lr.append(get_score(LogisticRegression(max_iter=1000), X_train, X_test, Y_train, Y_test))
  scores_svm.append(get_score(SVC(), X_train, X_test, Y_train, Y_test))
  scores_tree.append(get_score(DecisionTreeClassifier(), X_train, X_test, Y_train, Y_test))
  scores_rf.append(get_score(RandomForestClassifier(), X_train, X_test, Y_train, Y_test))

In [20]:
scores_lr

[0.9198664440734557, 0.9415692821368948, 0.9165275459098498]

In [21]:
scores_svm

[0.9649415692821369, 0.9799666110183639, 0.9649415692821369]

In [22]:
scores_tree

[0.7412353923205343, 0.8113522537562604, 0.7646076794657763]

In [23]:
scores_rf

[0.9398998330550918, 0.9515859766277128, 0.9198664440734557]

# Using Sklearn cross_val_score function

In [18]:
# Using cross_val_score function
from sklearn.model_selection import cross_val_score

print("LR :- ", cross_val_score(LogisticRegression(max_iter=1000), digits.data, digits.target, cv=3))
print("SVM :- ", cross_val_score(SVC(), digits.data, digits.target, cv=3))
print("DT :- ", cross_val_score(DecisionTreeClassifier(), digits.data, digits.target, cv=3))
print("RF :- ", cross_val_score(RandomForestClassifier(), digits.data, digits.target, cv=3))

LR :-  [0.91986644 0.94156928 0.91652755]
SVM :-  [0.96494157 0.97996661 0.96494157]
DT :-  [0.74123539 0.81135225 0.77796327]
RF :-  [0.94323873 0.95325543 0.91986644]


# Parameter tunning using k fold cross validation

In [27]:
scores1 = cross_val_score(RandomForestClassifier(n_estimators=5),digits.data, digits.target, cv=10)
np.average(scores1)

np.float64(0.8725605214152699)

In [28]:
scores2 = cross_val_score(RandomForestClassifier(n_estimators=20),digits.data, digits.target, cv=10)
np.average(scores2)

np.float64(0.9376660459342023)

In [29]:
scores3 = cross_val_score(RandomForestClassifier(n_estimators=30),digits.data, digits.target, cv=10)
np.average(scores3)

np.float64(0.9415580384854128)

In [30]:
scores4 = cross_val_score(RandomForestClassifier(n_estimators=40),digits.data, digits.target, cv=10)
np.average(scores4)

np.float64(0.9393513345747981)

# Applying cross_val_score on Iris datset


In [32]:
# Load dataset
import pandas as pd
from sklearn.datasets import load_iris

iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df['target'] = iris.target
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [33]:
# Train Test Split
from sklearn.model_selection import train_test_split
X = df.drop(['target'], axis=1)
Y = df['target']

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

In [34]:
# Using cross_val_score function
from sklearn.model_selection import cross_val_score

print("LR :- ", cross_val_score(LogisticRegression(max_iter=1000), X_train, Y_train, cv=3))
print("SVM :- ", cross_val_score(SVC(), X_train, Y_train, cv=3))
print("DT :- ", cross_val_score(DecisionTreeClassifier(), X_train, Y_train, cv=3))
print("RF :- ", cross_val_score(RandomForestClassifier(), X_train, Y_train, cv=3))

LR :-  [0.95  0.925 0.975]
SVM :-  [0.95  0.925 0.975]
DT :-  [0.95  0.925 0.975]
RF :-  [0.95  0.925 0.975]
