In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

In [5]:
from sklearn.datasets import load_digits
digits = load_digits()

In [6]:
dir(digits)

['DESCR', 'data', 'images', 'target', 'target_names']

In [7]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(digits.data, digits.target, test_size = 0.2)

In [20]:
# using Logistic Regression
log_reg_model = LogisticRegression(max_iter = 3500)
log_reg_model.fit(x_train, y_train)
log_reg_model.score(x_test, y_test)

0.9472222222222222

In [21]:
# using SVM
svm = SVC()
svm.fit(x_train, y_train)
svm.score(x_test, y_test)

0.9944444444444445

In [22]:
# using Random Forest
rf = RandomForestClassifier()
rf.fit(x_train, y_train)
rf.score(x_test, y_test)

0.9805555555555555

In [23]:
# using Decision Tree
dt = DecisionTreeClassifier()
dt.fit(x_train, y_train)
dt.score(x_test, y_test)

0.8583333333333333

# currently random forest is performing the best
# but we cannot rely just on train_test_split because every time it will select a different set of data for traning and testing
# and so the score of our model will vary everytime
# so it might happen that for a particular training and testing set some other algorithm performs better
# that is why we need cross validation

In [26]:
from sklearn.model_selection import KFold
kf = KFold(n_splits = 3)
for train_index, test_index in kf.split([1,2,3,4,5,6]):
    print(train_index, test_index)

[2 3 4 5] [0 1]
[0 1 4 5] [2 3]
[0 1 2 3] [4 5]


# we can see that the sample data got splitted into 3 folds, and each time we got a different training and testing set
# first testing set = [1,2] and rest is training set
# second testing set = [3,4] and rest is training set
# first testing set = [5,6] and rest is training set

In [None]:
# now applying Kfold cross validation on our digits dataset
# from sklearn.model_selection import StratifiedKFold # StratifiedKFold disributes the target variable uniformly while splitting
# skf = StratifiedKFold(n_splits = 3)

In [52]:
def get_score(model, x_train, x_test, y_train, y_test):
    model.fit(x_train, y_train)
    return model.score(x_test, y_test)

In [77]:
log_reg_score = []
rf_score = []
dt_score = []
svm_score = []

for train_index, test_index in kf.split(digits.data):
    x_train, x_test, y_train, y_test = digits.data[train_index], digits.data[test_index],digits.target[train_index], digits.target[test_index]
    log_reg_score.append(get_score(LogisticRegression(max_iter = 3500), x_train, x_test, y_train, y_test))
    rf_score.append(get_score(RandomForestClassifier(n_estimators = 50), x_train, x_test, y_train, y_test))
    dt_score.append(get_score(DecisionTreeClassifier(), x_train, x_test, y_train, y_test))
    svm_score.append(get_score(SVC(), x_train, x_test, y_train, y_test))
    
print("logistic regression score: ",log_reg_score)
print("random forest score: ",rf_score)
print("decision tree score: ",dt_score)
print("support vector machine score: ",svm_score)

logistic regression score:  [0.9282136894824707, 0.9415692821368948, 0.9165275459098498]
random forest score:  [0.9198664440734557, 0.9582637729549248, 0.9198664440734557]
decision tree score:  [0.7445742904841403, 0.8196994991652755, 0.7996661101836394]
support vector machine score:  [0.9666110183639399, 0.9816360601001669, 0.9549248747913188]


# we can see that support vector machine is working best for us

In [78]:
# we can take the mean and decide best algorithm for our data
import numpy as np
print(np.mean(log_reg_score))
print(np.mean(rf_score))
print(np.mean(dt_score))
print(np.mean(svm_score))

0.9287701725097385
0.9326655537006121
0.7879799666110183
0.9677239844184752


In [81]:
# intead of defining a function get_score() and using for loop we can import cross_val_score from sklearn
from sklearn.model_selection import cross_val_score
cross_val_score(LogisticRegression(), digits.data, digits.target, cv=3)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#log

array([0.92153589, 0.94156928, 0.91652755])

In [82]:
cross_val_score(SVC(), digits.data, digits.target, cv=3)

array([0.96494157, 0.97996661, 0.96494157])