In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

In [2]:
from sklearn.datasets import load_digits
digits = load_digits()

In [4]:
X = digits.data
Y = digits.target

In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3)

In [6]:
lr = LogisticRegression()
lr.fit(X_train,Y_train)
lr.score(X_test,Y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.9629629629629629

In [7]:
svm = SVC()
svm.fit(X_train,Y_train)
svm.score(X_test,Y_test)

0.9907407407407407

In [8]:
rf = RandomForestClassifier()
rf.fit(X_train,Y_train)
rf.score(X_test,Y_test)

0.9796296296296296

In [9]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=3)
kf

KFold(n_splits=3, random_state=None, shuffle=False)

In [10]:
for train_index, test_index in kf.split([1,2,3,4,5,6,7,8,9]):
    print(train_index,test_index)

[3 4 5 6 7 8] [0 1 2]
[0 1 2 6 7 8] [3 4 5]
[0 1 2 3 4 5] [6 7 8]


Compacting Code in one line for calculating score/accuracy of any algorithm

In [15]:
def get_score(model,X_train,X_test,Y_train,Y_test):
    model.fit(X_train,Y_train)
    return model.score(X_test,Y_test)

In [17]:
get_score(LogisticRegression(),X_train,X_test,Y_train,Y_test)

0.9629629629629629


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [26]:
from sklearn.model_selection import StratifiedKFold
folds = StratifiedKFold(n_splits=3)

scores_logistic = []
scores_svm = []
scores_rf = []

for train_index, test_index in folds.split(digits.data,digits.target):
    X_train, X_test, y_train, y_test = digits.data[train_index], digits.data[test_index], digits.target[train_index], digits.target[test_index]
    scores_logistic.append(get_score(LogisticRegression(solver='liblinear',multi_class='ovr'), X_train, X_test, y_train, y_test))
    scores_svm.append(get_score(SVC(gamma='auto'), X_train, X_test, y_train, y_test))
    scores_rf.append(get_score(RandomForestClassifier(n_estimators=40), X_train, X_test, y_train, y_test))

0.8948247078464107
0.3806343906510851
0.9282136894824707
0.9532554257095158
0.41068447412353926
0.9515859766277128
0.9098497495826378
0.5125208681135225
0.9282136894824707


### **Above work can be done by using sklearn.model_selection.cross_val_score**

In [27]:
from sklearn.model_selection import cross_val_score

In [28]:
cross_val_score(LogisticRegression(solver='liblinear',multi_class='ovr'), digits.data, digits.target,cv=3)

array([0.89482471, 0.95325543, 0.90984975])

In [29]:
cross_val_score(SVC(gamma='auto'), digits.data, digits.target,cv=3)

array([0.38063439, 0.41068447, 0.51252087])

In [30]:
cross_val_score(RandomForestClassifier(n_estimators=40),digits.data, digits.target,cv=3)

array([0.91986644, 0.95826377, 0.92821369])

### **Parameter tunning using k fold cross validation**

In [31]:
scores1 = cross_val_score(RandomForestClassifier(n_estimators=5),digits.data, digits.target, cv=10)
np.average(scores1)

0.8842613283674737

In [32]:
scores2 = cross_val_score(RandomForestClassifier(n_estimators=20),digits.data, digits.target, cv=10)
np.average(scores2)

0.9343358162631905

In [33]:
scores3 = cross_val_score(RandomForestClassifier(n_estimators=30),digits.data, digits.target, cv=10)
np.average(scores3)

0.9493420235878336

In [34]:
scores4 = cross_val_score(RandomForestClassifier(n_estimators=40),digits.data, digits.target, cv=10)
np.average(scores4)

0.9460211049037864

# **Exercise**
Use iris flower dataset from sklearn library and use cross_val_score against following models to measure the performance of each. In the end figure out the model with best performance,

1. Logistic Regression
2. SVM
3. Decision Tree
4. Random Forest

In [2]:
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
iris = load_iris()

**Logistic Regression**

In [14]:
lr = cross_val_score(LogisticRegression(solver='liblinear',multi_class='ovr'), iris.data, iris.target,cv=3)
lr

array([0.96, 0.96, 0.94])

In [15]:
np.average(lr)

0.9533333333333333

**SupportVectorMachine(SVM)**

In [16]:
svm = cross_val_score(SVC(gamma='auto'), iris.data, iris.target,cv=3)
svm

array([0.98, 0.98, 0.96])

In [17]:
np.average(svm)

0.9733333333333333

**RandomForest**

In [18]:
rf = cross_val_score(RandomForestClassifier(n_estimators=40),iris.data, iris.target,cv=3)
rf

array([0.98, 0.94, 0.94])

In [19]:
np.average(rf)

0.9533333333333333

**DecisionTree**

In [20]:
dt = cross_val_score(DecisionTreeClassifier(), iris.data, iris.target,cv=3)
dt

array([0.98, 0.92, 0.98])

In [21]:
np.average(dt)

0.96

### **Overall, SupportVectorMachine (SVM) Model provides best performance : 0.9733333**