# K Fold Cross validation

## 1. Import Usefull Libraries

In [49]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
%matplotlib inline

## 2. Import all the models

In [50]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

## 3. Load Data

In [51]:
from sklearn.datasets import load_digits
digits = load_digits()

In [52]:
dir(digits)

['DESCR', 'data', 'feature_names', 'frame', 'images', 'target', 'target_names']

## 4. Train Test Splitting of datasets

In [53]:
from sklearn.model_selection import train_test_split
x_train , x_test , y_train , y_test = train_test_split(digits.data , digits.target , test_size=0.3)

## 5. Fit data in all model and compare score

In [54]:
def get_score(model , x_train , x_test , y_train , y_test):
    model.fit(x_train , y_train)
    return model.score(x_test , y_test)

In [55]:
get_score(LogisticRegression() , x_train , x_test , y_train , y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.9666666666666667

In [56]:
get_score(SVC() , x_train , x_test , y_train , y_test)

0.9833333333333333

In [57]:
get_score(RandomForestClassifier() , x_train , x_test , y_train , y_test)

0.9740740740740741

# To compare these models , we use Cross Validation Technique here

In [58]:
from sklearn.model_selection import cross_val_score

In [59]:
cross_val_score(LogisticRegression() , digits.data , digits.target)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

array([0.92222222, 0.86944444, 0.94150418, 0.93871866, 0.89693593])

In [60]:
cross_val_score(SVC() , digits.data , digits.target)

array([0.96111111, 0.94444444, 0.98328691, 0.98885794, 0.93871866])

In [61]:
cross_val_score(RandomForestClassifier(n_estimators=40) , digits.data , digits.target)

array([0.93055556, 0.87777778, 0.95264624, 0.9637883 , 0.92200557])

# To check our model in all the parameter in loop ,  we use RandomizedSearchCV or GridSearchCV

In [67]:
from sklearn.model_selection import RandomizedSearchCV ## or GridSearchCV

clf = RandomizedSearchCV(SVC(gamma='auto'),{
    'C':[1,10,20],
    'kernel':['rbf','linear']
    },
    cv=5,
    return_train_score=False,
    n_iter = 5)

clf.fit(digits.data , digits.target)
df1 = pd.DataFrame(clf.cv_results_)
df1[['param_C','param_kernel','mean_test_score']]

Unnamed: 0,param_C,param_kernel,mean_test_score
0,20,rbf,0.476366
1,10,linear,0.947697
2,20,linear,0.947697
3,10,rbf,0.476366
4,1,rbf,0.448545


# Hyper Paramater Tuning

In [63]:
model_parameter = {
    'svm': {
        'model' : SVC(gamma='auto'),
        'params': {'C':[1,10,20] , 'kernel':['rbf','linear']}
    },
    'random_forest':{
        'model': RandomForestClassifier(),
        'params': {'n_estimators' : [1,5,10]}
    },
    'logistic_regression':{
        'model': LogisticRegression(solver='liblinear',multi_class='auto'),
        'params': {'C':[1,5,10]}
    }

}

In [66]:
from sklearn.model_selection import GridSearchCV
scores = []

for model_name , mp in model_parameter.items():
    clf = GridSearchCV(
        mp['model'],mp['params'],
        cv=5,
        return_train_score=False)
    clf.fit(digits.data , digits.target)
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_paramater': clf.best_params_
    })

In [68]:
df = pd.DataFrame(scores,columns=['model','best_score','best_paramater'])
df

Unnamed: 0,model,best_score,best_paramater
0,svm,0.947697,"{'C': 1, 'kernel': 'linear'}"
1,random_forest,0.900424,{'n_estimators': 10}
2,logistic_regression,0.922114,{'C': 1}
