## Techniques how to choose best model for our prediction
## also which parameer to use

In [5]:
import pandas as pd
from sklearn.datasets import load_iris
iris = load_iris()

In [11]:
dir(iris)

['DESCR',
 'data',
 'data_module',
 'feature_names',
 'filename',
 'frame',
 'target',
 'target_names']

In [15]:
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df["target"] = iris.target
df['target_names'] = df.target.apply(lambda x: iris.target_names[x])
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target,target_names
0,5.1,3.5,1.4,0.2,0,setosa
1,4.9,3.0,1.4,0.2,0,setosa
2,4.7,3.2,1.3,0.2,0,setosa
3,4.6,3.1,1.5,0.2,0,setosa
4,5.0,3.6,1.4,0.2,0,setosa


Traditional way of splitting dataset

In [24]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.2)

In [26]:
from sklearn.svm import SVC

model = SVC()
model.fit(X_train, y_train)

In [28]:
model.score(X_test, y_test)

1.0

*Using this it keep changing score of model
we can't rely on this method*

### To tackle this problem we use K-fold technique

In [32]:
from sklearn.model_selection import cross_val_score

In [53]:
cross_val_score(SVC(kernel='linear', C=10, gamma='auto'), iris.data, iris.target, cv=5)

array([1.        , 1.        , 0.9       , 0.96666667, 1.        ])

In [55]:
cross_val_score(SVC(kernel='poly', C=20, gamma='auto'), iris.data, iris.target, cv=5)

array([0.96666667, 0.96666667, 0.9       , 0.93333333, 1.        ])

In [57]:
cross_val_score(SVC(kernel='rbf', C=30, gamma='auto'), iris.data, iris.target, cv=5)

array([0.96666667, 1.        , 0.9       , 0.93333333, 1.        ])

This method is also manual.
We can use for loop

In [60]:
import numpy as np

In [64]:
kernel = ['linear', 'poly', 'rbf']
C = [10, 20, 30, 40]
scores = {}
for kval in kernel:
    for cval in C:
        score = cross_val_score(SVC(kernel=kval, C=cval, gamma='auto'), iris.data, iris.target, cv=5)
        scores[f"score_{kval}_{cval}"]= np.average(score)

scores

{'score_linear_10': 0.9733333333333334,
 'score_linear_20': 0.9666666666666666,
 'score_linear_30': 0.96,
 'score_linear_40': 0.96,
 'score_poly_10': 0.9666666666666666,
 'score_poly_20': 0.9533333333333334,
 'score_poly_30': 0.9533333333333334,
 'score_poly_40': 0.9533333333333334,
 'score_rbf_10': 0.9800000000000001,
 'score_rbf_20': 0.9666666666666668,
 'score_rbf_30': 0.96,
 'score_rbf_40': 0.96}

These technique also have some issues like if we have more param then we have to run more for loop

## Use of grid search CV

In [75]:
from sklearn.model_selection import GridSearchCV
clf = GridSearchCV(SVC(), {
    'C': [10, 20, 30, 40],
    'kernel': ['linear', 'poly', 'rbf']
}, cv=5)

In [77]:
clf.fit(iris.data, iris.target)

In [81]:
clf.best_params_

{'C': 10, 'kernel': 'rbf'}

In [83]:
clf.best_score_

0.9800000000000001

In [85]:
results = pd.DataFrame(clf.cv_results_)

In [87]:
results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.004268,0.007198,0.001416,0.001353,10,linear,"{'C': 10, 'kernel': 'linear'}",1.0,1.0,0.9,0.966667,1.0,0.973333,0.038873,3
1,0.013491,0.015944,0.001613,0.001224,10,poly,"{'C': 10, 'kernel': 'poly'}",1.0,1.0,0.9,0.933333,1.0,0.966667,0.042164,6
2,0.007195,0.010808,0.001872,0.002839,10,rbf,"{'C': 10, 'kernel': 'rbf'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
3,0.002505,0.005011,0.0,0.0,20,linear,"{'C': 20, 'kernel': 'linear'}",1.0,1.0,0.9,0.933333,1.0,0.966667,0.042164,6
4,0.005174,0.006706,0.000813,0.001625,20,poly,"{'C': 20, 'kernel': 'poly'}",1.0,1.0,0.9,0.933333,1.0,0.966667,0.042164,6
5,0.001109,0.002219,0.0,0.0,20,rbf,"{'C': 20, 'kernel': 'rbf'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
6,0.004207,0.006188,0.0,0.0,30,linear,"{'C': 30, 'kernel': 'linear'}",1.0,1.0,0.9,0.9,1.0,0.96,0.04899,11
7,0.0,0.0,0.0,0.0,30,poly,"{'C': 30, 'kernel': 'poly'}",1.0,1.0,0.9,0.933333,1.0,0.966667,0.042164,6
8,0.001005,1.4e-05,0.000802,0.000401,30,rbf,"{'C': 30, 'kernel': 'rbf'}",0.966667,1.0,0.933333,0.966667,1.0,0.973333,0.024944,3
9,0.000789,0.000395,0.00042,0.000516,40,linear,"{'C': 40, 'kernel': 'linear'}",1.0,1.0,0.9,0.9,1.0,0.96,0.04899,11


In [89]:
results[['param_C', 'param_kernel', 'mean_test_score', 'rank_test_score']]

Unnamed: 0,param_C,param_kernel,mean_test_score,rank_test_score
0,10,linear,0.973333,3
1,10,poly,0.966667,6
2,10,rbf,0.98,1
3,20,linear,0.966667,6
4,20,poly,0.966667,6
5,20,rbf,0.98,1
6,30,linear,0.96,11
7,30,poly,0.966667,6
8,30,rbf,0.973333,3
9,40,linear,0.96,11


All these are for best params .
# Now for choosing best model we do hyper tunning

In [92]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [103]:
model_params = {
    'SVC': {
        'model': SVC(),
        'para': {
            'kernel': ['linear', 'poly', 'rbf'],
            'C':  [10, 20, 30, 40]
        }
    },
    'RandomForestClassifier': {
        'model': RandomForestClassifier(),
        'para':{
            "n_estimators": [50, 100],
            "max_depth": [None, 5, 10]
        }
    },
    'LogisticRegression' :{
        'model': LogisticRegression(max_iter=1000),
        'para': {
            "C": [0.1, 1, 10],
            "solver": ["liblinear", "lbfgs"]
        }
    }
}

In [105]:
from sklearn.model_selection import GridSearchCV
score = []
for model, mp in model_params.items():
    clf = GridSearchCV(mp['model'], mp['para'], cv=5)
    clf.fit(iris.data, iris.target)
    score.append({
    "model": model,
    "best_para" : clf.best_params_,
    "best_res": clf.best_score_
    })

In [107]:
best_model = pd.DataFrame(score , columns=['models', 'para', 'score'])

In [109]:
best_model

Unnamed: 0,models,para,score
0,,,
1,,,
2,,,


In [111]:
score

[{'model': 'SVC',
  'best_para': {'C': 10, 'kernel': 'rbf'},
  'best_res': 0.9800000000000001},
 {'model': 'RandomForestClassifier',
  'best_para': {'max_depth': 5, 'n_estimators': 50},
  'best_res': 0.9666666666666668},
 {'model': 'LogisticRegression',
  'best_para': {'C': 1, 'solver': 'lbfgs'},
  'best_res': 0.9733333333333334}]