In [2]:
# .........First we compare hyperparameters for different models like svm,random forest etc......
# 1.SVM
# https://towardsdatascience.com/hyperparameter-tuning-for-support-vector-machines-c-and-gamma-parameters-6a5097416167
# Refer above link for SVM hyperparameters ☝
import pandas as pd
from sklearn.datasets import load_iris 
iris = load_iris()

In [3]:
df = pd.DataFrame(iris.data,columns = [iris.feature_names])

In [4]:
df1 = pd.DataFrame(iris.target,columns = ['flower'])

In [5]:
data = pd.concat([df,df1],axis= 'columns')

In [6]:
X = data.drop(columns = ['flower'])
y = data.flower

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV,train_test_split,cross_val_score


In [8]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2)

In [9]:
model = SVC(gamma= 'auto',C = 30 ,kernel='rbf')
model.fit(X_train,y_train)
model.score(X_test,y_test)

0.9333333333333333

In [10]:
cross_val_score(SVC(gamma= 'auto',C = 10 ,kernel='rbf'),X,y,cv = 5)

array([0.96666667, 1.        , 0.96666667, 0.96666667, 1.        ])

In [11]:
cross_val_score(SVC(gamma= 'auto',C = 20 ,kernel='rbf'),X,y,cv = 5)

array([0.96666667, 1.        , 0.9       , 0.96666667, 1.        ])

In [12]:
cross_val_score(SVC(gamma= 'auto',C = 30 ,kernel='rbf'),X,y,cv = 5)

array([0.96666667, 1.        , 0.9       , 0.93333333, 1.        ])

In [13]:
# To compare the hyperparameters in classifier we can write this hard code
import numpy as np
kernels = ['rbf','linear']
c  = [1,10,20]
for kval in kernels:
    for cval in c:
        score = cross_val_score(SVC(gamma= 'auto',C = cval ,kernel = kval),X,y,cv = 5)
        avg_score = print(' kernal : {} ,C : {},score {} '.format(kval , str(cval),np.average(score)))
avg_score       
# avg_score gives avg accuracy when we use respective kernel and C value
# from following result (rbf,1); (rbf,10); (linear,1) this combinations having more accuracy

 kernal : rbf ,C : 1,score 0.9800000000000001 
 kernal : rbf ,C : 10,score 0.9800000000000001 
 kernal : rbf ,C : 20,score 0.9666666666666668 
 kernal : linear ,C : 1,score 0.9800000000000001 
 kernal : linear ,C : 10,score 0.9733333333333334 
 kernal : linear ,C : 20,score 0.9666666666666666 


In [14]:
# for alternate to this hard code sklearn has one search method calles GridSearchCV(model,param_grid,cv)
clf = GridSearchCV(SVC(gamma='auto'),{'C':[1,10,20],'kernel':['rbf','linear']},cv = 5,return_train_score=False)
clf.fit(X,y)
clf.cv_results_

{'mean_fit_time': array([0.00240769, 0.00118723, 0.00139594, 0.00100441, 0.0009973 ,
        0.0015965 ]),
 'std_fit_time': array([1.01767048e-03, 4.04373812e-04, 4.91566220e-04, 1.93206744e-05,
        6.88365266e-06, 5.00075417e-04]),
 'mean_score_time': array([0.00135832, 0.00101357, 0.00080733, 0.00099931, 0.00100894,
        0.00080099]),
 'std_score_time': array([4.91771328e-04, 1.33849473e-05, 4.03670224e-04, 1.77692073e-05,
        9.23194175e-06, 4.00523730e-04]),
 'param_C': masked_array(data=[1, 1, 10, 10, 20, 20],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_kernel': masked_array(data=['rbf', 'linear', 'rbf', 'linear', 'rbf', 'linear'],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'C': 1, 'kernel': 'rbf'},
  {'C': 1, 'kernel': 'linear'},
  {'C': 10, 'kernel': 'rbf'},
  {'C': 10, 'kernel': 'linear'},
  {'C': 20, 'ker

In [15]:
# Now we convert this messy data into dataframe
hypm_svm = pd.DataFrame(clf.cv_results_)
hypm_svm

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.002408,0.001018,0.001358,0.000492,1,rbf,"{'C': 1, 'kernel': 'rbf'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
1,0.001187,0.000404,0.001014,1.3e-05,1,linear,"{'C': 1, 'kernel': 'linear'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
2,0.001396,0.000492,0.000807,0.000404,10,rbf,"{'C': 10, 'kernel': 'rbf'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
3,0.001004,1.9e-05,0.000999,1.8e-05,10,linear,"{'C': 10, 'kernel': 'linear'}",1.0,1.0,0.9,0.966667,1.0,0.973333,0.038873,4
4,0.000997,7e-06,0.001009,9e-06,20,rbf,"{'C': 20, 'kernel': 'rbf'}",0.966667,1.0,0.9,0.966667,1.0,0.966667,0.036515,5
5,0.001596,0.0005,0.000801,0.000401,20,linear,"{'C': 20, 'kernel': 'linear'}",1.0,1.0,0.9,0.933333,1.0,0.966667,0.042164,6


In [16]:
clf.best_params_

{'C': 1, 'kernel': 'rbf'}

In [17]:
# .........Random Forest.......
# https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74
# Refer above link for hyperparameters ☝
from sklearn.ensemble import RandomForestClassifier
# first lets write hard code
 
estimator = [1,10,15,20,30,40]
for estval in estimator:
    clf = RandomForestClassifier(n_estimators = estval)
    clf.fit(X_train,y_train)
    score = clf.score(X_test,y_test)
    print('Accuracy for estimator {} is {}'.format(estval,score))
    
    
 

Accuracy for estimator 1 is 0.9333333333333333
Accuracy for estimator 10 is 0.9333333333333333
Accuracy for estimator 15 is 0.9333333333333333
Accuracy for estimator 20 is 0.9333333333333333
Accuracy for estimator 30 is 0.9333333333333333
Accuracy for estimator 40 is 0.9333333333333333


In [18]:
# Now using GridSearchCV
clf1 = GridSearchCV(RandomForestClassifier(),{'n_estimators':[1,10,15,20,30,40]},cv = 5,return_train_score=False)
clf1.fit(X_train,y_train)
clf1.cv_results_

{'mean_fit_time': array([0.00319991, 0.0105011 , 0.01354074, 0.01659031, 0.02341242,
        0.03179507]),
 'std_fit_time': array([0.001166  , 0.00210784, 0.0012632 , 0.00136128, 0.00139961,
        0.00072563]),
 'mean_score_time': array([0.00179987, 0.00220809, 0.00200777, 0.00261369, 0.00301127,
        0.00321255]),
 'std_score_time': array([7.47831202e-04, 4.17491576e-04, 6.41947599e-04, 5.00544816e-04,
        1.61149797e-05, 4.18270281e-04]),
 'param_n_estimators': masked_array(data=[1, 10, 15, 20, 30, 40],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'n_estimators': 1},
  {'n_estimators': 10},
  {'n_estimators': 15},
  {'n_estimators': 20},
  {'n_estimators': 30},
  {'n_estimators': 40}],
 'split0_test_score': array([0.95833333, 0.95833333, 0.95833333, 0.95833333, 0.95833333,
        0.95833333]),
 'split1_test_score': array([0.75      , 0.95833333, 1.        , 0.95833333, 0.95833333,
        0.9

In [19]:
hypm_rf = pd.DataFrame(clf1.cv_results_)
hypm_rf

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.0032,0.001166,0.0018,0.000748,1,{'n_estimators': 1},0.958333,0.75,0.875,1.0,1.0,0.916667,0.095015,6
1,0.010501,0.002108,0.002208,0.000417,10,{'n_estimators': 10},0.958333,0.958333,0.875,0.958333,1.0,0.95,0.040825,5
2,0.013541,0.001263,0.002008,0.000642,15,{'n_estimators': 15},0.958333,1.0,0.916667,0.958333,1.0,0.966667,0.03118,1
3,0.01659,0.001361,0.002614,0.000501,20,{'n_estimators': 20},0.958333,0.958333,0.916667,0.958333,1.0,0.958333,0.026352,2
4,0.023412,0.0014,0.003011,1.6e-05,30,{'n_estimators': 30},0.958333,0.958333,0.916667,0.958333,1.0,0.958333,0.026352,2
5,0.031795,0.000726,0.003213,0.000418,40,{'n_estimators': 40},0.958333,0.958333,0.916667,0.958333,1.0,0.958333,0.026352,2


In [20]:
clf1.best_params_

{'n_estimators': 15}

In [21]:
clf1.best_score_

0.9666666666666668

In [22]:
# Randomized cv randomly iterate any combinations 
from sklearn.model_selection import RandomizedSearchCV
clf2 = RandomizedSearchCV(RandomForestClassifier(),{'n_estimators':[1,10,15,20,30,40]},n_iter=3,cv = 5,return_train_score=False)
clf2.fit(X_train,y_train)
clf2.cv_results_

{'mean_fit_time': array([0.00340881, 0.03328481, 0.01253719]),
 'std_fit_time': array([0.00121175, 0.0015843 , 0.00071795]),
 'mean_score_time': array([0.00159316, 0.00377698, 0.00260439]),
 'std_score_time': array([0.00048435, 0.0011541 , 0.00049993]),
 'param_n_estimators': masked_array(data=[1, 40, 15],
              mask=[False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'n_estimators': 1}, {'n_estimators': 40}, {'n_estimators': 15}],
 'split0_test_score': array([0.95833333, 0.95833333, 0.95833333]),
 'split1_test_score': array([0.95833333, 0.95833333, 0.95833333]),
 'split2_test_score': array([1.        , 0.91666667, 0.91666667]),
 'split3_test_score': array([0.91666667, 0.95833333, 0.95833333]),
 'split4_test_score': array([1., 1., 1.]),
 'mean_test_score': array([0.96666667, 0.95833333, 0.95833333]),
 'std_test_score': array([0.03118048, 0.02635231, 0.02635231]),
 'rank_test_score': array([1, 2, 2])}

In [23]:
hypm_rf2 = pd.DataFrame(clf2.cv_results_)
hypm_rf2

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.003409,0.001212,0.001593,0.000484,1,{'n_estimators': 1},0.958333,0.958333,1.0,0.916667,1.0,0.966667,0.03118,1
1,0.033285,0.001584,0.003777,0.001154,40,{'n_estimators': 40},0.958333,0.958333,0.916667,0.958333,1.0,0.958333,0.026352,2
2,0.012537,0.000718,0.002604,0.0005,15,{'n_estimators': 15},0.958333,0.958333,0.916667,0.958333,1.0,0.958333,0.026352,2


In [24]:
# Now we compare diff classifiers 

In [25]:
model_params = {
    'svm':{
        'clf':SVC(gamma='auto'),
        'param':{
            'C':[1,5,10,15],
            'kernel':['linear','rbf']
        }
    },
    
    'Logistic_regression':{
        'clf':LogisticRegression(),
        'param':{
            'C':[1,5,10,15]
        }
    },
    
    'Random_forest':{
        'clf':RandomForestClassifier(),
        'param':{
            'n_estimators':[1,5,10,15]
        }
    }
    
}

In [26]:
score = []

for model_name,mp in model_params.items():
    model = GridSearchCV(mp['clf'],mp['param'],cv=5,return_train_score=False)
    model.fit(X,y)
    score.append({
        'model':model_name,
        'best_score':model.best_score_,
        'best_param':model.best_params_
    })
    

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [27]:
df3 = pd.DataFrame(score)
df3

Unnamed: 0,model,best_score,best_param
0,svm,0.98,"{'C': 1, 'kernel': 'linear'}"
1,Logistic_regression,0.98,{'C': 10}
2,Random_forest,0.966667,{'n_estimators': 5}


In [28]:
score

[{'model': 'svm',
  'best_score': 0.9800000000000001,
  'best_param': {'C': 1, 'kernel': 'linear'}},
 {'model': 'Logistic_regression',
  'best_score': 0.9800000000000001,
  'best_param': {'C': 10}},
 {'model': 'Random_forest',
  'best_score': 0.9666666666666668,
  'best_param': {'n_estimators': 5}}]