In [19]:
import pandas as pd
from sklearn import svm
from sklearn.datasets import load_iris

In [2]:
iris = load_iris()

In [3]:
dir(iris)

['DESCR',
 'data',
 'data_module',
 'feature_names',
 'filename',
 'frame',
 'target',
 'target_names']

In [8]:
iris.feature_names

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

In [9]:
iris.target_names

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

In [6]:
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [14]:
df['flower'] = iris.target
df['flower'] = df['flower'].apply(lambda x : iris.target_names[x])
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),flower
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


# Approach 1: Use train_test_split and manually tune parameters by trial and error

In [15]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.3)

In [20]:
model = SVC()
model.fit(X_train, y_train)
model.score(X_test,y_test)

0.9555555555555556

# Approach 2: Use K Fold Cross validation

In [17]:
from sklearn.model_selection import cross_val_score

In [22]:
cross_val_score(svm.SVC(kernel='linear',C=10,gamma='auto'), iris.data, iris.target, cv=5)

array([1.        , 1.        , 0.9       , 0.96666667, 1.        ])

In [23]:
cross_val_score(svm.SVC(kernel='rbf',C=10,gamma='auto'), iris.data, iris.target, cv=5)

array([0.96666667, 1.        , 0.96666667, 0.96666667, 1.        ])

In [24]:
cross_val_score(svm.SVC(kernel='rbf',C=20,gamma='auto'), iris.data, iris.target, cv=5)

array([0.96666667, 1.        , 0.9       , 0.96666667, 1.        ])

In [27]:
import numpy as np

In [30]:
kernels = ['linear','rbf']
C = [1, 10, 20]
avg_scores = {}
for kval in kernels:
    for cval in C:
        cv_scores = cross_val_score(svm.SVC(kernel=kval, C=cval, gamma='auto'), iris.data, iris.target)
        avg_scores[kval+ '_' +str(cval)] = np.average(cv_scores)
avg_scores

{'linear_1': 0.9800000000000001,
 'linear_10': 0.9733333333333334,
 'linear_20': 0.9666666666666666,
 'rbf_1': 0.9800000000000001,
 'rbf_10': 0.9800000000000001,
 'rbf_20': 0.9666666666666668}

# Approach 3: Use GridSearchCV

In [31]:
features = {'kernel': ('linear','rbf'),
           'C': [1, 10, 20]}

In [32]:
from sklearn.model_selection import GridSearchCV
clf = GridSearchCV(svm.SVC(gamma='auto'), features, cv=5, return_train_score=False)
clf.fit(iris.data, iris.target)
clf.cv_results_

{'mean_fit_time': array([0.00214047, 0.00079918, 0.00057878, 0.00060692, 0.00038018,
        0.00050654]),
 'std_fit_time': array([0.00191322, 0.00039966, 0.00038409, 0.0008124 , 0.00046651,
        0.00045218]),
 'mean_score_time': array([0.00139632, 0.00040174, 0.00032701, 0.00040112, 0.        ,
        0.00074735]),
 'std_score_time': array([0.00207055, 0.00049203, 0.00041696, 0.00049127, 0.        ,
        0.00063753]),
 'param_C': masked_array(data=[1, 1, 10, 10, 20, 20],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_kernel': masked_array(data=['linear', 'rbf', 'linear', 'rbf', 'linear', 'rbf'],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'C': 1, 'kernel': 'linear'},
  {'C': 1, 'kernel': 'rbf'},
  {'C': 10, 'kernel': 'linear'},
  {'C': 10, 'kernel': 'rbf'},
  {'C': 20, 'kernel': 'linear'},
  {'C': 20, 'kernel': 'rbf'}],


In [34]:
df1 = pd.DataFrame(clf.cv_results_)
df1

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.00214,0.001913,0.001396,0.002071,1,linear,"{'C': 1, 'kernel': 'linear'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
1,0.000799,0.0004,0.000402,0.000492,1,rbf,"{'C': 1, 'kernel': 'rbf'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
2,0.000579,0.000384,0.000327,0.000417,10,linear,"{'C': 10, 'kernel': 'linear'}",1.0,1.0,0.9,0.966667,1.0,0.973333,0.038873,4
3,0.000607,0.000812,0.000401,0.000491,10,rbf,"{'C': 10, 'kernel': 'rbf'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
4,0.00038,0.000467,0.0,0.0,20,linear,"{'C': 20, 'kernel': 'linear'}",1.0,1.0,0.9,0.933333,1.0,0.966667,0.042164,6
5,0.000507,0.000452,0.000747,0.000638,20,rbf,"{'C': 20, 'kernel': 'rbf'}",0.966667,1.0,0.9,0.966667,1.0,0.966667,0.036515,5


In [37]:
df1 = df1[['param_C','param_kernel','mean_test_score']]
df1

Unnamed: 0,param_C,param_kernel,mean_test_score
0,1,linear,0.98
1,1,rbf,0.98
2,10,linear,0.973333
3,10,rbf,0.98
4,20,linear,0.966667
5,20,rbf,0.966667


In [38]:
clf.best_params_

{'C': 1, 'kernel': 'linear'}

In [39]:
clf.best_score_

0.9800000000000001

In [40]:
dir(clf)

['__abstractmethods__',
 '__annotations__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__sklearn_clone__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_build_request_for_signature',
 '_check_feature_names',
 '_check_n_features',
 '_check_refit_for_multimetric',
 '_doc_link_module',
 '_doc_link_template',
 '_doc_link_url_param_generator',
 '_estimator_type',
 '_format_results',
 '_get_default_requests',
 '_get_doc_link',
 '_get_metadata_request',
 '_get_param_names',
 '_get_routed_params_for_fit',
 '_get_scorers',
 '_get_tags',
 '_more_tags',
 '_parameter_constraints',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_required_parameters',
 '_run

#### Use RandomizedSearchCV to reduce number of iterations and with random combination of parameters. This is useful when you have too many parameters to try and your training time is longer. It helps reduce the cost of computation

In [46]:
from sklearn.model_selection import RandomizedSearchCV
rs = RandomizedSearchCV(svm.SVC(gamma='auto'), features, cv=5, n_iter=3, return_train_score=False)
rs.fit(iris.data, iris.target)
df2 = pd.DataFrame(rs.cv_results_)[['param_C','param_kernel','mean_test_score']]
df2

Unnamed: 0,param_C,param_kernel,mean_test_score
0,10,rbf,0.98
1,20,rbf,0.966667
2,10,linear,0.973333


### How about different models with different hyperparameters?

In [47]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [71]:
model_paras = {
    'svm' : {
        'model' : SVC(gamma='auto'),
        'paras' : {
            'kernel' : ('linear', 'rbf'),
            'C' : [1, 10, 20]
        }
    },
    'random_forest' : {
        'model' : RandomForestClassifier(),
        'paras' : {
            'n_estimators' : [1, 5, 10]
        }
    },
    'logistic_regression' : {
        'model' : LogisticRegression(n_jobs=4),
        'paras' : {
            'C' : [1, 5, 10]
        }
    }
}

In [72]:
scores  = []

for a,b in model_paras.items():
    clf = GridSearchCV(b['model'], b['paras'], cv=5,n_jobs=4, return_train_score=False)
    clf.fit(iris.data, iris.target)
clf.best_score_

0.9800000000000001

In [73]:
scores  = []

for a,b in model_paras.items():
    clf = GridSearchCV(b['model'], b['paras'], cv=5, n_jobs=4,  return_train_score=False)
    clf.fit(iris.data, iris.target)
    scores.append({
        'model' : a,
        'best_scores' : clf.best_score_,
        'best_paras' : clf.best_params_
    })
    
df3 = pd.DataFrame(scores,columns=['model','best_score','best_params'])
df3

Unnamed: 0,model,best_score,best_params
0,svm,,
1,random_forest,,
2,logistic_regression,,


In [None]:
scores = []

for model_name, mp in model_paras.items():
    clf =  GridSearchCV(mp['model'], mp['paras'], cv=5, return_train_score=False)
    clf.fit(iris.data, iris.target)
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_paras': clf.best_params_
    })
    
df = pd.DataFrame(scores,columns=['model','best_score','best_paras'])
df