In [1]:
# Look up Navie Bayes Classifier classification algorithm 
# R- LR/PR, LR with LSE tech , LR with GD tech , l1 and l2 regularization tech
# C- LR,DT,RF,KNN,SVM and various others. 

# Goal - Finding the best model for a problem dataset and performing hyper parameter tuning using GridSearchCV / RandomSearchCV. 

from sklearn import svm,datasets
import pandas as pd
iris=datasets.load_iris()
df=pd.DataFrame(iris.data,columns=iris.feature_names)
df['flower']=iris.target
df['flower']=df['flower'].apply(lambda x:iris.target_names[x])

In [2]:
# df

In [3]:
# Approach 1 - Split to train and test and manually tune the parameters by trial and error

from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(iris.data,iris.target,test_size=0.3)

In [4]:
model=svm.SVC(kernel='rbf',C=3,gamma='auto')
model.fit(x_train,y_train)
model.score(x_test,y_test)

0.9555555555555556

In [5]:
# Approach 2 : Using kFold cross validation technique 

from sklearn.model_selection import cross_val_score

In [6]:
cross_val_score(svm.SVC(kernel='linear',C=10,gamma='auto'),iris.data,iris.target,cv=5)

array([1.        , 1.        , 0.9       , 0.96666667, 1.        ])

In [7]:
cross_val_score(svm.SVC(kernel='rbf',C=10,gamma='auto'),iris.data,iris.target,cv=5)

array([0.96666667, 1.        , 0.96666667, 0.96666667, 1.        ])

In [8]:
cross_val_score(svm.SVC(kernel='rbf',C=20,gamma='auto'),iris.data,iris.target,cv=5)

array([0.96666667, 1.        , 0.9       , 0.96666667, 1.        ])

In [9]:
# Above approach is tiresome and very manual. We can use a for loop as an alternative 

import numpy as np
kernels=['rbf','linear'] # kernels 
C=[1,10,30] # regularization 
avg_score={} # empty dictionary 
for kval in kernels:
    for cval in C:
        cv_scores=cross_val_score(svm.SVC(kernel=kval,C=cval,gamma='auto'),iris.data,iris.target,cv=5)
        avg_score[kval+'_'+str(cval)]=np.average(cv_scores)
        
print(avg_score)

# from the above results we can say that rbf with C=1 or 10 or linear with C=1 will give best performance.

{'rbf_1': 0.9800000000000001, 'rbf_10': 0.9800000000000001, 'rbf_30': 0.96, 'linear_1': 0.9800000000000001, 'linear_10': 0.9733333333333334, 'linear_30': 0.96}


In [10]:
# Approach 3: Use GridSearchCV

# GridSearchCV does exactly same thing as for loop above but it is in a single line of code 
from sklearn.model_selection import GridSearchCV

gs=GridSearchCV(svm.SVC(),{
    'C':[1,10,30,50],
    'kernel':['rbf','linear','sigmoid','poly'],
    'gamma':['scale','auto'] 
},cv=5)
gs.fit(iris.data,iris.target)
# gs.cv_results_

In [11]:
print(dir(gs))

['__abstractmethods__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_abc_impl', '_check_feature_names', '_check_n_features', '_check_refit_for_multimetric', '_estimator_type', '_format_results', '_get_param_names', '_get_tags', '_more_tags', '_repr_html_', '_repr_html_inner', '_repr_mimebundle_', '_required_parameters', '_run_search', '_select_best_index', '_validate_data', 'best_estimator_', 'best_index_', 'best_params_', 'best_score_', 'classes_', 'cv', 'cv_results_', 'decision_function', 'error_score', 'estimator', 'fit', 'get_params', 'inverse_transform', 'multimetric_', 'n_features_in_', 'n_jobs', 'n_splits_', 'param_grid', 'pre_dispatch', 'predict', 'pr

In [12]:
df=pd.DataFrame(gs.cv_results_)
df.columns

Index(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time',
       'param_C', 'param_gamma', 'param_kernel', 'params', 'split0_test_score',
       'split1_test_score', 'split2_test_score', 'split3_test_score',
       'split4_test_score', 'mean_test_score', 'std_test_score',
       'rank_test_score'],
      dtype='object')

In [13]:
df[['mean_fit_time','param_C','param_gamma','param_kernel','mean_test_score']]

Unnamed: 0,mean_fit_time,param_C,param_gamma,param_kernel,mean_test_score
0,0.00074,1,scale,rbf,0.966667
1,0.000611,1,scale,linear,0.98
2,0.001298,1,scale,sigmoid,0.066667
3,0.00094,1,scale,poly,0.98
4,0.000661,1,auto,rbf,0.98
5,0.000638,1,auto,linear,0.98
6,0.001259,1,auto,sigmoid,0.093333
7,0.001613,1,auto,poly,0.966667
8,0.000677,10,scale,rbf,0.98
9,0.000683,10,scale,linear,0.973333


In [14]:
# print(dir(gs))

In [15]:
gs.best_params_

{'C': 1, 'gamma': 'scale', 'kernel': 'linear'}

In [16]:
gs.best_score_ 

0.9800000000000001

In [17]:
# Use RandomSearchCV to reduce number of iterations and with random combinations of parameters.
# This is useful when you have too many parameters to try and your training time is longer. It helps reduce the cost of compution.

from sklearn.model_selection import RandomizedSearchCV
rs=RandomizedSearchCV(svm.SVC(),{
    'C':[1,10,30,50],
    'kernel':['rbf','linear','sigmoid','poly'],
    'gamma':['scale','auto']
},cv=5)
rs.fit(iris.data,iris.target)
pd.DataFrame(rs.cv_results_)[['param_C','param_kernel','param_gamma','mean_test_score']]

Unnamed: 0,param_C,param_kernel,param_gamma,mean_test_score
0,30,rbf,scale,0.973333
1,30,poly,auto,0.953333
2,50,linear,scale,0.966667
3,50,rbf,auto,0.96
4,10,sigmoid,auto,0.093333
5,50,sigmoid,auto,0.093333
6,1,rbf,auto,0.98
7,30,sigmoid,scale,0.04
8,50,sigmoid,scale,0.033333
9,50,poly,auto,0.96


In [18]:
rs.best_params_

{'kernel': 'rbf', 'gamma': 'auto', 'C': 1}

In [19]:
rs.best_score_

0.9800000000000001

In [20]:
# How about different models with different hyperparameters? 

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

model_params={
    'svm':{
        'model':svm.SVC(),
        'params':{
            'C':[1,10,20],
            'kernel':['rbf','linear','sigmoid'],
            
        }
    },
    'random_forest':{
        'model':RandomForestClassifier(),
        'params':{
            'n_estimators':[1,5,10]
        }
    },
    'logistic_regression':{
        'model':LogisticRegression(),
        'params':{
            'solver':['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
            'C':[1,5,10],
            'multi_class':['auto', 'ovr', 'multinomial']
            
        }
    }
    
}

In [21]:
import warnings
warnings.filterwarnings('ignore') # This will ignore all warnings in your Jupyter notebook (supress warnings for a clean notebook)

In [22]:
scores=[]

for model_name,m_p in model_params.items():
#     print(model_name,m_p)
    gs=GridSearchCV(m_p['model'],m_p['params'],cv=5)
    gs.fit(iris.data,iris.target)
    scores.append({
        'model':model_name,
        'best_score':gs.best_score_,
        'best_params':gs.best_params_
    })
df=pd.DataFrame(scores,columns=['model','best_score','best_params'])
df

Unnamed: 0,model,best_score,best_params
0,svm,0.98,"{'C': 1, 'kernel': 'linear'}"
1,random_forest,0.953333,{'n_estimators': 10}
2,logistic_regression,0.986667,"{'C': 1, 'multi_class': 'multinomial', 'solver..."


In [23]:
# Conclusion 

# Based on the above , I can conclude that Logistic regression with params C=1,multi_class=auto and solver=saga is the best model
# for solving my problem of iris flower classification. 

In [24]:
# Finding the best model and hyper parameters for sklearn digits dataset classification 

digits=datasets.load_digits()

In [25]:
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier


In [26]:
model_params={
    'svm':{
        'model':svm.SVC(),
        'params':{
            'C':[1,10,20],
            'kernel':['rbf','linear','sigmoid'],
            'gamma':['scale','auto']
            
        }
    },
    'random_forest':{
        'model':RandomForestClassifier(),
        'params':{
            'n_estimators':[1,5,10]
        }
    },
    'logistic_regression':{
        'model':LogisticRegression(),
        'params':{
            'solver':['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
            'C':[1,5,10],
            'multi_class':['auto', 'ovr', 'multinomial']
            
        }
    },
    
    'naive_bayes_gaussion':{
        'model':GaussianNB(),
        'params':{}
    },
    
    'naive_bayes_multinomial':{
        'model':MultinomialNB(),
        'params':{}
    },
    
    'decision tree':{
        'model':DecisionTreeClassifier(),
        'params':{
            'criterion':['entropy','gini']
        }
    }
    
}

In [27]:
from sklearn.model_selection import GridSearchCV
import pandas as pd 
scores=[]
for model_name,m_p in model_params.items():
#     print(model_name,m_p)
    gs=GridSearchCV(m_p['model'],m_p['params'],cv=5)
    gs.fit(digits.data,digits.target)
    scores.append({
        'model':model_name,
        'best_score':gs.best_score_,
        'best_params':gs.best_params_
    })
df=pd.DataFrame(scores,columns=['model','best_score','best_params'])
df

Unnamed: 0,model,best_score,best_params
0,svm,0.97385,"{'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}"
1,random_forest,0.890405,{'n_estimators': 10}
2,logistic_regression,0.923787,"{'C': 1, 'multi_class': 'ovr', 'solver': 'sag'}"
3,naive_bayes_gaussion,0.806928,{}
4,naive_bayes_multinomial,0.87035,{}
5,decision tree,0.809152,{'criterion': 'entropy'}


In [28]:
# Conclusion
# The clear winner of model selection is SVM (C=10,kernel='rbf',gamma='scale') for digits classification.

# Exercise 

# Pick a dataset of classification from kaggle.
# Perform hyper parameter tuning by applying various classification algorithms by using GridSearchCV/ RandomSearchCV
# Conclude which model gives you the best set of parameters with the accuracy score.


# Tomos disussion:

# L1 and L2 regularization ( Linear Regression problems )

# Unsupervised learning - K means clustering , Dimensionality reduction --> Principal component analyis. 