In [26]:
import numpy as np 
import pandas as pd 
from sklearn import datasets
from sklearn import tree
from sklearn.pipeline import make_pipeline
from sklearn import preprocessing
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score

In [27]:
#Loading data from sklearn package
iris = datasets.load_iris()

DECISION TREE

Compare the difference in performance using both the available criteria:gini and entropy

In [28]:
#creating table to store accuracy results based on criterion
TreeScoreTable = pd.DataFrame(columns=['Criterion', 'Accuracy'])
#criteria to be tested
TreeCrit = ['gini', 'entropy'] 
#fill column 'Criterion' 
TreeScoreTable['Criterion'] = TreeCrit       

#determining where to separe test and train data
cv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=0)        

j = 0

for i in TreeCrit: 
    #Initialize decision tree
    TreeModel = make_pipeline(preprocessing.StandardScaler(), tree.DecisionTreeClassifier(criterion=i)) 
    #Cross validation for decision tree
    TreeScoreTable.iloc[j, 1] = cross_val_score(TreeModel, iris.data, iris.target, cv = cv).mean()        
    j += 1     
    
print(TreeScoreTable)


  Criterion  Accuracy
0      gini  0.955556
1   entropy  0.942222


 Tune the hyper-parameters of tree decision model by a grid search

In [29]:
from sklearn.model_selection import GridSearchCV, train_test_split
#criteria to be tested
criteria = ["gini", "entropy"]
#min sample split to be tested
min_sample_split_range = [2,10, 20] 
#max depth to be tested
max_depth_range = [None, 2, 5, 10] 
#min samples in the leaf to be tested
min_samples_leaf_range = [1, 5, 10] 
#min leaf nodes to be tested
min_leaf_nodes_range = [None, 5, 10, 20]    

param_grid = {"criterion": criteria,
              "min_samples_split": min_sample_split_range,
              "max_depth": max_depth_range,
              "min_samples_leaf": min_samples_leaf_range,
              "max_leaf_nodes": min_leaf_nodes_range
                }
#setting grid with estimator
grid = GridSearchCV(estimator=tree.DecisionTreeClassifier(), 
                    param_grid=param_grid, 
                    cv = 5, 
                    scoring='accuracy', 
                    refit=True)     
#creating preprocessing
tree_model = make_pipeline(preprocessing.StandardScaler(), grid)    
#fitting data
tree_model.fit(iris.data, iris.target)      

print("Accuracy of the tuned model: %.4f" %grid.best_score_)
print(grid.best_params_)

Accuracy of the tuned model: 0.9733
{'criterion': 'gini', 'max_depth': None, 'max_leaf_nodes': 5, 'min_samples_leaf': 1, 'min_samples_split': 20}


Tune the hyper-parameters of tree decision model by a random search

In [30]:
from sklearn.model_selection import RandomizedSearchCV
random_search =RandomizedSearchCV(estimator=tree.DecisionTreeClassifier(), 
                    param_distributions=param_grid, 
                    cv = 5,
                    scoring='accuracy', 
                    refit=True)
#fit the RandomizedSearchCV
random_search.fit(iris.data,iris.target)

print("Accuracy of the tuned model: %.4f" %random_search.best_score_)
print(random_search.best_params_)

Accuracy of the tuned model: 0.9667
{'min_samples_split': 2, 'min_samples_leaf': 1, 'max_leaf_nodes': 20, 'max_depth': 5, 'criterion': 'gini'}


Tune the hyper-parameters of SVM model by a grid search

In [31]:
from sklearn import svm
#types of kernels to be tested
kernel_types = ["linear", "poly", "rbf", "sigmoid"]   
 #range of C to be tested
C_range = [0.01, 0.1, 1, 10, 100, 1000]
#degrees to be tested
degree_range = [1, 2, 3, 4, 5, 6]                       

param_grid = {"kernel": kernel_types,
              "C": C_range,
              "degree": degree_range,
              }         

grid = GridSearchCV(estimator = svm.SVC(), 
                    param_grid = param_grid, 
                    cv = 5, 
                    scoring = 'accuracy', 
                    refit = True)   

svm_model = make_pipeline(preprocessing.StandardScaler(), grid)    
svm_model.fit(iris.data, iris.target)       

print("Accuracy of the tuned model: %.4f" %grid.best_score_)
print(grid.best_params_)


Accuracy of the tuned model: 0.9733
{'C': 10, 'degree': 1, 'kernel': 'rbf'}


Tune the hyper-parameters of SVM model by a random search

In [32]:
random_search =RandomizedSearchCV(estimator=svm.SVC(), 
                    param_distributions=param_grid, 
                    cv = 5,
                    scoring='accuracy', 
                    refit=True)
#fit the RandomizedSearchCV
random_search.fit(iris.data,iris.target)

print("Accuracy of the tuned model: %.4f" %random_search.best_score_)
print(random_search.best_params_)

Accuracy of the tuned model: 0.9733
{'kernel': 'linear', 'degree': 5, 'C': 10}
