[Reference](https://towardsdatascience.com/3-ways-to-tune-hyperparameters-of-machine-learning-models-with-python-cda64b62e0ac)

In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix

iris = pd.read_csv('https://gist.githubusercontent.com/curran/a08a1080b88344b0c8a7/raw/0e7a9b0a5d22642a06d3d5b9bcbad9890c8ee534/iris.csv')
iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [2]:
X = iris.drop('species', axis=1)
y = iris['species']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [3]:
model = DecisionTreeClassifier()
model.fit(X_train, y_train)
preds = model.predict(X_test)

print(f'Accuracy = {round(accuracy_score(y_test, preds), 2)}')
print()
print(confusion_matrix(y_test, preds))

Accuracy = 0.97

[[18  0  0]
 [ 0 12  1]
 [ 0  0  7]]


In [4]:
# 3 sets of hyperparameters
params_1 = {'criterion': 'gini', 'splitter': 'best', 'max_depth': 10}
params_2 = {'criterion': 'entropy', 'splitter': 'random', 'max_depth': 1000}
params_3 = {'criterion': 'gini', 'splitter': 'random', 'max_depth': 100}

# 3 separate models
model_1 = DecisionTreeClassifier(**params_1)
model_2 = DecisionTreeClassifier(**params_2)
model_3 = DecisionTreeClassifier(**params_3)

model_1.fit(X_train, y_train)
model_2.fit(X_train, y_train)
model_3.fit(X_train, y_train)

# 3 separate prediction sets
preds_1 = model_1.predict(X_test)
preds_2 = model_3.predict(X_test)
preds_3 = model_2.predict(X_test)

print(f'Accuracy on Model 1 = {round(accuracy_score(y_test, preds_1), 5)}')
print(f'Accuracy on Model 2 = {round(accuracy_score(y_test, preds_2), 5)}')
print(f'Accuracy on Model 3 = {round(accuracy_score(y_test, preds_3), 5)}')

Accuracy on Model 1 = 0.97368
Accuracy on Model 2 = 0.92105
Accuracy on Model 3 = 0.97368


In [5]:
# Define parameter possibilities as lists
p_criterion = ['gini', 'entropy']
p_splitter = ['best', 'random']
p_max_depth = [1, 10, 100, 1000]
# The scores will go here
results = []

# Nested loops - we need to test for all combinations
for criterion in p_criterion:
    for splitter in p_splitter:
        for max_depth in p_max_depth:
            # Train the model
            model = DecisionTreeClassifier(
                criterion=criterion,
                splitter=splitter,
                max_depth=max_depth
            )
            model.fit(X_train, y_train)
            preds = model.predict(X_test)
            # Append current results
            results.append({
                'Accuracy': round(accuracy_score(y_test, preds), 5),
                'P_Criterion': criterion,
                'P_Splitter': splitter,
                'P_MaxDepth': max_depth
            })
            
# Convert to Pandas DataFrame and sort descendingly by accuracy
results = pd.DataFrame(results)
results = results.sort_values(by='Accuracy', ascending=False)
results

Unnamed: 0,Accuracy,P_Criterion,P_Splitter,P_MaxDepth
1,0.97368,gini,best,10
2,0.97368,gini,best,100
3,0.97368,gini,best,1000
7,0.97368,gini,random,1000
9,0.97368,entropy,best,10
10,0.97368,entropy,best,100
11,0.97368,entropy,best,1000
15,0.97368,entropy,random,1000
5,0.94737,gini,random,10
6,0.94737,gini,random,100


In [6]:
model = DecisionTreeClassifier()
params = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': [1, 10, 100, 1000]
}

clf = GridSearchCV(
    estimator=model, 
    param_grid=params, 
    cv=10,  # 10-fold cross validation
    n_jobs=-1  # run in parallel
)
clf.fit(X_train, y_train)

GridSearchCV(cv=10, error_score=nan,
             estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort='deprecated',
                                              random_state=None,
                                              splitter='best'),
             iid='deprecated', n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                    

In [7]:
cv_results = pd.DataFrame(clf.cv_results_)
cv_results.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_splitter,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.002831,0.001305,0.001626,0.001012,gini,1,best,"{'criterion': 'gini', 'max_depth': 1, 'splitte...",0.666667,0.666667,0.636364,0.636364,0.636364,0.636364,0.636364,0.727273,0.727273,0.545455,0.651515,0.04933,15
1,0.002271,0.000148,0.001612,0.001013,gini,1,random,"{'criterion': 'gini', 'max_depth': 1, 'splitte...",0.666667,0.666667,0.636364,0.636364,0.636364,0.636364,0.727273,0.727273,0.727273,0.727273,0.678788,0.041105,13
2,0.003131,0.00161,0.001442,0.000217,gini,10,best,"{'criterion': 'gini', 'max_depth': 10, 'splitt...",0.916667,1.0,1.0,0.818182,0.909091,0.909091,0.818182,1.0,1.0,1.0,0.937121,0.070747,5
3,0.003001,0.002152,0.001326,6.3e-05,gini,10,random,"{'criterion': 'gini', 'max_depth': 10, 'splitt...",0.916667,1.0,1.0,0.818182,0.909091,0.909091,1.0,1.0,1.0,0.909091,0.946212,0.059887,4
4,0.002965,0.001673,0.001405,0.000174,gini,100,best,"{'criterion': 'gini', 'max_depth': 100, 'split...",0.916667,1.0,1.0,0.818182,0.909091,0.909091,0.818182,1.0,1.0,1.0,0.937121,0.070747,5


In [8]:
cv_results = cv_results[['mean_test_score', 'param_criterion', 'param_splitter', 'param_max_depth']]
cv_results.sort_values(by='mean_test_score', ascending=False)

Unnamed: 0,mean_test_score,param_criterion,param_splitter,param_max_depth
13,0.94697,entropy,random,100
7,0.946212,gini,random,1000
15,0.946212,entropy,random,1000
3,0.946212,gini,random,10
2,0.937121,gini,best,10
4,0.937121,gini,best,100
6,0.937121,gini,best,1000
5,0.936364,gini,random,100
11,0.919697,entropy,random,10
10,0.918939,entropy,best,10
