In [32]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [7]:
# Generating synthetic dataset from make classification
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_features=10,
                           n_samples=1000,
                           n_informative=8,
                           n_redundant=2,
                           n_repeated=0,
                           n_classes=2,
                           random_state=42
                          )



# Method 1 : Evaluate Decision Tree model using train, test and tune parameters by trial and error

In [10]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

model = DecisionTreeClassifier(criterion="entropy", max_depth=10)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.85      0.77      0.81       130
           1       0.77      0.85      0.81       120

    accuracy                           0.81       250
   macro avg       0.81      0.81      0.81       250
weighted avg       0.81      0.81      0.81       250



In [13]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

model = DecisionTreeClassifier(criterion="gini", max_depth=10)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.84      0.82      0.83       130
           1       0.81      0.82      0.82       120

    accuracy                           0.82       250
   macro avg       0.82      0.82      0.82       250
weighted avg       0.82      0.82      0.82       250



## Method 2 : Cross Validation score

In [23]:
from sklearn.model_selection import cross_val_score

criterion = ["gini","entropy"]
max_depth = [5,10,15]

avg_scores = {}

for c in criterion:
    for d in max_depth:
        clf = DecisionTreeClassifier(criterion = c, max_depth = d)
        scores_list = cross_val_score(clf ,X ,y ,cv=5)
        avg_scores[c + "_" + str(d)] = np.average(scores_list)
avg_scores

{'gini_5': 0.779,
 'gini_10': 0.7889999999999999,
 'gini_15': 0.794,
 'entropy_5': 0.781,
 'entropy_10': 0.7899999999999999,
 'entropy_15': 0.8009999999999999}

# Method 3 : GridSearchCV

In [28]:
from sklearn.model_selection import GridSearchCV

clf = GridSearchCV( 
    DecisionTreeClassifier(),
    {
        'criterion':["gini","entropy"],
        "max_depth":[5,10,15]
    },
    cv=5,
    return_train_score=False
)
clf.fit(X, y)

In [34]:
# total 6 model
df = pd.DataFrame(clf.cv_results_)
df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.015554,0.008964,0.001213,0.001187,gini,5,"{'criterion': 'gini', 'max_depth': 5}",0.775,0.815,0.75,0.805,0.77,0.783,0.023791,4
1,0.017807,0.007272,0.001198,0.001167,gini,10,"{'criterion': 'gini', 'max_depth': 10}",0.775,0.745,0.79,0.785,0.81,0.781,0.021307,5
2,0.01947,0.007693,0.003525,0.006099,gini,15,"{'criterion': 'gini', 'max_depth': 15}",0.8,0.715,0.8,0.815,0.815,0.789,0.037603,3
3,0.021955,0.00575,0.0004,0.00049,entropy,5,"{'criterion': 'entropy', 'max_depth': 5}",0.765,0.78,0.765,0.815,0.775,0.78,0.018439,6
4,0.026505,0.009279,0.0004,0.000491,entropy,10,"{'criterion': 'entropy', 'max_depth': 10}",0.78,0.785,0.835,0.775,0.795,0.794,0.021541,2
5,0.027679,0.009403,0.000401,0.000491,entropy,15,"{'criterion': 'entropy', 'max_depth': 15}",0.78,0.79,0.845,0.8,0.845,0.812,0.027677,1


In [38]:
df[["param_criterion","param_max_depth", "mean_test_score"]]

Unnamed: 0,param_criterion,param_max_depth,mean_test_score
0,gini,5,0.783
1,gini,10,0.781
2,gini,15,0.789
3,entropy,5,0.78
4,entropy,10,0.794
5,entropy,15,0.812


In [40]:
clf.best_params_

{'criterion': 'entropy', 'max_depth': 15}

In [42]:
clf.best_estimator_

In [50]:
from sklearn import svm

model_params = {
    'decision_tree' : {
        'model':DecisionTreeClassifier(),
        'params':{
            'criterion':["gini","entropy"],
            "max_depth":[5,10,15]
        }
    },
    "svm":{
        'model':svm.SVC(gamma="auto"),
        "params":{
            "C":[1,10,20],
            "kernel":["rbf","linear"]
        }
    }
}

scores = []

for key,val in model_params.items():
    clf = GridSearchCV(val['model'],val["params"],cv=5,return_train_score=False)
    clf.fit(X,y)
    scores.append({
    "model":key,
    "best_score":clf.best_score_,
    "best_params":clf.best_params_
    })

scores

[{'model': 'decision_tree',
  'best_score': 0.8130000000000001,
  'best_params': {'criterion': 'entropy', 'max_depth': 15}},
 {'model': 'svm',
  'best_score': 0.9260000000000002,
  'best_params': {'C': 1, 'kernel': 'rbf'}}]

In [52]:
pd.DataFrame(scores)

Unnamed: 0,model,best_score,best_params
0,decision_tree,0.813,"{'criterion': 'entropy', 'max_depth': 15}"
1,svm,0.926,"{'C': 1, 'kernel': 'rbf'}"
