# DecisionTree Modeling

In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree.export import export_text
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
import push_results as pr



In [2]:
infile1 = open('../data/processed/X_train2_trans.pickle','rb')
X_train = pickle.load(infile1)
infile1.close()

infile2 = open('../data/processed/y_train_trans.pickle','rb')
y_train = pickle.load(infile2)
infile2.close()

# infile3 = open('../data/model_results/model_results.pickle','rb')
# results_dict = pickle.load(infile3)
# infile3.close()

In [3]:
clf = DecisionTreeClassifier(random_state = 31)

In [4]:
clf.fit(X_train,y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=31, splitter='best')

In [5]:
features = X_train.columns.tolist()
features;

In [6]:
r = export_text(clf, feature_names= features)
# print(r)

In [7]:
clf.tree_.node_count

1677

In [8]:
y_pred = clf.predict(X_train)

In [9]:
clf.score(X_train, y_train)

0.999484270242393

In [10]:
score = clf.score(X_train, y_train)

# results_dict.update({'dtree': ['DecisionTreeClassifier',
#                           'Default',
#                           score]})

In [11]:
pr.push_results('dectree_cv_v2', 'DecisionTreeClassifier', 'Default', clf.score(X_train, y_train))

## Cross validation

In [12]:
scores = cross_val_score(clf, X_train, y_train, cv=10, scoring= 'accuracy')

In [13]:
np.average(scores)

0.36722397307836124

In [14]:
pr.push_results('dectree_cv_v2', 'DecisionTreeClassifier', 'Cross Validation', np.average(scores))

# results_dict.update({'dtree_cv': ['DecisionTreeClassifier',
#                           'Cross Validation',
#                           np.average(scores)]})

## Grid search

In [15]:
tree_clf = DecisionTreeClassifier()

params = {'criterion': ['gini', 'entropy'],
          'splitter': ['best', 'random'],
          'max_depth': range(2, 12),
          'min_samples_split': range(5, 100, 5),
          'max_features': [None, 'auto', 'sqrt', 'log2'], 
          'max_leaf_nodes': range(5, 100, 5)}

gridsearch = GridSearchCV(estimator = tree_clf, 
                          param_grid = params,
                          n_jobs = -1,
                          verbose = 1,
                          cv = 5,
                          scoring = 'accuracy',
                          return_train_score= True)

gridsearch = gridsearch.fit(X_train, y_train)

Fitting 5 folds for each of 57760 candidates, totalling 288800 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done 352 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done 4160 tasks      | elapsed:    4.4s
[Parallel(n_jobs=-1)]: Done 9760 tasks      | elapsed:    7.6s
[Parallel(n_jobs=-1)]: Done 16960 tasks      | elapsed:   12.1s
[Parallel(n_jobs=-1)]: Done 25760 tasks      | elapsed:   17.2s
[Parallel(n_jobs=-1)]: Done 36160 tasks      | elapsed:   24.7s
[Parallel(n_jobs=-1)]: Done 48160 tasks      | elapsed:   33.2s
[Parallel(n_jobs=-1)]: Done 61760 tasks      | elapsed:   42.9s
[Parallel(n_jobs=-1)]: Done 76960 tasks      | elapsed:   53.9s
[Parallel(n_jobs=-1)]: Done 93760 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 112160 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 120488 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 125888 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 13

In [16]:
gridsearch.best_estimator_

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=5, max_features='auto', max_leaf_nodes=30,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=90,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='random')

In [17]:
gridsearch.best_score_

0.5018221050107888

In [18]:
results = pd.DataFrame(gridsearch.cv_results_)
results[['mean_train_score', 'std_train_score', 'params']].sort_values(by='mean_train_score', 
                                                                       ascending = False).head(10)

Unnamed: 0,mean_train_score,std_train_score,params
23788,0.631125,0.011324,"{'criterion': 'gini', 'max_depth': 10, 'max_fe..."
26676,0.630996,0.011656,"{'criterion': 'gini', 'max_depth': 11, 'max_fe..."
26678,0.630093,0.012425,"{'criterion': 'gini', 'max_depth': 11, 'max_fe..."
23790,0.627644,0.011809,"{'criterion': 'gini', 'max_depth': 10, 'max_fe..."
26680,0.627128,0.013278,"{'criterion': 'gini', 'max_depth': 11, 'max_fe..."
26638,0.626355,0.011488,"{'criterion': 'gini', 'max_depth': 11, 'max_fe..."
23750,0.625581,0.009621,"{'criterion': 'gini', 'max_depth': 10, 'max_fe..."
26640,0.625065,0.011711,"{'criterion': 'gini', 'max_depth': 11, 'max_fe..."
26682,0.624291,0.009716,"{'criterion': 'gini', 'max_depth': 11, 'max_fe..."
23792,0.624162,0.00976,"{'criterion': 'gini', 'max_depth': 10, 'max_fe..."


In [19]:
pr.push_results('dectree_gs_v2', 'DecisionTreeClassifier', 'Grid Search', gridsearch.best_score_)

# results_dict.update({'dtree_gs': ['DecisionTreeClassifier',
#                           'Grid Search',
#                           gridsearch.best_score_]})

In [106]:
# pickle_out = open('../data/model_results/model_results.pickle', 'wb')
# pickle.dump(results_dict, pickle_out)
# pickle_out.close()