# DecisionTree Modeling

In [129]:
import pandas as pd
import numpy as np
import pickle
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree.export import export_text
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
import push_results as pr

In [130]:
infile1 = open('../data/processed/X_train_trans.pickle','rb')
X_train = pickle.load(infile1)
infile1.close()

infile2 = open('../data/processed/y_train_trans.pickle','rb')
y_train = pickle.load(infile2)
infile2.close()

infile3 = open('../data/model_results/model_results.pickle','rb')
results_dict = pickle.load(infile3)
infile3.close()

In [131]:
clf = DecisionTreeClassifier(random_state = 31)

In [132]:
clf.fit(X_train,y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=31, splitter='best')

In [133]:
features = X_train.columns.tolist()
features;

In [134]:
r = export_text(clf, feature_names= features)
# print(r)

In [135]:
clf.tree_.node_count

1575

In [136]:
y_pred = clf.predict(X_train)

In [137]:
clf.score(X_train, y_train)

1.0

In [138]:
score = clf.score(X_train, y_train)

# results_dict.update({'dtree': ['DecisionTreeClassifier',
#                           'Default',
#                           score]})

In [139]:
pr.push_results('dectree_cv', 'DecisionTreeClassifier', 'Default', clf.score(X_train, y_train))

## Cross validation

In [140]:
scores = cross_val_score(clf, X_train, y_train, cv=10, scoring= 'accuracy')

In [141]:
np.average(scores)

0.35794829336039746

In [142]:
pr.push_results('dectree_def', 'DecisionTreeClassifier', 'Cross Validation', np.average(scores))

# results_dict.update({'dtree_cv': ['DecisionTreeClassifier',
#                           'Cross Validation',
#                           np.average(scores)]})

## Grid search

In [123]:
tree_clf = DecisionTreeClassifier()

params = {'criterion': ['gini', 'entropy'],
          'splitter': ['best', 'random'],
          'max_depth': range(2, 12),
          'min_samples_split': range(5, 100, 5),
          'max_features': [None, 'auto', 'sqrt', 'log2'], 
          'max_leaf_nodes': range(5, 100, 5)}

gridsearch = GridSearchCV(estimator = tree_clf, 
                          param_grid = params,
                          n_jobs = -1,
                          verbose = 1,
                          cv = 5,
                          scoring = 'accuracy',
                          return_train_score= True)

gridsearch = gridsearch.fit(X_train, y_train)

Fitting 5 folds for each of 57760 candidates, totalling 288800 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 464 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done 4400 tasks      | elapsed:    4.6s
[Parallel(n_jobs=-1)]: Done 10000 tasks      | elapsed:    7.9s
[Parallel(n_jobs=-1)]: Done 17200 tasks      | elapsed:   12.8s
[Parallel(n_jobs=-1)]: Done 26000 tasks      | elapsed:   18.2s
[Parallel(n_jobs=-1)]: Done 36400 tasks      | elapsed:   25.4s
[Parallel(n_jobs=-1)]: Done 48400 tasks      | elapsed:   34.3s
[Parallel(n_jobs=-1)]: Done 62000 tasks      | elapsed:   44.8s
[Parallel(n_jobs=-1)]: Done 77200 tasks      | elapsed:   56.2s
[Parallel(n_jobs=-1)]: Done 94000 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 112400 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 132400 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 154000 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 1

In [124]:
gridsearch.best_estimator_

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=5, max_features=None, max_leaf_nodes=75,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=80,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='random')

In [125]:
gridsearch.best_score_

0.4976784144489731

In [126]:
results = pd.DataFrame(gridsearch.cv_results_)
results[['mean_train_score', 'std_train_score', 'params']].sort_values(by='mean_train_score', 
                                                                       ascending = False).head(10)

Unnamed: 0,mean_train_score,std_train_score,params
26678,0.638604,0.010596,"{'criterion': 'gini', 'max_depth': 11, 'max_fe..."
26676,0.63783,0.009147,"{'criterion': 'gini', 'max_depth': 11, 'max_fe..."
26680,0.634607,0.011973,"{'criterion': 'gini', 'max_depth': 11, 'max_fe..."
23788,0.632544,0.009375,"{'criterion': 'gini', 'max_depth': 10, 'max_fe..."
26682,0.631641,0.010234,"{'criterion': 'gini', 'max_depth': 11, 'max_fe..."
26638,0.631383,0.009954,"{'criterion': 'gini', 'max_depth': 11, 'max_fe..."
26640,0.631254,0.009579,"{'criterion': 'gini', 'max_depth': 11, 'max_fe..."
26642,0.630868,0.011431,"{'criterion': 'gini', 'max_depth': 11, 'max_fe..."
23790,0.629965,0.011,"{'criterion': 'gini', 'max_depth': 10, 'max_fe..."
23750,0.628804,0.008842,"{'criterion': 'gini', 'max_depth': 10, 'max_fe..."


In [128]:
pr.push_results('dectree_gs', 'DecisionTreeClassifier', 'Grid Search', gridsearch.best_score_)

# results_dict.update({'dtree_gs': ['DecisionTreeClassifier',
#                           'Grid Search',
#                           gridsearch.best_score_]})

In [106]:
# pickle_out = open('../data/model_results/model_results.pickle', 'wb')
# pickle.dump(results_dict, pickle_out)
# pickle_out.close()