# 04C Try Decision Trees

## Imports

In [20]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
import random

random.seed(42)

In [4]:
def get_metrics(y_true, y_predict):
    matrix_def = [['tn','fp'], ['fn','tp']]
    matrix = confusion_matrix(y_true, y_predict)
    tn, fp, fn, tp = matrix.ravel()
    accuracy = (tp+tn)/(tn+fp+fn+tp)
    misclass = 1-accuracy
    sensitivity = tp/(tp+fn)
    specificity = tn/(tn+fp)
    precision = tp/(tp+fp)
    print('Matrix Definition')
    print(np.array(matrix_def))
    print('')
    print('Confusion Matrix')
    print(matrix)
    print('')
    print('METRICS')
    print(f'accuracy: {accuracy}')
    print(f'misclass: {misclass}')
    print(f'sensitivity: {sensitivity}')
    print(f'specificity: {specificity}')
    #return accuracy, misclass, sensitivity, specificity, precision

## Pull the data in again

In [5]:
!ls Data

X_fulltext.pkl     df_features_sc.pkl y_train.pkl
df_X_combined.pkl  raw.json


In [6]:
with open('./Data/df_X_combined.pkl','rb') as f:
    X = pickle.load(f)

In [7]:
with open('./Data/y_train.pkl', 'rb') as f:
    y = pickle.load(f)

## Train Test Split

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

## Try Decision Trees

In [11]:
tree = DecisionTreeClassifier()
tree.fit(X_train, y_train)
(tree.score(X_train, y_train), tree.score(X_test, y_test))

(1.0, 0.8764805414551607)

In [16]:
tree.tree_.max_depth

18

## Try Decision Tree with GridSearchCV

In [None]:
# tree = DecisionTreeClassifier()
# parameters = {
#     'max_depth':list(range(3,17,2)),
#     'min_samples_split': list(range(2,4)),
#     'min_samples_leaf': list(range(1,4)),
# }
# treeGS = GridSearchCV(tree,parameters,cv=3)

In [41]:
tree = DecisionTreeClassifier()
parameters = {
    'max_depth':list(range(3,9,2)),
}
treeGS = GridSearchCV(tree,parameters, cv=3)

In [42]:
treeGS.fit(X_train, y_train)

GridSearchCV(cv=3, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'max_depth': [3, 5, 7]}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score='warn', scoring=None, verbose=0)

In [43]:
(treeGS.score(X_train, y_train), treeGS.score(X_test, y_test))

(0.9456127628716461, 0.8730964467005076)

In [44]:
treeGS.best_params_

{'max_depth': 7}