In [1]:
from imblearn.over_sampling import SMOTE
from sklearn import preprocessing, tree
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, f1_score, plot_confusion_matrix, 
                             precision_score, recall_score, classification_report)
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import Normalizer, OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier, plot_tree
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
nyc_tree = pd.read_csv('data/nyc_tree_cleaned.csv')

In [3]:
nyc_tree.sample(7)

Unnamed: 0,health,tree_dbh,on_curb,steward,guards
325352,2,4,1,,
517512,2,5,1,,
587958,2,2,1,1or2,Helpful
489229,1,24,1,,
96938,2,3,1,,
398810,2,27,1,,
199635,2,17,1,,


In [4]:
nyc_tree.drop(columns=['guards'], axis=1, inplace=True)

In [5]:
y = nyc_tree.health
X = nyc_tree.drop(columns=['health'], axis=1)

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [7]:
nrmlz_column_transformer = ColumnTransformer(transformers=[
    ('normalizer', preprocessing.MinMaxScaler(), ['tree_dbh'])
])

In [8]:
ohe_column_transformer = ColumnTransformer(transformers=[
    ('one_hot_encoder', OneHotEncoder(sparse=False), ['steward'])
])#, remainder='passthrough')

In [9]:
feature_union = FeatureUnion(transformer_list=[
    ('normalize_feature', nrmlz_column_transformer),
    ('encoded_features', ohe_column_transformer)
])

In [10]:
dt_train_pipeline = Pipeline([('column_transformer_normalize', nrmlz_column_transformer)])#,
                        #('dt', DecisionTreeClassifier(random_state=42))])

In [11]:
dt_train_pipeline.fit_transform(X_train, y_train)[:9]

array([[0.21875],
       [0.28125],
       [0.4375 ],
       [0.53125],
       [0.53125],
       [0.28125],
       [0.375  ],
       [0.84375],
       [0.15625]])

In [12]:
for value in X_train.tree_dbh[:9]:
    print((value-X_train.tree_dbh.min()) / (X_train.tree_dbh.max()-X_train.tree_dbh.min()))

0.21875
0.28125
0.4375
0.53125
0.53125
0.28125
0.375
0.84375
0.15625


In [13]:
dt_train_other = Pipeline([('one_hot_encoder', ohe_column_transformer)])
dt_train_other.fit_transform(X_train)

array([[0., 0., 0., 1.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.],
       ...,
       [1., 0., 0., 0.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.]])

In [14]:
X_train.iloc[[0]]

Unnamed: 0,tree_dbh,on_curb,steward
423833,7,1,


In [15]:
X_train.steward.value_counts()

None       355534
1or2       106579
3or4        14294
4orMore      1187
Name: steward, dtype: int64

In [16]:
dt_train_pipeline_combo = Pipeline(steps=[('feature_union', feature_union)])#,
                                          #('decison_tree', DecisionTreeClassifier(random_state=42))])

In [17]:
dt_train_fitted_transformed = dt_train_pipeline_combo.fit_transform(X_train)
dt_test_fitted_transformed = dt_train_pipeline_combo.fit_transform(X_test)

In [18]:
dt_train_fitted_transformed[0]

array([0.21875, 0.     , 0.     , 0.     , 1.     ])

In [19]:
dt_classifier_baseline = DecisionTreeClassifier(random_state=42)
dt_classifier_baseline.fit(dt_train_fitted_transformed, y_train)

DecisionTreeClassifier(random_state=42)

In [20]:
dt_baseline_predictions = dt_classifier_baseline.predict(dt_test_fitted_transformed)

In [21]:
# dt_pipeline.score(X_test, y_test)

In [22]:
import collections
collections.Counter(dt_baseline_predictions)

Counter({2: 159199})

In [23]:
#X_train, y_train = SMOTE().fit_resample(X_train, y_train)

In [24]:
def score_retreiver(true_results, predictions):
    precision = precision_score(true_results, predictions, average='weighted') * 100
    recall = recall_score(true_results, predictions, average='weighted') * 100
    accuracy = accuracy_score(true_results, predictions) * 100
    f1 = f1_score(true_results, predictions, average='weighted') * 100
    return precision, recall, accuracy, f1

### Decision Tree

In [25]:
# dt_classifier = DecisionTreeClassifier(random_state=42)
# dt_classifier_baseline = dt_classifier
# dt_classifier_score = cross_val_score(dt_classifier_baseline, X_train, y_train, cv=5)
# mean_dt_score = np.mean(dt_classifier_score)

In [26]:
print('Test Scores')
print(f'Decision Tree Baseline Precision: {score_retreiver(y_test, dt_baseline_predictions)[0]}')
print(f'Decision Tree Baseline Recall: {score_retreiver(y_test, dt_baseline_predictions)[1]}')
print(f'Decision Tree Baseline Accuracy: {score_retreiver(y_test, dt_baseline_predictions)[2]}')
print(f'Decision Tree Baseline F1: {score_retreiver(y_test, dt_baseline_predictions)[3]}')

Test Scores
Decision Tree Baseline Precision: 65.19731435363425


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Decision Tree Baseline Recall: 80.74485392496183
Decision Tree Baseline Accuracy: 80.74485392496183


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Decision Tree Baseline F1: 72.14292737839342


In [27]:
print(classification_report(y_test, dt_baseline_predictions))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00      6791
           1       0.00      0.00      0.00     23863
           2       0.81      1.00      0.89    128545

    accuracy                           0.81    159199
   macro avg       0.27      0.33      0.30    159199
weighted avg       0.65      0.81      0.72    159199



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [28]:
path = DecisionTreeClassifier(random_state=42).cost_complexity_pruning_path(dt_train_fitted_transformed, y_train)
## Gather all alphas except for the last one which would return only the root node
ccp_alphas = path.ccp_alphas[:-1]

In [29]:
dt_param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 2, 3, 4, 5, 6],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': list(range(1,7)),
    #'ccp_alpha': list(ccp_alphas)
}

In [31]:
dt_grid_search = GridSearchCV(DecisionTreeClassifier(random_state=42), dt_param_grid, cv=5)
dt_grid_search_results = dt_grid_search.fit(dt_train_fitted_transformed, y_train)

In [None]:
12dt_grid_search.best_params_

In [None]:
dt_classifier_best_parameters = DecisionTreeClassifier(criterion='entropy', max_depth=3, min_samples_leaf=4)

In [None]:
dt_classifier_best_parameters.fit(X_train, y_train)
dt_best_parameters_predictions = dt_classifier_best_parameters.predict(X_test)
print('Test Scores')
print(f'Decision Tree Tuned Precision: {score_retreiver(y_test, dt_best_parameters_predictions)[0]}')
print(f'Decision Tree Tuned Recall: {score_retreiver(y_test, dt_best_parameters_predictions)[1]}')
print(f'Decision Tree Tuned Accuracy: {score_retreiver(y_test, dt_baseline_predictions)[2]}')
print(f'Decision Tree Tuned F1: {score_retreiver(y_test, dt_baseline_predictions)[3]}')

In [None]:
print(classification_report(y_test, dt_best_parameters_predictions))

In [None]:
plt.figure(figsize=(12, 8))
plot_tree(dt_classifier_best_parameters)
plt.show()

### Random Forest 

In [None]:
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier_baseline = rf_classifier
rf_classifier_score = cross_val_score(rf_classifier_baseline, X_train, y_train, cv=5)
mean_rf_score = np.mean(rf_classifier_score)