In [1]:
from imblearn.over_sampling import SMOTE
from sklearn import preprocessing, tree
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import (accuracy_score, f1_score, plot_confusion_matrix, 
                             precision_score, recall_score, classification_report)
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier, plot_tree
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
nyc_tree = pd.read_csv('data/nyc_tree_cleaned.csv')

In [3]:
nyc_tree.sample(7)

Unnamed: 0,health,tree_dbh,on_curb,steward,guards
11222,2,2,1,,Helpful
396223,2,2,1,,
328371,2,6,1,,
454204,2,28,1,,
580582,1,14,1,,
93433,1,5,0,,
303539,0,12,1,,


In [4]:
y = nyc_tree.health
X = nyc_tree.drop(columns=['health'], axis=1)

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [6]:
impute_column_transformer = ColumnTransformer(transformers=[
    ('imputer', SimpleImputer(strategy='most_frequent'), ['guards'])
])

In [7]:
nrmlz_column_transformer = ColumnTransformer(transformers=[
    ('normalizer', preprocessing.MinMaxScaler(), ['tree_dbh'])
])

In [8]:
ohe_column_transformer = ColumnTransformer(transformers=[
    ('one_hot_encoder', OneHotEncoder(sparse=False), ['steward', 'guards'])
])

In [9]:
feature_union = FeatureUnion(transformer_list=[
    ('impute_values', impute_column_transformer),
    ('normalize_feature', nrmlz_column_transformer),
    ('encoded_features', ohe_column_transformer)
])

In [10]:
dt_pipeline = Pipeline(steps=[('feature_union', feature_union)])

In [11]:
dt_train_fitted_transformed = dt_pipeline.fit_transform(X_train)
## Can't just fit(X_test)
dt_test_fitted_transformed = dt_pipeline.fit_transform(X_test)

In [12]:
dt_train_fitted_transformed

array([['None', 0.21875, 0.0, ..., 1.0, 0.0, 0.0],
       ['None', 0.28125, 0.0, ..., 1.0, 0.0, 0.0],
       ['None', 0.4375, 0.0, ..., 1.0, 0.0, 0.0],
       ...,
       ['None', 0.125, 1.0, ..., 1.0, 0.0, 0.0],
       ['None', 0.71875, 0.0, ..., 1.0, 0.0, 0.0],
       ['None', 0.375, 0.0, ..., 1.0, 0.0, 0.0]], dtype=object)

In [21]:
nyc_tree.iloc[0:1]

Unnamed: 0,health,tree_dbh,on_curb,steward,guards
0,1,3,1,,


In [13]:
dt_train_fitted_transformed[0]

array(['None', 0.21875, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
      dtype=object)

In [13]:
dt_classifier_baseline = DecisionTreeClassifier(random_state=42)
dt_classifier_baseline.fit(dt_train_fitted_transformed, y_train)

ValueError: could not convert string to float: 'None'

In [15]:
dt_baseline_predictions = dt_classifier_baseline.predict(dt_test_fitted_transformed)

In [20]:
#X_train, y_train = SMOTE().fit_resample(X_train, y_train)

In [22]:
def score_retreiver(true_results, predictions):
    precision = precision_score(true_results, predictions, average='weighted') * 100
    recall = recall_score(true_results, predictions, average='weighted') * 100
    accuracy = accuracy_score(true_results, predictions) * 100
    f1 = f1_score(true_results, predictions, average='weighted') * 100
    return precision, recall, accuracy, f1

### Decision Tree

In [None]:
# dt_classifier = DecisionTreeClassifier(random_state=42)
# dt_classifier_baseline = dt_classifier
# dt_classifier_score = cross_val_score(dt_classifier_baseline, X_train, y_train, cv=5)
# mean_dt_score = np.mean(dt_classifier_score)

In [23]:
print('Test Scores')
print(f'Decision Tree Baseline Precision: {score_retreiver(y_test, dt_baseline_predictions)[0]}')
print(f'Decision Tree Baseline Recall: {score_retreiver(y_test, dt_baseline_predictions)[1]}')
print(f'Decision Tree Baseline Accuracy: {score_retreiver(y_test, dt_baseline_predictions)[2]}')
print(f'Decision Tree Baseline F1: {score_retreiver(y_test, dt_baseline_predictions)[3]}')

Test Scores
Decision Tree Baseline Precision: 66.4531551697536
Decision Tree Baseline Recall: 81.05223775323074
Decision Tree Baseline Accuracy: 81.05223775323074
Decision Tree Baseline F1: 72.58296671381167


In [24]:
print(classification_report(y_test, dt_baseline_predictions))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00      6789
           1       0.05      0.00      0.00     24084
           2       0.81      1.00      0.90    132170

    accuracy                           0.81    163043
   macro avg       0.29      0.33      0.30    163043
weighted avg       0.66      0.81      0.73    163043



In [25]:
dt_classifier = DecisionTreeClassifier(random_state=42)
path = dt_classifier.cost_complexity_pruning_path(dt_train_fitted_transformed, y_train)
## Gather all alphas except for the last one which would return only the root node
ccp_alphas = path.ccp_alphas[:-1]

In [None]:
dt_param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 2, 3, 4, 5, 6],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': list(range(1,7)),
    'ccp_alpha': list(ccp_alphas)
}

In [None]:
dt_grid_search = GridSearchCV(DecisionTreeClassifier(), dt_param_grid, cv=5)
dt_grid_search = dt_grid_search.fit(X_train, y_train)

In [None]:
dt_grid_search.best_params_

In [None]:
dt_classifier_best_parameters = DecisionTreeClassifier(criterion='entropy', max_depth=3, min_samples_leaf=4)

In [None]:
dt_classifier_best_parameters.fit(X_train, y_train)
dt_best_parameters_predictions = dt_classifier_best_parameters.predict(X_test)
print('Test Scores')
print(f'Decision Tree Tuned Precision: {score_retreiver(y_test, dt_best_parameters_predictions)[0]}')
print(f'Decision Tree Tuned Recall: {score_retreiver(y_test, dt_best_parameters_predictions)[1]}')
print(f'Decision Tree Tuned Accuracy: {score_retreiver(y_test, dt_baseline_predictions)[2]}')
print(f'Decision Tree Tuned F1: {score_retreiver(y_test, dt_baseline_predictions)[3]}')

In [None]:
print(classification_report(y_test, dt_best_parameters_predictions))

In [None]:
plt.figure(figsize=(12, 8))
plot_tree(dt_classifier_best_parameters)
plt.show()

### Random Forest 

In [None]:
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier_baseline = rf_classifier
rf_classifier_score = cross_val_score(rf_classifier_baseline, X_train, y_train, cv=5)
mean_rf_score = np.mean(rf_classifier_score)