In [1]:
import collections

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn import preprocessing, tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, f1_score, plot_confusion_matrix, 
                             precision_score, recall_score, classification_report)
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import Normalizer, OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier, plot_tree

In [2]:
# Load the cleaned NYC tree dataset from the project's data directory.
nyc_tree = pd.read_csv(filepath_or_buffer='data/nyc_tree_cleaned.csv')

In [3]:
# Preview seven randomly chosen rows.
nyc_tree.sample(n=7)

Unnamed: 0,health,tree_dbh,on_curb,steward,guards
511207,2,7,0,1or2,
4865,1,4,1,1or2,Helpful
131925,2,12,1,,
322828,2,2,1,1or2,
161200,2,5,1,,
474862,2,18,1,,
419090,2,12,1,,


In [4]:
# Target is the health column; every other column is a feature.
y = nyc_tree['health']
X = nyc_tree.drop(columns='health')

In [5]:
# Hold out a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

In [6]:
# Count missing values per training feature.
X_train.isnull().sum()

tree_dbh    0
on_curb     0
steward     0
guards      1
dtype: int64

In [9]:
# Replace missing guard values with the explicit 'None' category. The filled
# column is assigned back instead of calling fillna(inplace=True) on an
# attribute-accessed column: that form is chained assignment and may leave
# X_train unchanged.
X_train = X_train.assign(guards=X_train['guards'].fillna('None'))

In [None]:
# Min-max scale tree_dbh into [0, 1]. The bounds are computed once and applied
# as a vectorized expression; evaluating min()/max() inside a per-row lambda
# rescans the whole column for every element.
tree_dbh_min = X_train.tree_dbh.min()
tree_dbh_max = X_train.tree_dbh.max()
X_train = X_train.assign(
    tree_dbh=(X_train.tree_dbh - tree_dbh_min) / (tree_dbh_max - tree_dbh_min)
)

In [40]:
# Print the min-max scaled value of the first nine tree_dbh entries.
dbh_low, dbh_high = X_train.tree_dbh.min(), X_train.tree_dbh.max()
for value in X_train.tree_dbh.iloc[:9]:
    scaled = (value - dbh_low) / (dbh_high - dbh_low)
    print(scaled)

0.21875
0.28125
0.4375
0.53125
0.53125
0.28125
0.375
0.84375
0.15625


In [41]:
# Single-step pipeline that only applies the one-hot-encoding column transformer.
dt_train_other = Pipeline(
    steps=[
        ('one_hot_encoder', ohe_column_transformer),
    ]
)
# Fit on the training features and display the encoded result.
dt_train_other.fit_transform(X_train)

array([[0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       ...,
       [1., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.]])

In [42]:
# Show the first training row as a one-row DataFrame.
X_train.iloc[:1]

Unnamed: 0,tree_dbh,on_curb,steward,guards
423833,7,1,,


In [43]:
# Frequency of each steward category in the training set.
X_train['steward'].value_counts()

None       355534
1or2       106579
3or4        14294
4orMore      1187
Name: steward, dtype: int64

In [None]:
# Concatenate the outputs of the imputing and normalizing column transformers.
feature_union = FeatureUnion(
    transformer_list=[
        ('impute_values', impute_column_transformer),
        ('normalize_feature', nrmlz_column_transformer),
    ]
)

In [44]:
# Preprocessing-only pipeline: the imputing step runs first, then the one-hot
# encoding step. The classifier step is commented out, so this pipeline only
# transforms features.
# NOTE(review): both steps appear to be ColumnTransformers that select columns
# by name. The first step presumably hands a NumPy array to the second, which
# would explain the "Specifying the columns using strings is only supported for
# pandas DataFrames" error seen when fitting -- confirm, and consider a single
# ColumnTransformer that handles every column instead of chaining them.
dt_train_pipeline_combo = Pipeline(steps=[ ('impute_values', impute_column_transformer),
                                          ('one_hot_encoder', ohe_column_transformer)])#,
                                          #('decison_tree', DecisionTreeClassifier(random_state=42))])

NameError: name 'feature_union' is not defined

In [36]:
# Display the pipeline's repr to inspect its configured steps.
dt_train_pipeline_combo

Pipeline(steps=[('feature_union',
                 FeatureUnion(transformer_list=[('impute_values',
                                                 ColumnTransformer(transformers=[('imputer',
                                                                                  SimpleImputer(strategy='most_frequent'),
                                                                                  ['guards'])])),
                                                ('normalize_feature',
                                                 ColumnTransformer(transformers=[('normalizer',
                                                                                  MinMaxScaler(),
                                                                                  ['tree_dbh'])]))])),
                ('one_hot_encoder',
                 ColumnTransformer(transformers=[('one_hot_encoder',
                                                  OneHotEncoder(sparse=False),
                                   

In [35]:
# Learn the preprocessing (imputation values, encoder categories) from the
# training data only, then apply that fitted preprocessing to the test data.
# Calling fit_transform on X_test would refit on the test set, leaking test
# information and potentially producing a different column layout.
dt_train_fitted_transformed = dt_train_pipeline_combo.fit_transform(X_train)
dt_test_fitted_transformed = dt_train_pipeline_combo.transform(X_test)

ValueError: Specifying the columns using strings is only supported for pandas DataFrames

In [None]:
# Inspect the first entry of the transformed training output.
dt_train_fitted_transformed[0]

In [None]:
# Frequency of each guards category, counting missing values as their own bucket.
X_train['guards'].value_counts(dropna=False)

In [None]:
# Value distribution (NaN included) of the first transformed column.
pd.DataFrame(data=dt_train_fitted_transformed).loc[:, 0].value_counts(dropna=False)

In [None]:
# Baseline decision tree with default hyperparameters and a fixed seed,
# trained on the transformed training features.
dt_classifier_baseline = DecisionTreeClassifier(random_state=42)
dt_classifier_baseline.fit(X=dt_train_fitted_transformed, y=y_train)

In [None]:
# Predict health labels for the transformed test features with the baseline tree.
dt_baseline_predictions = dt_classifier_baseline.predict(X=dt_test_fitted_transformed)

In [None]:
# dt_pipeline.score(X_test, y_test)

In [None]:
# Count how often the baseline tree predicts each class.
# `collections` is imported in the notebook's top import cell rather than here.
collections.Counter(dt_baseline_predictions)

In [None]:
#X_train, y_train = SMOTE().fit_resample(X_train, y_train)

In [None]:
def score_retreiver(true_results, predictions):
    """Return weighted precision, recall, accuracy and F1 as percentages.

    Parameters
    ----------
    true_results : array-like
        Ground-truth labels.
    predictions : array-like
        Predicted labels, aligned with ``true_results``.

    Returns
    -------
    tuple
        ``(precision, recall, accuracy, f1)``, each multiplied by 100.
        Precision, recall and F1 are computed with ``average='weighted'``.
    """
    averaging = {'average': 'weighted'}
    precision = 100 * precision_score(true_results, predictions, **averaging)
    recall = 100 * recall_score(true_results, predictions, **averaging)
    accuracy = 100 * accuracy_score(true_results, predictions)
    f1 = 100 * f1_score(true_results, predictions, **averaging)
    return precision, recall, accuracy, f1

### Decision Tree

In [None]:
# dt_classifier = DecisionTreeClassifier(random_state=42)
# dt_classifier_baseline = dt_classifier
# dt_classifier_score = cross_val_score(dt_classifier_baseline, X_train, y_train, cv=5)
# mean_dt_score = np.mean(dt_classifier_score)

In [None]:
# Compute all four metrics once and reuse them; calling score_retreiver inside
# every print statement would recompute every metric four times.
(baseline_precision,
 baseline_recall,
 baseline_accuracy,
 baseline_f1) = score_retreiver(y_test, dt_baseline_predictions)

print('Test Scores')
print(f'Decision Tree Baseline Precision: {baseline_precision}')
print(f'Decision Tree Baseline Recall: {baseline_recall}')
print(f'Decision Tree Baseline Accuracy: {baseline_accuracy}')
print(f'Decision Tree Baseline F1: {baseline_f1}')

In [None]:
# Per-class precision/recall/F1 for the baseline tree on the test set.
print(classification_report(y_true=y_test, y_pred=dt_baseline_predictions))

In [None]:
# Compute the cost-complexity pruning path on the transformed training data.
path = DecisionTreeClassifier(random_state=42).cost_complexity_pruning_path(
    X=dt_train_fitted_transformed,
    y=y_train,
)
# Keep every alpha except the last one, which would prune the tree down to
# only the root node.
ccp_alphas = path['ccp_alphas'][:-1]

In [None]:
# Hyperparameter search space for the decision tree grid search.
dt_param_grid = dict(
    criterion=['gini', 'entropy'],
    max_depth=[None, *range(2, 7)],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[leaf for leaf in range(1, 7)],
    # ccp_alpha=list(ccp_alphas)
)

In [None]:
# Exhaustive 5-fold cross-validated search over dt_param_grid.
dt_grid_search = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=dt_param_grid,
    cv=5,
)
dt_grid_search_results = dt_grid_search.fit(dt_train_fitted_transformed, y_train)

In [None]:
# Show the best hyperparameter combination found by the grid search.
dt_grid_search.best_params_

In [None]:
# Tuned decision tree with hard-coded hyperparameters.
# random_state matches the seed used by the other estimators in this notebook
# so the fitted tree is reproducible across runs.
# NOTE(review): these values are presumably copied from an earlier grid-search
# run -- confirm they still match dt_grid_search.best_params_.
dt_classifier_best_parameters = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=3,
    min_samples_leaf=4,
    random_state=42,
)

In [None]:
# Train and evaluate the tuned tree on the transformed feature matrices, the
# same representation the baseline tree and the grid search were fitted on.
# The raw X_train / X_test frames still contain string categorical columns.
dt_classifier_best_parameters.fit(dt_train_fitted_transformed, y_train)
dt_best_parameters_predictions = dt_classifier_best_parameters.predict(dt_test_fitted_transformed)

# Score the tuned predictions once. All four metrics must come from the tuned
# model's predictions, not from the baseline predictions.
(tuned_precision,
 tuned_recall,
 tuned_accuracy,
 tuned_f1) = score_retreiver(y_test, dt_best_parameters_predictions)

print('Test Scores')
print(f'Decision Tree Tuned Precision: {tuned_precision}')
print(f'Decision Tree Tuned Recall: {tuned_recall}')
print(f'Decision Tree Tuned Accuracy: {tuned_accuracy}')
print(f'Decision Tree Tuned F1: {tuned_f1}')

In [None]:
# Per-class precision/recall/F1 for the tuned tree on the test set.
print(classification_report(y_true=y_test, y_pred=dt_best_parameters_predictions))

In [None]:
# Draw the tuned decision tree on a 12x8 inch figure.
fig, ax = plt.subplots(figsize=(12, 8))
plot_tree(dt_classifier_best_parameters, ax=ax)
plt.show()

### Random Forest 

In [None]:
# Baseline random forest, scored with 5-fold cross-validation.
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier_baseline = rf_classifier
# Cross-validate on the transformed training matrix used for the decision
# trees; the raw X_train frame still holds string categorical columns, which
# the forest cannot fit on.
rf_classifier_score = cross_val_score(rf_classifier_baseline, dt_train_fitted_transformed, y_train, cv=5)
mean_rf_score = np.mean(rf_classifier_score)