## Capstone  - NYC Tree Census - Hyperparameter Tuning 

### Table of contents
1. [Background](#Background)
     -   1.1 [Data Source](#Data-Source)
     -   1.2 [Objective](#Objective)
     

## 1. Loading and Preparing Data

#### 1.1 Load Libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import (cross_val_score, train_test_split)

*Successfully loaded all the required libraries*

#### 1.2 Load the cleaned data 

In [2]:
df = pd.read_csv('encoded_data_health.csv')

*Successfully loaded the cleaned and encoded file*

In [3]:
df.head(2)

Unnamed: 0.1,Unnamed: 0,tree_dbh,health,latitude,longitude,x_sp,y_sp,problem_count,curb_loc_OffsetFromCurb,curb_loc_OnCurb,...,trnk_light_No,trnk_light_Yes,trnk_other_No,trnk_other_Yes,brch_light_No,brch_light_Yes,brch_shoe_No,brch_shoe_Yes,brch_other_No,brch_other_Yes
0,0,3,Fair,40.723092,-73.844215,1027431.148,202756.7687,0,0,1,...,1,0,1,0,1,0,1,0,1,0
1,1,21,Fair,40.794111,-73.818679,1034455.701,228644.8374,1,0,1,...,1,0,1,0,1,0,1,0,1,0


In [4]:
df = df.drop('Unnamed: 0', 1)

  df = df.drop('Unnamed: 0', 1)


In [5]:
df.head(2)

Unnamed: 0,tree_dbh,health,latitude,longitude,x_sp,y_sp,problem_count,curb_loc_OffsetFromCurb,curb_loc_OnCurb,steward_1or2,...,trnk_light_No,trnk_light_Yes,trnk_other_No,trnk_other_Yes,brch_light_No,brch_light_Yes,brch_shoe_No,brch_shoe_Yes,brch_other_No,brch_other_Yes
0,3,Fair,40.723092,-73.844215,1027431.148,202756.7687,0,0,1,0,...,1,0,1,0,1,0,1,0,1,0
1,21,Fair,40.794111,-73.818679,1034455.701,228644.8374,1,0,1,0,...,1,0,1,0,1,0,1,0,1,0


In [6]:
df_s = df.sample(frac=0.10)

In [7]:
df_s.shape

(64239, 37)

## setting variables

In [8]:
# setting X and y variables
y = df_s['health'].values
X = df_s.drop('health', axis=1).values

## Random oversampling using imblearn

In [9]:
#import library
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

ros = RandomOverSampler(random_state=42)

# fit predictor and target varaible
X_ros, y_ros = ros.fit_resample(X, y)

print('Original dataset shape', Counter(y))
print('Resample dataset shape', Counter(y_ros))

Original dataset shape Counter({'Good': 52091, 'Fair': 9460, 'Poor': 2688})
Resample dataset shape Counter({'Poor': 52091, 'Good': 52091, 'Fair': 52091})


In [10]:
# train test split
X_train_rs, X_test_rs, y_train_rs, y_test_rs = train_test_split(X_ros, y_ros, test_size=0.25, random_state=42)

print(X_train_rs.shape, y_train_rs.shape)
print(X_test_rs.shape, y_test_rs.shape)

(117204, 36) (117204,)
(39069, 36) (39069,)


#### 4.1.1d Random Forest 

In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report,confusion_matrix

rf_rs = RandomForestClassifier(random_state=42)
rf_rs.fit(X_train_rs, y_train_rs)
y_pred_rs = rf_rs.predict(X_test_rs)
    
    
# accuracy scores
print('Training Set Accuracy Score: ', rf_rs.score(X_train_rs, y_train_rs))
print('Test Set Accuracy Score: ', rf_rs.score(X_test_rs, y_test_rs))
    
# classification report
print('Classification Metrics \n')
print(classification_report(y_test_rs, y_pred_rs))

Training Set Accuracy Score:  0.9999914678679909
Test Set Accuracy Score:  0.9592771762778673
Classification Metrics 

              precision    recall  f1-score   support

        Fair       0.91      0.99      0.95     13085
        Good       0.99      0.89      0.94     12899
        Poor       0.98      1.00      0.99     13085

    accuracy                           0.96     39069
   macro avg       0.96      0.96      0.96     39069
weighted avg       0.96      0.96      0.96     39069



## Cross Validation 

In [13]:
#Calculating accuracy score for 10 cross validation folds. 
from sklearn.model_selection import cross_val_score
ac_cv = cross_val_score(estimator=rf_rs, X = X_train_rs, y = y_train_rs, cv=10)
print("scores for each fold")
for val in ac_cv:
    print(val)

scores for each fold
0.9519754768392371
0.948058583106267
0.9533378746594006
0.9534230245231607
0.9550408719346049
0.9518051771117166
0.9503534020267393
0.9506088733713702
0.955803457378864
0.9543557864259559


## Random Search 

In [18]:
from sklearn.model_selection import RandomizedSearchCV
from pprint import pprint

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]

# Number of features to consider at every split
max_features = ['auto', 'sqrt']

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}


In [19]:
# Use the random grid to search for best hyperparameters

rf_random = RandomizedSearchCV(estimator = rf_rs, param_distributions = random_grid, cv = 5, verbose=2, random_state=42, n_jobs = -1)

# Fit the random search model
rf_random.fit(X_train_rs, y_train_rs)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(random_state=42),
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=42, verbose=2)

In [20]:
rf_random.best_params_

{'n_estimators': 1000,
 'min_samples_split': 10,
 'min_samples_leaf': 1,
 'max_features': 'auto',
 'max_depth': 80,
 'bootstrap': False}

In [21]:
rf_random.best_score_

0.9422036913381371

##  Grid SearchCV

In [14]:
# defining parameters 
params = {'n_estimators': [1000],
 'min_samples_split': [10],
 'min_samples_leaf': [1],
 'max_features': ['auto'],
 'max_depth': [80],
 'bootstrap': [False]}

In [15]:
#Finding the best accuracy score and the best hyperparameter that gives the best result.
from sklearn.model_selection import GridSearchCV
clf = GridSearchCV(estimator = rf_rs, param_grid = params, cv=5)
clf.fit(X_train_rs, y_train_rs)

print(clf.score(X_train_rs, y_train_rs))
print(clf.best_params_)
print(clf.best_score_)

0.9980461417699055
{'bootstrap': False, 'max_depth': 80, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 1000}
0.9431077625286228
