# General Overview - Machine Learning

We are using random forest, decision trees, logistic regression, and SVC to model our data.

In [1]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import sklearn
import seaborn as sns
from sklearn import datasets
from sklearn import metrics
from collections import Counter

from sklearn.model_selection import (KFold, 
                                     cross_val_score, 
                                     GridSearchCV, 
                                     train_test_split)
from sklearn.metrics import (mean_squared_error,
                             classification_report,
                             mean_absolute_error,
                             accuracy_score,
                             confusion_matrix,
                             average_precision_score,
                             precision_recall_curve,
                             recall_score,
                             f1_score)

from sklearn.metrics import plot_precision_recall_curve # OPTIONAL

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

import imblearn
from imblearn.under_sampling import (RandomUnderSampler, 
                                     ClusterCentroids,
                                     TomekLinks,
                                     NeighbourhoodCleaningRule,
                                     NearMiss)

In [2]:
data = pd.read_csv('tree_ml.csv', index_col=0) # import data
tree = data.copy() # save a copy of data as tree

In [3]:
tree.head()

Unnamed: 0,health,health_l,num_problems,tree_dbh,root_stone_l,root_grate_l,root_other_l,trunk_wire_l,trnk_light_l,trnk_other_l,...,OnCurb,Harmful,Helpful,Unsure,Damage,Bronx,Brooklyn,Manhattan,Queens,Staten Island
0,Fair,1,0,3,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
1,Fair,1,1,21,1,0,0,0,0,0,...,1,0,0,0,1,0,0,0,1,0
2,Good,2,0,3,0,0,0,0,0,0,...,1,0,0,0,1,0,1,0,0,0
3,Good,2,1,10,1,0,0,0,0,0,...,1,0,0,0,1,0,1,0,0,0
4,Good,2,1,21,1,0,0,0,0,0,...,1,0,0,0,1,0,1,0,0,0


In [4]:
tree.shape

(651535, 26)

# Machine Learning Models

Description goes here.

## Target and Response Variable, Train_Test_Split

In [5]:
tree_ml = tree.drop(columns='health_l') # keep the categorical column

In [58]:
# create targe and response variable
y = tree_ml['health'].values
X = tree_ml.drop('health', axis=1).values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state=42, stratify=y)

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(488651, 24) (488651,)
(162884, 24) (162884,)


## Baseline - DummyClassifier

We start by using the DummyClassifier to make predictions using simple rules.

In [7]:
dummy_clf = DummyClassifier(strategy='stratified')
dummy_clf.fit(X, y)
dc_pred = dummy_clf.predict(X)

print('Accuracy Score: ', dummy_clf.score(X, y))

Accuracy Score:  0.6812251068630235


In [8]:
dummy_clf_freq = DummyClassifier(strategy='most_frequent')
dummy_clf_freq.fit(X, y)
dc_pred_freq = dummy_clf_freq.predict(X)

print('Accuracy Score: ', dummy_clf.score(X, y))

Accuracy Score:  0.6811775269172032


## Logistic Regression

In [9]:
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred_logreg = logreg.predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [10]:
print('Accuracy Score, Training Set: ', logreg.score(X_train, y_train))
print('Accuracy Score, Test Set: ', logreg.score(X_test, y_test))

Accuracy Score, Training Set:  0.8106685548581708
Accuracy Score, Test Set:  0.8109697698975958


In [11]:
# cross validation - 5-fold

cv_scores = cross_val_score(logreg, X, y, cv=5)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

In [12]:
print(cv_scores)
print("Average 5-Fold CV Score: {}".format(np.mean(cv_scores)))

[0.80927348 0.8115297  0.81109994 0.81189038 0.80661054]
Average 5-Fold CV Score: 0.8100808091660465


In [13]:
# confusion matrix

cm = confusion_matrix(y_test, y_pred_logreg)
print ("Confusion Matrix: \n", cm) ## why is the matrix 3x3?

Confusion Matrix: 
 [[   425  23678      4]
 [   416 131664      2]
 [   320   6370      5]]


In [14]:
## add labels, calculate scores

In [15]:
# classification report

print(classification_report(y_test, y_pred_logreg))

              precision    recall  f1-score   support

        Fair       0.37      0.02      0.03     24107
        Good       0.81      1.00      0.90    132082
        Poor       0.45      0.00      0.00      6695

    accuracy                           0.81    162884
   macro avg       0.54      0.34      0.31    162884
weighted avg       0.73      0.81      0.73    162884



## KNN Classifier

This one takes a long time to run.

In [16]:
# start with 15 due to large dataset
knn = KNeighborsClassifier(n_neighbors=6)

# fit to training data
knn.fit(X_train, y_train)

# predict
y_pred = knn.predict(X_test)

# accuracy scoring
print(knn.score(X_test, y_test))

0.7819736745168341


In [17]:
# accuracy scoring

print('Accuracy Score, Training Set: ', knn.score(X_train, y_train))
print('Accuracy Score, Test Set: ', knn.score(X_test, y_test))

Accuracy Score, Training Set:  0.7936605061690245
Accuracy Score, Test Set:  0.7819736745168341


In [18]:
## use GridSearch

knn = KNeighborsClassifier()
parameters = {'n_neighbors':[3, 10]}

gridsearch = GridSearchCV(knn, parameters)

# fit to data
gridsearch.fit(X, y)

GridSearchCV(cv=None, error_score=nan,
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='deprecated', n_jobs=None, param_grid={'n_neighbors': [3, 10]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [19]:
knn.get_params().keys()

dict_keys(['algorithm', 'leaf_size', 'metric', 'metric_params', 'n_jobs', 'n_neighbors', 'p', 'weights'])

In [20]:
# cross validation
cv_scores_knn = cross_val_score(knn, X, y, cv=5)

print('CV scores: ', cv_scores_knn)
print("Average 5-Fold CV Score: {}".format(np.mean(cv_scores_knn)))

CV scores:  [0.77301296 0.79154612 0.78173851 0.78559095 0.75418051]
Average 5-Fold CV Score: 0.7772138104629835


In [21]:
# confusion matrix

cm_knn = confusion_matrix(y_test, y_pred)
print ("Confusion Matrix: \n", cm_knn) ## why is the matrix 9x9?

Confusion Matrix: 
 [[  2348  21581    178]
 [  7046 124869    167]
 [   843   5698    154]]


In [22]:
# classification report

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

        Fair       0.23      0.10      0.14     24107
        Good       0.82      0.95      0.88    132082
        Poor       0.31      0.02      0.04      6695

    accuracy                           0.78    162884
   macro avg       0.45      0.36      0.35    162884
weighted avg       0.71      0.78      0.73    162884



## Decision Tree Classifier

In [23]:
decision_tree = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

y_pred_decision_tree = decision_tree.predict(X_test)

In [24]:
print('Accuracy Score, Training Set:', decision_tree.score(X_train, y_train))
print('Accuracy Score, Test Set:', decision_tree.score(X_test, y_test))

Accuracy Score, Training Set: 0.8255912706614741
Accuracy Score, Test Set: 0.8010547383413963


In [25]:
# cross validation

cv_scores_dtc = cross_val_score(decision_tree, X, y, cv=5)

print('CV scores: ', cv_scores_dtc)
print("Average 5-Fold CV Score: {}".format(np.mean(cv_scores_dtc)))

CV scores:  [0.79194518 0.80511408 0.80618846 0.80332599 0.77837722]
Average 5-Fold CV Score: 0.7969901847176284


In [26]:
# confusion matrix

cm_decision_tree = confusion_matrix(y_test, y_pred_decision_tree)
print ("Confusion Matrix: \n", cm_decision_tree) ## why is the matrix 9x9?

Confusion Matrix: 
 [[  1478  22367    262]
 [  2918 128799    365]
 [   516   5977    202]]


In [27]:
# classification report

print(classification_report(y_test, y_pred_decision_tree))

              precision    recall  f1-score   support

        Fair       0.30      0.06      0.10     24107
        Good       0.82      0.98      0.89    132082
        Poor       0.24      0.03      0.05      6695

    accuracy                           0.80    162884
   macro avg       0.45      0.36      0.35    162884
weighted avg       0.72      0.80      0.74    162884



## Random Forest Classifier

In [28]:
forest = RandomForestClassifier(n_estimators=100).fit(X_train, y_train)

y_pred_forest = forest.predict(X_test)

In [29]:
# accuracy score

print('Accuracy Score, Training Set:', forest.score(X_train, y_train))
print('Accuracy Score, Test Set:', forest.score(X_test, y_test))

Accuracy Score, Training Set: 0.8255851313104854
Accuracy Score, Test Set: 0.8046769480120822


In [30]:
## do GridSearch for optimal parameters
## training and test scores

In [31]:
# cross validation - 5-fold

cv_scores_forest = cross_val_score(forest, X, y, cv=5)

print(cv_scores_forest)
print("Average 5-Fold CV Score: {}".format(np.mean(cv_scores_forest)))

[0.79773919 0.80779237 0.80796887 0.80640334 0.78619721]
Average 5-Fold CV Score: 0.8012201953847453


In [72]:
# confusion matrix
cm_forest = confusion_matrix(y_test, y_pred_forest)

print ('Confusion Matrix: \n', cm_forest) ## why is the matrix 9x9?

Confusion Matrix: 
 [[   526  23452    129]
 [  2899 128499    684]
 [   137   6527     31]]


In [33]:
# classification report

print(classification_report(y_test, y_pred_forest))

              precision    recall  f1-score   support

        Fair       0.32      0.05      0.08     24107
        Good       0.82      0.98      0.89    132082
        Poor       0.26      0.03      0.06      6695

    accuracy                           0.80    162884
   macro avg       0.47      0.35      0.34    162884
weighted avg       0.72      0.80      0.74    162884



## Gaussian Naive Bayes

In [34]:
gaussian = GaussianNB().fit(X_train, y_train)
y_pred_gaussian = gaussian.predict(X_test)

In [35]:
# accuracy score
print('Accuracy Score, Training Set:', gaussian.score(X_train, y_train))
print('Accuracy Score, Test Set:', gaussian.score(X_test, y_test))

Accuracy: 0.7394096412170624
Accuracy Score, Training Set: 0.7382139809393616
Accuracy Score, Test Set: 0.7394096412170624


In [36]:
# cross validation - 5-fold

cv_scores_gaussian = cross_val_score(gaussian, X, y, cv=5)

print(cv_scores_gaussian)
print("Average 5-Fold CV Score: {}".format(np.mean(cv_scores_gaussian)))

[0.77301296 0.79154612 0.78173851 0.78559095 0.75418051]
Average 5-Fold CV Score: 0.7772138104629835


In [37]:
# confusion matrix

cm_gaussian = confusion_matrix(y_test, y_pred_gaussian)
print ('Confusion Matrix: \n', cm_gaussian) ## why is the matrix 9x9?

Confusion Matrix: 
 [[  2822  18304   2981]
 [  9371 116416   6295]
 [   663   4832   1200]]


In [38]:
# classification report

print(classification_report(y_test, y_pred_gaussian))

              precision    recall  f1-score   support

        Fair       0.22      0.12      0.15     24107
        Good       0.83      0.88      0.86    132082
        Poor       0.11      0.18      0.14      6695

    accuracy                           0.74    162884
   macro avg       0.39      0.39      0.38    162884
weighted avg       0.71      0.74      0.72    162884



## Categorical Naive Bayes

In [39]:
from sklearn.naive_bayes import CategoricalNB

categorical = CategoricalNB().fit(X_train, y_train)

y_pred_cate = categorical.predict(X_test)

In [40]:
# accuracy score
print('Accuracy Score, Training Set:', categorical.score(X_train, y_train))
print('Accuracy Score, Test Set:', categorical.score(X_test, y_test))

Accuracy: 0.800901254880774
Accuracy Score, Training Set: 0.8006921095014642
Accuracy Score, Test Set: 0.800901254880774


In [41]:
# confusion matrix

cm_categorical = confusion_matrix(y_test, y_pred_cate)
print ('Confusion Matrix: \n', cm_categorical) ## why is the matrix 9x9?

Confusion Matrix: 
 [[  1536  22310    261]
 [  3115 128696    271]
 [   584   5889    222]]


In [42]:
# classification report

print(classification_report(y_test, y_pred_cate))

              precision    recall  f1-score   support

        Fair       0.29      0.06      0.10     24107
        Good       0.82      0.97      0.89    132082
        Poor       0.29      0.03      0.06      6695

    accuracy                           0.80    162884
   macro avg       0.47      0.36      0.35    162884
weighted avg       0.72      0.80      0.74    162884



Since the precision and recall scores are highly accurate for good trees but extremely off-the-mark for fair and poor trees, we need to look into under-sampling and over-sampling techniques in order to increase the accuracies rates.