# General Overview - Machine Learning

We are using random forest, decision trees, logistic regression, and SVC to model our data.

In [1]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import sklearn
import seaborn as sns
from sklearn import datasets
from sklearn import metrics
from collections import Counter

from sklearn.model_selection import (KFold, 
                                     cross_val_score, 
                                     GridSearchCV, 
                                     train_test_split)
from sklearn.metrics import (mean_squared_error,
                             classification_report,
                             mean_absolute_error,
                             accuracy_score,
                             confusion_matrix,
                             average_precision_score,
                             precision_recall_curve,
                             recall_score,
                             f1_score)

from sklearn.metrics import plot_precision_recall_curve ## WORKS NOW TRY IT

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

import imblearn
from imblearn.under_sampling import (RandomUnderSampler, 
                                     ClusterCentroids,
                                     TomekLinks,
                                     NeighbourhoodCleaningRule,
                                     NearMiss)

In [2]:
data = pd.read_csv('tree_ml.csv', index_col=0) # import data
tree = data.copy() # save a copy of data as tree

In [3]:
tree.head()

Unnamed: 0,health,health_labels,num_problems,tree_dbh,root_stone_labels,root_grate_labels,root_other_labels,trunk_wire_labels,trnk_light_labels,trnk_other_labels,...,Helpful,None.1,Unsure,Damage,NoDamage,Bronx,Brooklyn,Manhattan,Queens,Staten Island
0,Fair,1,0,3,0,0,0,0,0,0,...,0,1,0,0,1,0,0,0,1,0
1,Fair,1,1,21,1,0,0,0,0,0,...,0,1,0,1,0,0,0,0,1,0
2,Good,2,0,3,0,0,0,0,0,0,...,0,1,0,1,0,0,1,0,0,0
3,Good,2,1,10,1,0,0,0,0,0,...,0,1,0,1,0,0,1,0,0,0
4,Good,2,1,21,1,0,0,0,0,0,...,0,1,0,1,0,0,1,0,0,0


In [4]:
tree.shape

(651535, 30)

# Machine Learning Models

Description goes here.

## Target and Response Variable, Train_Test_Split

In [5]:
tree_ml = tree.drop(columns='health_labels') # keep the categorical column

In [6]:
# create targe and response variable
y = tree_ml['health'].values
X = tree_ml.drop('health', axis=1).values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state=23456789, stratify=y)

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(488651, 28) (488651,)
(162884, 28) (162884,)


## Baseline - DummyClassifier

We start by using the DummyClassifier to make predictions using simple rules.

In [7]:
dummy_clf = DummyClassifier(strategy='stratified')
dummy_clf.fit(X, y)
dc_pred = dummy_clf.predict(X)

print('Accuracy Score: ', dummy_clf.score(X, y))

Accuracy Score:  0.6806311249587512


In [8]:
dummy_clf_freq = DummyClassifier(strategy='most_frequent')
dummy_clf_freq.fit(X, y)
dc_pred_freq = dummy_clf_freq.predict(X)

print('Accuracy Score: ', dummy_clf.score(X, y))

Accuracy Score:  0.6812957093632729


## Logistic Regression

In [9]:
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred_logreg = logreg.predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [10]:
print('Accuracy Score, Training Set: ', logreg.score(X_train, y_train))
print('Accuracy Score, Test Set: ', logreg.score(X_test, y_test))

Accuracy Score, Training Set:  0.8106685548581708
Accuracy Score, Test Set:  0.8110250239434199


In [11]:
## rebalance data - results consistently horrible

## oversampling or undersampling to obtain better results

In [12]:
# cross validation - 5-fold

cv_scores = cross_val_score(logreg, X, y, cv=5)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

In [13]:
print(cv_scores)
print("Average 5-Fold CV Score: {}".format(np.mean(cv_scores)))

[0.80918907 0.81161411 0.81119971 0.81205921 0.80682542]
Average 5-Fold CV Score: 0.8101775038946488


In [14]:
# confusion matrix

cm = confusion_matrix(y_test, y_pred_logreg)
print ("Confusion Matrix: \n", cm) ## why is the matrix 3x3?

Confusion Matrix: 
 [[   413  23688      6]
 [   395 131682      5]
 [   297   6390      8]]


In [15]:
## add labels, calculate scores

In [16]:
# classification report

print(classification_report(y_test, y_pred_logreg))

              precision    recall  f1-score   support

        Fair       0.37      0.02      0.03     24107
        Good       0.81      1.00      0.90    132082
        Poor       0.42      0.00      0.00      6695

    accuracy                           0.81    162884
   macro avg       0.54      0.34      0.31    162884
weighted avg       0.73      0.81      0.73    162884



## KNN Classifier

This one takes a long time to run.

In [68]:
# start with 15 due to large dataset
knn = KNeighborsClassifier(n_neighbors=15)

# fit to training data
knn.fit(X_train, y_train)

# predict
y_pred = knn.predict(X_test)

# accuracy scoring
print(knn.score(X_test, y_test))

0.8089867635863559


In [69]:
# accuracy scoring

print('Accuracy Score, Training Set: ', knn.score(X_train, y_train))
print('Accuracy Score, Test Set: ', knn.score(X_test, y_test))

Accuracy Score, Training Set:  0.8131713636112481
Accuracy Score, Test Set:  0.8089867635863559


In [70]:
## use GridSearch

knn = KNeighborsClassifier()
parameters = {'n_neighbors':[10, 20]}

gridsearch = GridSearchCV(knn, parameters)

# fit to data
gridsearch.fit(X, y)

GridSearchCV(cv=None, error_score=nan,
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='deprecated', n_jobs=None,
             param_grid={'n_neighbors': [10, 20]}, pre_dispatch='2*n_jobs',
             refit=True, return_train_score=False, scoring=None, verbose=0)

In [71]:
knn.get_params().keys()

dict_keys(['algorithm', 'leaf_size', 'metric', 'metric_params', 'n_jobs', 'n_neighbors', 'p', 'weights'])

In [72]:
sorted(gridsearch.cv_results_.keys())

['mean_fit_time',
 'mean_score_time',
 'mean_test_score',
 'param_n_neighbors',
 'params',
 'rank_test_score',
 'split0_test_score',
 'split1_test_score',
 'split2_test_score',
 'split3_test_score',
 'split4_test_score',
 'std_fit_time',
 'std_score_time',
 'std_test_score']

In [73]:
sorted(gridsearch.cv_results_)

['mean_fit_time',
 'mean_score_time',
 'mean_test_score',
 'param_n_neighbors',
 'params',
 'rank_test_score',
 'split0_test_score',
 'split1_test_score',
 'split2_test_score',
 'split3_test_score',
 'split4_test_score',
 'std_fit_time',
 'std_score_time',
 'std_test_score']

In [74]:
# cross validation
cv_scores_knn = cross_val_score(knn, X, y, cv=5)

print('CV scores: ', cv_scores_knn)
print("Average 5-Fold CV Score: {}".format(np.mean(cv_scores_knn)))

CV scores:  [0.77355783 0.79189913 0.78118597 0.78547584 0.75378145]
Average 5-Fold CV Score: 0.7771800440498208


In [75]:
# confusion matrix

cm_knn = confusion_matrix(y_test, y_pred)
print ("Confusion Matrix: \n", cm_knn) ## why is the matrix 9x9?

Confusion Matrix: 
 [[   822  23178    107]
 [  1173 130810     99]
 [   375   6181    139]]


In [76]:
# classification report

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

        Fair       0.35      0.03      0.06     24107
        Good       0.82      0.99      0.90    132082
        Poor       0.40      0.02      0.04      6695

    accuracy                           0.81    162884
   macro avg       0.52      0.35      0.33    162884
weighted avg       0.73      0.81      0.74    162884



## Decision Tree Classifier

In [25]:
decision_tree = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

y_pred_decision_tree = decision_tree.predict(X_test)

In [26]:
print('Accuracy Score, Training Set:', decision_tree.score(X_train, y_train))
print('Accuracy Score, Test Set:', decision_tree.score(X_test, y_test))

Accuracy Score, Training Set: 0.8255912706614741
Accuracy Score, Test Set: 0.8010915743719457


In [27]:
# cross validation

cv_scores_dtc = cross_val_score(decision_tree, X, y, cv=5)

print('CV scores: ', cv_scores_dtc)
print("Average 5-Fold CV Score: {}".format(np.mean(cv_scores_dtc)))

CV scores:  [0.79180704 0.80516012 0.80606568 0.80344878 0.77818536]
Average 5-Fold CV Score: 0.7969333957500364


In [28]:
# confusion matrix

cm_decision_tree = confusion_matrix(y_test, y_pred_decision_tree)
print ("Confusion Matrix: \n", cm_decision_tree) ## why is the matrix 9x9?

Confusion Matrix: 
 [[  1471  22368    268]
 [  2915 128806    361]
 [   508   5979    208]]


In [29]:
# classification report

print(classification_report(y_test, y_pred_decision_tree))

              precision    recall  f1-score   support

        Fair       0.30      0.06      0.10     24107
        Good       0.82      0.98      0.89    132082
        Poor       0.25      0.03      0.06      6695

    accuracy                           0.80    162884
   macro avg       0.46      0.36      0.35    162884
weighted avg       0.72      0.80      0.74    162884



## Random Forest Classifier

In [30]:
forest = RandomForestClassifier(n_estimators=100).fit(X_train, y_train)

y_pred_forest = forest.predict(X_test)

In [31]:
# accuracy score

print('Accuracy Score, Training Set:', forest.score(X_train, y_train))
print('Accuracy Score, Test Set:', forest.score(X_test, y_test))

Accuracy Score, Training Set: 0.825587177760815
Accuracy Score, Test Set: 0.804572579258859


In [32]:
## do GridSearch for optimal parameters
## training and test scores

In [33]:
# cross validation - 5-fold

cv_scores_forest = cross_val_score(forest, X, y, cv=5)

print(cv_scores_forest)
print("Average 5-Fold CV Score: {}".format(np.mean(cv_scores_forest)))

[0.79809987 0.80739331 0.80815305 0.80663357 0.78623558]
Average 5-Fold CV Score: 0.8013030765806901


In [34]:
# confusion matrix
cm_forest = confusion_matrix(y_test, y_pred_forest)

print ('Confusion Matrix: \n', cm_forest) ## why is the matrix 9x9?

Confusion Matrix: 
 [[  1123  22698    286]
 [  1990 129700    392]
 [   426   6040    229]]


In [35]:
# classification report

print(classification_report(y_test, y_pred_forest))

              precision    recall  f1-score   support

        Fair       0.32      0.05      0.08     24107
        Good       0.82      0.98      0.89    132082
        Poor       0.25      0.03      0.06      6695

    accuracy                           0.80    162884
   macro avg       0.46      0.35      0.34    162884
weighted avg       0.72      0.80      0.74    162884



## Gaussian Naive Bayes

In [36]:
gaussian = GaussianNB().fit(X_train, y_train)

y_pred_gaussian = gaussian.predict(X_test)

In [37]:
# accuracy score

print('Accuracy:',metrics.accuracy_score(y_test, y_pred_gaussian))

print('Accuracy Score, Training Set:', gaussian.score(X_train, y_train))
print('Accuracy Score, Test Set:', gaussian.score(X_test, y_test))

Accuracy: 0.7370828319540287
Accuracy Score, Training Set: 0.7358298663053999
Accuracy Score, Test Set: 0.7370828319540287


In [38]:
# cross validation - 5-fold

cv_scores_gaussian = cross_val_score(knn, X, y, cv=5)

print(cv_scores_gaussian)
print("Average 5-Fold CV Score: {}".format(np.mean(cv_scores_gaussian)))

[0.77355783 0.79189913 0.78118597 0.78547584 0.75378145]
Average 5-Fold CV Score: 0.7771800440498208


In [39]:
# confusion matrix

cm_gaussian = confusion_matrix(y_test, y_pred_gaussian)
print ('Confusion Matrix: \n', cm_gaussian) ## why is the matrix 9x9?

Confusion Matrix: 
 [[  2780  18251   3076]
 [  9399 116048   6635]
 [   661   4803   1231]]


In [40]:
# classification report

print(classification_report(y_test, y_pred_gaussian))

              precision    recall  f1-score   support

        Fair       0.22      0.12      0.15     24107
        Good       0.83      0.88      0.86    132082
        Poor       0.11      0.18      0.14      6695

    accuracy                           0.74    162884
   macro avg       0.39      0.39      0.38    162884
weighted avg       0.71      0.74      0.72    162884



## Categorical Naive Bayes

In [41]:
from sklearn.naive_bayes import CategoricalNB

categorical = CategoricalNB().fit(X_train, y_train)

y_pred_cate = categorical.predict(X_test)

In [42]:
# accuracy score

print('Accuracy:', metrics.accuracy_score(y_test, y_pred_cate))

print('Accuracy Score, Training Set:', categorical.score(X_train, y_train))
print('Accuracy Score, Test Set:', categorical.score(X_test, y_test))

Accuracy: 0.8007047960511775
Accuracy Score, Training Set: 0.8005058825214724
Accuracy Score, Test Set: 0.8007047960511775


In [43]:
# confusion matrix

cm_categorical = confusion_matrix(y_test, y_pred_cate)
print ('Confusion Matrix: \n', cm_categorical) ## why is the matrix 9x9?

Confusion Matrix: 
 [[  1550  22290    267]
 [  3153 128633    296]
 [   567   5889    239]]


In [44]:
# classification report

print(classification_report(y_test, y_pred_cate))

              precision    recall  f1-score   support

        Fair       0.29      0.06      0.11     24107
        Good       0.82      0.97      0.89    132082
        Poor       0.30      0.04      0.06      6695

    accuracy                           0.80    162884
   macro avg       0.47      0.36      0.35    162884
weighted avg       0.72      0.80      0.74    162884



Since the precision and recall scores are highly accurate for good trees but extremely off-the-mark for fair and poor trees, we need to look into under-sampling and over-sampling techniques in order to increase the accuracies rates.

# Handling Imbalanced Data: Over and Under Sampling

## Under Sampling

### Random Under-sampling

In [45]:
random_under = RandomUnderSampler(random_state=42)
X_rs, y_rs = random_under.fit_sample(X, y)

In [46]:
print('Random undersampling {}'.format(Counter(y_rs)))

Random undersampling Counter({'Fair': 26781, 'Good': 26781, 'Poor': 26781})


In [47]:
X_train_rs, X_test_rs, y_train_rs, y_test_rs = train_test_split(X_rs, y_rs, test_size=0.25, random_state=42)

In [48]:
# logistic regression

In [49]:
logreg_rs = LogisticRegression().fit(X_train_rs, y_train_rs)

y_pred_logreg_rs = logreg_rs.predict(X_test_rs)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [50]:
print(classification_report(y_test_rs, y_pred_logreg_rs))

              precision    recall  f1-score   support

        Fair       0.39      0.17      0.24      6648
        Good       0.42      0.53      0.47      6744
        Poor       0.42      0.53      0.47      6694

    accuracy                           0.41     20086
   macro avg       0.41      0.41      0.39     20086
weighted avg       0.41      0.41      0.39     20086



Less than half of all three groups are correctly placed.

In [51]:
# KNN classifier

In [52]:
# create KNN with 15 neighbbors
knn_rs = KNeighborsClassifier(n_neighbors=15)

# fit to training data
knn_rs.fit(X_train_rs, y_train_rs)

# predict
y_pred_knn_rs = knn_rs.predict(X_test_rs)

# accuracy scoring
print(knn_rs.score(X_test_rs, y_test_rs))

0.3976401473663248


In [53]:
print(classification_report(y_test_rs, y_pred_knn_rs))

              precision    recall  f1-score   support

        Fair       0.35      0.38      0.37      6648
        Good       0.42      0.41      0.41      6744
        Poor       0.42      0.41      0.41      6694

    accuracy                           0.40     20086
   macro avg       0.40      0.40      0.40     20086
weighted avg       0.40      0.40      0.40     20086



In [54]:
# decision tree classifier

In [55]:
decision_tree_rs = DecisionTreeClassifier(random_state=42).fit(X_train_rs, y_train_rs)

y_pred_decision_tree_rs = decision_tree_rs.predict(X_test_rs)

In [56]:
# accuracy score

print(decision_tree_rs.score(X_test_rs, y_test_rs))
print('Accuracy Score, Training Set:', decision_tree_rs.score(X_train_rs, y_train_rs))
print('Accuracy Score, Test Set:', decision_tree_rs.score(X_test_rs, y_test_rs))

0.40022901523449167
Accuracy Score, Training Set: 0.543107024909969
Accuracy Score, Test Set: 0.40022901523449167


In [57]:
print(classification_report(y_test_rs, y_pred_decision_tree_rs))

              precision    recall  f1-score   support

        Fair       0.37      0.31      0.33      6648
        Good       0.40      0.50      0.45      6744
        Poor       0.43      0.39      0.41      6694

    accuracy                           0.40     20086
   macro avg       0.40      0.40      0.40     20086
weighted avg       0.40      0.40      0.40     20086



### ClusterCentroids -  this one takes a long time

In [58]:
# cluster = ClusterCentroids(random_state=42)
# X_rs, y_rs = cluster.fit_sample(X, y)
# print('Cluster centriods undersampling {}'.format(Counter(y_rs)))

### TomekLinks

In [59]:
tomek = TomekLinks()
X_rs, y_rs = tomek.fit_sample(X, y)
print('TomekLinks undersampling {}'.format(Counter(y_rs)))

TomekLinks undersampling Counter({'Good': 527601, 'Fair': 95768, 'Poor': 26781})


In [60]:
X_train_rs, X_test_rs, y_train_rs, y_test_rs = train_test_split(X_rs, y_rs, test_size=0.25, random_state=42)

In [61]:
# logistic regression

logreg_rs = LogisticRegression().fit(X_train_rs, y_train_rs)

y_pred_logreg_rs = logreg_rs.predict(X_test_rs)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [62]:
print(classification_report(y_test_rs, y_pred_logreg_rs))

              precision    recall  f1-score   support

        Fair       0.34      0.02      0.03     23788
        Good       0.81      1.00      0.90    131921
        Poor       0.64      0.00      0.00      6829

    accuracy                           0.81    162538
   macro avg       0.60      0.34      0.31    162538
weighted avg       0.74      0.81      0.73    162538



In [63]:
# decision tree classifier

decision_tree_rs = DecisionTreeClassifier(random_state=42).fit(X_train_rs, y_train_rs)

y_pred_decision_tree_rs = decision_tree_rs.predict(X_test_rs)

In [64]:
print(classification_report(y_test_rs, y_pred_decision_tree_rs))

              precision    recall  f1-score   support

        Fair       0.29      0.06      0.10     23788
        Good       0.82      0.98      0.89    131921
        Poor       0.25      0.03      0.06      6829

    accuracy                           0.80    162538
   macro avg       0.46      0.36      0.35    162538
weighted avg       0.72      0.80      0.74    162538



In [65]:
# gaussian naive bayes

gaussian_rs = GaussianNB().fit(X_train_rs, y_train_rs)

y_pred_gaussian_rs = gaussian_rs.predict(X_test_rs)

In [66]:
# accuracy score

print('Accuracy Score, Training Set:', gaussian_rs.score(X_train_rs, y_train_rs))
print('Accuracy Score, Test Set:', gaussian_rs.score(X_test_rs, y_test_rs))

Accuracy Score, Training Set: 0.7341041647867567
Accuracy Score, Test Set: 0.7328932311213378


In [67]:
print(classification_report(y_test_rs, y_pred_gaussian_rs))

              precision    recall  f1-score   support

        Fair       0.20      0.11      0.15     23788
        Good       0.83      0.87      0.85    131921
        Poor       0.12      0.19      0.15      6829

    accuracy                           0.73    162538
   macro avg       0.38      0.39      0.38    162538
weighted avg       0.71      0.73      0.72    162538



## Over Sampling

In [None]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_sample(X_train, y_train)