# Random Over Sampling Methods

In [1]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import sklearn
import seaborn as sns
from sklearn import datasets
from sklearn import metrics
from collections import Counter

from sklearn.model_selection import (KFold, 
                                     cross_val_score, 
                                     GridSearchCV, 
                                     train_test_split)
from sklearn.metrics import (mean_squared_error,
                             classification_report,
                             mean_absolute_error,
                             accuracy_score,
                             confusion_matrix,
                             average_precision_score,
                             precision_recall_curve,
                             recall_score,
                             f1_score)

from sklearn.metrics import plot_precision_recall_curve ## WORKS NOW TRY IT

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import CategoricalNB

import imblearn
from imblearn.over_sampling import (RandomOverSampler,
                                    SMOTE,
                                    ADASYN)

In [2]:
data = pd.read_csv('tree_ml.csv', index_col=0) # import data
tree = data.copy() # save a copy of data as tree

In [3]:
tree.head()

Unnamed: 0,health,health_l,num_problems,tree_dbh,root_stone_l,root_grate_l,root_other_l,trunk_wire_l,trnk_light_l,trnk_other_l,...,OnCurb,Harmful,Helpful,Unsure,Damage,Bronx,Brooklyn,Manhattan,Queens,Staten Island
0,Fair,1,0,3,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
1,Fair,1,1,21,1,0,0,0,0,0,...,1,0,0,0,1,0,0,0,1,0
2,Good,2,0,3,0,0,0,0,0,0,...,1,0,0,0,1,0,1,0,0,0
3,Good,2,1,10,1,0,0,0,0,0,...,1,0,0,0,1,0,1,0,0,0
4,Good,2,1,21,1,0,0,0,0,0,...,1,0,0,0,1,0,1,0,0,0


In [4]:
tree.shape

(651535, 26)

# Target and Response Variable, Train_Test_Split

In [5]:
tree_ml = tree.drop(columns='health_l') # keep the categorical column

In [6]:
# create targe and response variable
y = tree_ml['health'].values
X = tree_ml.drop('health', axis=1).values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state=42, stratify=y)

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(488651, 24) (488651,)
(162884, 24) (162884,)


# Baseline - DummyClassifier

In [7]:
dummy_clf = DummyClassifier(strategy='stratified')
dummy_clf.fit(X, y)
dc_pred = dummy_clf.predict(X)

print('Accuracy Score: ', dummy_clf.score(X, y))

Accuracy Score:  0.6814200311571903


In [8]:
dummy_clf_freq = DummyClassifier(strategy='most_frequent')
dummy_clf_freq.fit(X, y)
dc_pred_freq = dummy_clf_freq.predict(X)

print('Accuracy Score: ', dummy_clf.score(X, y))

Accuracy Score:  0.681100785069106


# Random Over Sampler

In [9]:
ros = RandomOverSampler(random_state=42)
X_ros, y_ros = ros.fit_resample(X, y)

print('Resampled dataset shape %s' % Counter(y_ros))

Resampled dataset shape Counter({'Fair': 528327, 'Good': 528327, 'Poor': 528327})


In [10]:
X_train_rs, X_test_rs, y_train_rs, y_test_rs = train_test_split(X_ros, y_ros, test_size=0.25, random_state=42)

## Logistic Regression

In [11]:
logreg_rs = LogisticRegression().fit(X_train_rs, y_train_rs)
y_pred_logreg_rs = logreg_rs.predict(X_test_rs)

# accuracy scores
print('Accuracy Score, Training Set: ', logreg_rs.score(X_train_rs, y_train_rs))
print('Accuracy Score, Test Set: ', logreg_rs.score(X_test_rs, y_test_rs))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Accuracy Score, Training Set:  0.4181268323049292
Accuracy Score, Test Set:  0.41764207083478444


In [12]:
print(classification_report(y_test_rs, y_pred_logreg_rs))

              precision    recall  f1-score   support

        Fair       0.39      0.17      0.24    132500
        Good       0.41      0.55      0.47    131527
        Poor       0.43      0.54      0.48    132219

    accuracy                           0.42    396246
   macro avg       0.41      0.42      0.40    396246
weighted avg       0.41      0.42      0.39    396246



## KNN Classifier

In [13]:
# using 6 neighbors
knn_rs = KNeighborsClassifier(n_neighbors=6)
knn_rs.fit(X_train_rs, y_train_rs)
y_pred_knn_rs = knn_rs.predict(X_test_rs)

# accuracy scores
print('Accuracy Score, Training Set: ', knn_rs.score(X_train_rs, y_train_rs))
print('Accuracy Score, Test Set: ', knn_rs.score(X_test_rs, y_test_rs))

Accuracy Score, Training Set:  0.45059243649762143
Accuracy Score, Test Set:  0.4412435709130187


In [14]:
print(classification_report(y_test_rs, y_pred_knn_rs))

              precision    recall  f1-score   support

        Fair       0.41      0.48      0.44    132500
        Good       0.43      0.39      0.41    131527
        Poor       0.50      0.45      0.47    132219

    accuracy                           0.44    396246
   macro avg       0.44      0.44      0.44    396246
weighted avg       0.44      0.44      0.44    396246



## Decision Tree Classifier

In [16]:
decision_tree_ros = DecisionTreeClassifier(random_state=42).fit(X_train_rs, y_train_rs)
y_pred_decision_tree_ros = decision_tree_ros.predict(X_test_rs)

# accuracy score
print('Accuracy Score, Training Set:', decision_tree_ros.score(X_train_rs, y_train_rs))
print('Accuracy Score, Test Set:', decision_tree_ros.score(X_test_rs, y_test_rs))

Accuracy Score, Training Set: 0.5073813760005383
Accuracy Score, Test Set: 0.4940945776108781


In [18]:
print(classification_report(y_test_rs, y_pred_decision_tree_ros))

              precision    recall  f1-score   support

        Fair       0.53      0.32      0.40    132500
        Good       0.46      0.58      0.51    131527
        Poor       0.52      0.58      0.55    132219

    accuracy                           0.49    396246
   macro avg       0.50      0.49      0.49    396246
weighted avg       0.50      0.49      0.49    396246



## Random Forest Classifier

In [20]:
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train_rs, y_train_rs)
y_pred_forest_rs = rf.predict(X_test_rs)

# accuracy score
print('Accuracy Score, Training Set:', rf.score(X_train_rs, y_train_rs))
print('Accuracy Score, Test Set:', rf.score(X_test_rs, y_test_rs))

Accuracy Score, Training Set: 0.5073645513928672
Accuracy Score, Test Set: 0.49455893561070646


In [21]:
print(classification_report(y_test_rs, y_pred_forest_rs))

              precision    recall  f1-score   support

        Fair       0.53      0.32      0.40    132500
        Good       0.46      0.59      0.51    131527
        Poor       0.52      0.58      0.55    132219

    accuracy                           0.49    396246
   macro avg       0.50      0.49      0.49    396246
weighted avg       0.50      0.49      0.49    396246



## Gaussian Naive Bayes

In [22]:
gaussian = GaussianNB().fit(X_train_rs, y_train_rs)
y_pred_g = gaussian.predict(X_test_rs)

# accuracy score
print('Accuracy Score, Training Set:', gaussian.score(X_train_rs, y_train_rs))
print('Accuracy Score, Test Set:', gaussian.score(X_test_rs, y_test_rs))

Accuracy Score, Training Set: 0.39208654578186053
Accuracy Score, Test Set: 0.391463888594459


In [23]:
print(classification_report(y_test_rs, y_pred_g))

              precision    recall  f1-score   support

        Fair       0.40      0.06      0.11    132500
        Good       0.37      0.87      0.52    131527
        Poor       0.47      0.25      0.33    132219

    accuracy                           0.39    396246
   macro avg       0.42      0.39      0.32    396246
weighted avg       0.42      0.39      0.32    396246



## Categorical Naive Bayes

In [24]:
categorical = CategoricalNB()
categorical.fit(X_train_rs, y_train_rs)
y_pred_cnb = categorical.predict(X_test_rs)

# accuracy score
print('Accuracy Score, Training Set:', categorical.score(X_train_rs, y_train_rs))
print('Accuracy Score, Test Set:', categorical.score(X_test_rs, y_test_rs))

Accuracy Score, Training Set: 0.4157840056867174
Accuracy Score, Test Set: 0.4148685412597225


In [25]:
print(classification_report(y_test_rs, y_pred_cnb))

              precision    recall  f1-score   support

        Fair       0.39      0.27      0.32    132500
        Good       0.40      0.62      0.48    131527
        Poor       0.47      0.36      0.41    132219

    accuracy                           0.41    396246
   macro avg       0.42      0.42      0.40    396246
weighted avg       0.42      0.41      0.40    396246



# SMOTE - Synthetic Minority Over-sampling Technique

Documentation for SMOTE is found here: https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.over_sampling.SMOTE.html

In [26]:
sm = SMOTE(random_state=42)
X_sm, y_sm = sm.fit_resample(X, y)
print('Resampled dataset shape %s' % Counter(y_sm))

Resampled dataset shape Counter({'Fair': 528327, 'Good': 528327, 'Poor': 528327})


In [27]:
# train test split
X_train_sm, X_test_sm, y_train_sm, y_test_sm = train_test_split(X_sm, y_sm, test_size=0.25, random_state=42)

## Logistic Regression

In [28]:
logreg = LogisticRegression().fit(X_train_sm, y_train_sm)
y_pred_lr = logreg.predict(X_test_sm)

# accuracy scores
print('Accuracy Score, Training Set: ', logreg.score(X_train_sm, y_train_sm))
print('Accuracy Score, Test Set: ', logreg.score(X_test_sm, y_test_sm))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Accuracy Score, Training Set:  0.4214404387857681
Accuracy Score, Test Set:  0.4209935242248502


In [29]:
print(classification_report(y_test_sm, y_pred_lr))

              precision    recall  f1-score   support

        Fair       0.41      0.18      0.25    132500
        Good       0.42      0.52      0.46    131527
        Poor       0.43      0.56      0.49    132219

    accuracy                           0.42    396246
   macro avg       0.42      0.42      0.40    396246
weighted avg       0.42      0.42      0.40    396246



## KNN Classifier

In [31]:
# using 6 neighbors
knn = KNeighborsClassifier(n_neighbors=6)
knn.fit(X_train_sm, y_train_sm)
y_pred_knn = knn.predict(X_test_sm)

# accuracy scores
print('Accuracy Score, Training Set: ', knn.score(X_train_sm, y_train_sm))
print('Accuracy Score, Test Set: ', knn.score(X_test_sm, y_test_sm))

Accuracy Score, Training Set:  0.4488923098924487
Accuracy Score, Test Set:  0.43925743098983966


In [32]:
print(classification_report(y_test_sm, y_pred_knn))

              precision    recall  f1-score   support

        Fair       0.40      0.50      0.45    132500
        Good       0.44      0.37      0.40    131527
        Poor       0.49      0.45      0.47    132219

    accuracy                           0.44    396246
   macro avg       0.44      0.44      0.44    396246
weighted avg       0.44      0.44      0.44    396246



## Decision Tree Classifier

In [33]:
dtree = DecisionTreeClassifier(random_state=42)
dtree.fit(X_train_sm, y_train_sm)
y_pred_dtree = dtree.predict(X_test_sm)

# accuracy score
print('Accuracy Score, Training Set:', dtree.score(X_train_sm, y_train_sm))
print('Accuracy Score, Test Set:', dtree.score(X_test_sm, y_test_sm))

Accuracy Score, Training Set: 0.5036522017102214
Accuracy Score, Test Set: 0.4879594999066237


In [34]:
print(classification_report(y_test_sm, y_pred_dtree))

              precision    recall  f1-score   support

        Fair       0.51      0.32      0.40    132500
        Good       0.45      0.56      0.50    131527
        Poor       0.51      0.58      0.54    132219

    accuracy                           0.49    396246
   macro avg       0.49      0.49      0.48    396246
weighted avg       0.49      0.49      0.48    396246



## Random Forest Classifier

In [35]:
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train_sm, y_train_sm)
y_pred_rf = rf.predict(X_test_sm)

# accuracy score
print('Accuracy Score, Training Set:', rf.score(X_train_sm, y_train_sm))
print('Accuracy Score, Test Set:', rf.score(X_test_sm, y_test_sm))

Accuracy Score, Training Set: 0.5036387420240844
Accuracy Score, Test Set: 0.4890875870040329


In [36]:
print(classification_report(y_test_sm, y_pred_rf))

              precision    recall  f1-score   support

        Fair       0.52      0.32      0.39    132500
        Good       0.46      0.57      0.51    131527
        Poor       0.51      0.58      0.54    132219

    accuracy                           0.49    396246
   macro avg       0.49      0.49      0.48    396246
weighted avg       0.49      0.49      0.48    396246



## Gaussian Naive Bayes

In [37]:
gaussian = GaussianNB()
gaussian.fit(X_train_sm, y_train_sm)
y_pred_gaussian = gaussian.predict(X_test_sm)

# accuracy score
print('Accuracy Score, Training Set:', gaussian.score(X_train_sm, y_train_sm))
print('Accuracy Score, Test Set:', gaussian.score(X_test_sm, y_test_sm))

Accuracy Score, Training Set: 0.370339899136477
Accuracy Score, Test Set: 0.37038354961311915


In [38]:
print(classification_report(y_test_sm, y_pred_gaussian))

              precision    recall  f1-score   support

        Fair       0.39      0.09      0.15    132500
        Good       0.51      0.13      0.20    131527
        Poor       0.35      0.89      0.51    132219

    accuracy                           0.37    396246
   macro avg       0.42      0.37      0.29    396246
weighted avg       0.42      0.37      0.29    396246



## Categorical Naive Bayes

In [39]:
categorical = CategoricalNB()
categorical.fit(X_train_sm, y_train_sm)
y_pred_cnb = categorical.predict(X_test_sm)

# accuracy score
print('Accuracy Score, Training Set:', categorical.score(X_train_sm, y_train_sm))
print('Accuracy Score, Test Set:', categorical.score(X_test_sm, y_test_sm))

Accuracy Score, Training Set: 0.4184473410810652
Accuracy Score, Test Set: 0.41731651549794824


In [40]:
print(classification_report(y_test_sm, y_pred_cnb))

              precision    recall  f1-score   support

        Fair       0.39      0.28      0.33    132500
        Good       0.40      0.56      0.47    131527
        Poor       0.46      0.41      0.43    132219

    accuracy                           0.42    396246
   macro avg       0.42      0.42      0.41    396246
weighted avg       0.42      0.42      0.41    396246



# ADASYN - Adaptive Synthetic

In [41]:
ada = ADASYN(random_state=42)
X_ada, y_ada = ada.fit_resample(X, y)

print('Resampled dataset shape %s' % Counter(y_ada))

Resampled dataset shape Counter({'Poor': 535394, 'Good': 528327, 'Fair': 526527})


In [42]:
# train test split

X_train_ada, X_test_ada, y_train_ada, y_test_ada = train_test_split(X_ada, y_ada, test_size=0.25, random_state=42)

## Logistic Regression

In [43]:
logreg = LogisticRegression().fit(X_train_ada, y_train_ada)
y_pred_lr = logreg.predict(X_test_ada)

# accuracy scores
print('Accuracy Score, Training Set: ', logreg.score(X_train_ada, y_train_ada))
print('Accuracy Score, Test Set: ', logreg.score(X_test_ada, y_test_ada))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Accuracy Score, Training Set:  0.41042487293386526
Accuracy Score, Test Set:  0.4095361226676594


In [44]:
print(classification_report(y_test_ada, y_pred_lr))

              precision    recall  f1-score   support

        Fair       0.39      0.13      0.19    132088
        Good       0.40      0.53      0.46    131397
        Poor       0.42      0.57      0.48    134077

    accuracy                           0.41    397562
   macro avg       0.40      0.41      0.38    397562
weighted avg       0.40      0.41      0.38    397562



## KNN Classifier

In [46]:
# using 6 neighbors
knn = KNeighborsClassifier(n_neighbors=6)
knn.fit(X_train_ada, y_train_ada)
y_pred_knn = knn.predict(X_test_ada)

# accuracy scores
print('Accuracy Score, Training Set: ', knn.score(X_train_ada, y_train_ada))
print('Accuracy Score, Test Set: ', knn.score(X_test_ada, y_test_ada))

Accuracy Score, Training Set:  0.4271744616772562
Accuracy Score, Test Set:  0.4178895367263471


In [55]:
print(classification_report(y_test_ada, y_pred_knn))

              precision    recall  f1-score   support

        Fair       0.38      0.47      0.42    132088
        Good       0.41      0.37      0.39    131397
        Poor       0.47      0.41      0.44    134077

    accuracy                           0.42    397562
   macro avg       0.42      0.42      0.42    397562
weighted avg       0.42      0.42      0.42    397562



## Decision Tree Classifier

In [47]:
dtree = DecisionTreeClassifier(random_state=42)
dtree.fit(X_train_ada, y_train_ada)
y_pred_dtree = dtree.predict(X_test_ada)

# accuracy score
print('Accuracy Score, Training Set:', dtree.score(X_train_ada, y_train_ada))
print('Accuracy Score, Test Set:', dtree.score(X_test_ada, y_test_ada))

Accuracy Score, Training Set: 0.49205826177216805
Accuracy Score, Test Set: 0.47872533089178543


In [48]:
print(classification_report(y_test_ada, y_pred_dtree))

              precision    recall  f1-score   support

        Fair       0.46      0.39      0.42    132088
        Good       0.47      0.46      0.46    131397
        Poor       0.50      0.59      0.54    134077

    accuracy                           0.48    397562
   macro avg       0.48      0.48      0.48    397562
weighted avg       0.48      0.48      0.48    397562



## Random Forest Classifier

In [49]:
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train_ada, y_train_ada)
y_pred_rf = rf.predict(X_test_ada)

# accuracy score
print('Accuracy Score, Training Set:', rf.score(X_train_ada, y_train_ada))
print('Accuracy Score, Test Set:', rf.score(X_test_ada, y_test_ada))

Accuracy Score, Training Set: 0.4920448466738102
Accuracy Score, Test Set: 0.48007606360768884


In [50]:
print(classification_report(y_test_ada, y_pred_rf))

              precision    recall  f1-score   support

        Fair       0.46      0.39      0.42    132088
        Good       0.47      0.47      0.47    131397
        Poor       0.50      0.59      0.54    134077

    accuracy                           0.48    397562
   macro avg       0.48      0.48      0.48    397562
weighted avg       0.48      0.48      0.48    397562



## Gaussian Naive Bayes

In [51]:
gaussian = GaussianNB()
gaussian.fit(X_train_ada, y_train_ada)
y_pred_gaussian = gaussian.predict(X_test_ada)

# accuracy score
print('Accuracy Score, Training Set:', gaussian.score(X_train_ada, y_train_ada))
print('Accuracy Score, Test Set:', gaussian.score(X_test_ada, y_test_ada))

Accuracy Score, Training Set: 0.3664409576367963
Accuracy Score, Test Set: 0.3675200346109538


In [52]:
print(classification_report(y_test_ada, y_pred_gaussian))

              precision    recall  f1-score   support

        Fair       0.38      0.11      0.17    132088
        Good       0.49      0.09      0.15    131397
        Poor       0.36      0.90      0.51    134077

    accuracy                           0.37    397562
   macro avg       0.41      0.36      0.28    397562
weighted avg       0.41      0.37      0.28    397562



## Categorical Naive Bayes

In [53]:
categorical = CategoricalNB()
categorical.fit(X_train_ada, y_train_ada)
y_pred_cnb = categorical.predict(X_test_ada)

# accuracy score
print('Accuracy Score, Training Set:', categorical.score(X_train_ada, y_train_ada))
print('Accuracy Score, Test Set:', categorical.score(X_test_ada, y_test_ada))

Accuracy Score, Training Set: 0.40931812731934475
Accuracy Score, Test Set: 0.4105598623610908


In [54]:
print(classification_report(y_test_ada, y_pred_cnb))

              precision    recall  f1-score   support

        Fair       0.39      0.25      0.30    132088
        Good       0.40      0.49      0.44    131397
        Poor       0.43      0.49      0.46    134077

    accuracy                           0.41    397562
   macro avg       0.41      0.41      0.40    397562
weighted avg       0.41      0.41      0.40    397562

