# Over Sampling

In [1]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import sklearn
import seaborn as sns
from sklearn import datasets
from sklearn import metrics
from collections import Counter

from sklearn.model_selection import (KFold, 
                                     cross_val_score, 
                                     GridSearchCV, 
                                     train_test_split)
from sklearn.metrics import (classification_report,
                             confusion_matrix,
                             average_precision_score,
                             precision_recall_curve,
                             recall_score,
                             f1_score)

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import CategoricalNB

import imblearn
from imblearn.over_sampling import (RandomOverSampler,
                                    SMOTE,
                                    ADASYN)

In [2]:
data = pd.read_csv('tree_ml.csv', index_col=0) # import data
tree = data.copy() # save a copy of data as tree

In [3]:
tree.head()

Unnamed: 0,health,health_l,num_problems,tree_dbh,root_stone_l,root_grate_l,root_other_l,trunk_wire_l,trnk_light_l,trnk_other_l,...,OnCurb,Harmful,Helpful,Unsure,Damage,Bronx,Brooklyn,Manhattan,Queens,Staten Island
0,Fair,1,0,3,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
1,Fair,1,1,21,1,0,0,0,0,0,...,1,0,0,0,1,0,0,0,1,0
2,Good,2,0,3,0,0,0,0,0,0,...,1,0,0,0,1,0,1,0,0,0
3,Good,2,1,10,1,0,0,0,0,0,...,1,0,0,0,1,0,1,0,0,0
4,Good,2,1,21,1,0,0,0,0,0,...,1,0,0,0,1,0,1,0,0,0


In [4]:
tree.shape

(651535, 26)

# Target and Response Variable, Train_Test_Split

In [5]:
tree_ml = tree.drop(columns='health_l') # keep the categorical column

In [6]:
# create targe and response variable
y = tree_ml['health'].values
X = tree_ml.drop('health', axis=1).values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(488651, 24) (488651,)
(162884, 24) (162884,)


# Baseline - DummyClassifier

In [7]:
dummy_clf = DummyClassifier(strategy='stratified')
dummy_clf.fit(X, y)
dc_pred = dummy_clf.predict(X)

print('Accuracy Score: ', dummy_clf.score(X, y))

Accuracy Score:  0.6816502567014819


In [8]:
dummy_clf_freq = DummyClassifier(strategy='most_frequent')
dummy_clf_freq.fit(X, y)
dc_pred_freq = dummy_clf_freq.predict(X)

print('Accuracy Score: ', dummy_clf.score(X, y))

Accuracy Score:  0.6807308893612776


# Algorithm Functions

In [9]:
# logistic regression

def logreg(X_train, X_test, y_train, y_test):
    logreg = LogisticRegression(random_state=42).fit(X_train, y_train)
    y_pred = logreg.predict(X_test)
    
    print('Logistic Regression \n')
    
    # accuracy scores
    print('Accuracy Score, Training Set: ', logreg.score(X_train, y_train))
    print('Accuracy Score, Test Set: ', logreg.score(X_test, y_test))
    print()
    
    # confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    
    print('Confusion Matrix: \n', cm)
    print()
    
    # classification report
    print('Classification Report \n')
    print(classification_report(y_test, y_pred))

In [10]:
# KNN classifier

def knn(X_train, X_test, y_train, y_test):
    # using 6 neighbors
    knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    
    print('KNN Classifier \n')
    
    # accuracy scores
    print('Accuracy Score, Training Set: ', knn.score(X_train, y_train))
    print('Accuracy Score, Test Set: ', knn.score(X_test, y_test))
    print()
    
    # confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    
    print('Confusion Matrix: \n', cm)
    print()
    
    # classificatin report
    print('Classification Report \n')
    print(classification_report(y_test, y_pred))

In [11]:
# decision tree classifier

def decision_tree(X_train, X_test, y_train, y_test):
    decision_tree = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
    y_pred = decision_tree.predict(X_test)
    
    print('Decision Tree Classifier \n')
    
    # accuracy scores
    print('Accuracy Score, Training Set:', decision_tree.score(X_train, y_train))
    print('Accuracy Score, Test Set:', decision_tree.score(X_test, y_test))
    
    # confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    
    print('Confusion Matrix: \n', cm)
    print()
    
    # classification report
    print('Classification Report \n')
    print(classification_report(y_test, y_pred))

In [12]:
# random forest classifier

def random_forest(X_train, X_test, y_train, y_test):
    rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    
    print('Random Forest Classifier \n')
    
    # accuracy scores
    print('Accuracy Score, Training Set:', rf.score(X_train, y_train))
    print('Accuracy Score, Test Set:', rf.score(X_test, y_test))
    
    # confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    
    print('Confusion Matrix: \n', cm)
    print()
    
    # classification report
    print('Classification Report \n')
    print(classification_report(y_test, y_pred))

In [13]:
# Gaussian naive bayes

def gaussian(X_train, X_test, y_train, y_test):
    gaussian = GaussianNB().fit(X_train, y_train)
    y_pred = gaussian.predict(X_test)
    
    print('Gaussian Naive Bayes \n')
    
    # accuracy scores
    print('Accuracy Score, Training Set:', gaussian.score(X_train, y_train))
    print('Accuracy Score, Test Set:', gaussian.score(X_test, y_test))
    
    # confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    
    print('Confusion Matrix: \n', cm)
    print()
    
    # classification report
    print('Classification Report \n')
    print(classification_report(y_test, y_pred))

In [14]:
# categorical naive bayes

def categorical_naive_bayes(X_train, X_test, y_train, y_test):
    categorical = CategoricalNB().fit(X_train, y_train)
    y_pred = categorical.predict(X_test)
    
    print('Categorical Naive Bayes \n')
    
    # accuracy scores
    print('Accuracy Score, Training Set:', categorical.score(X_train, y_train))
    print('Accuracy Score, Test Set:', categorical.score(X_test, y_test))
    
    # confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    
    print('Confusion Matrix: \n', cm)
    print()
    
    # classification report
    print('Classification Report \n')
    print(classification_report(y_test, y_pred))

In [15]:
# cross validation - 5-fold - for later
# cv_scores = cross_val_score(logreg, X, y, cv=5)
# print('CV Scores: {}'.format(cv_scores))
# print('Average 5-Fold CV Score: {}'.format(np.mean(cv_scores)))
# print()

# Random Over Sampler

In [16]:
ros = RandomOverSampler(random_state=42)
X_ros, y_ros = ros.fit_resample(X_train, y_train)

print('Resampled dataset shape %s' % Counter(y_ros))

Resampled dataset shape Counter({'Good': 396245, 'Fair': 396245, 'Poor': 396245})


In [17]:
# train test split
# X_train_rs, X_test_rs, y_train_rs, y_test_rs = train_test_split(X_ros, y_ros, test_size=0.25, random_state=42)

## Logistic Regression

In [18]:
logreg(X_ros, X_test, y_ros, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Logistic Regression 

Accuracy Score, Training Set:  0.4189007642578035
Accuracy Score, Test Set:  0.4878195525650156

Confusion Matrix: 
 [[ 4117 10414  9576]
 [17498 71776 42808]
 [  904  2226  3565]]

Classification Report 

              precision    recall  f1-score   support

        Fair       0.18      0.17      0.18     24107
        Good       0.85      0.54      0.66    132082
        Poor       0.06      0.53      0.11      6695

    accuracy                           0.49    162884
   macro avg       0.37      0.42      0.32    162884
weighted avg       0.72      0.49      0.57    162884



## KNN Classifier

In [19]:
knn(X_ros, X_test, y_ros, y_test)

KNN Classifier 

Accuracy Score, Training Set:  0.4456199237004042
Accuracy Score, Test Set:  0.44215515336067385

Confusion Matrix: 
 [[11179  9316  3612]
 [58072 59530 14480]
 [ 2991  2393  1311]]

Classification Report 

              precision    recall  f1-score   support

        Fair       0.15      0.46      0.23     24107
        Good       0.84      0.45      0.59    132082
        Poor       0.07      0.20      0.10      6695

    accuracy                           0.44    162884
   macro avg       0.35      0.37      0.31    162884
weighted avg       0.70      0.44      0.51    162884



## Decision Tree Classifier

In [20]:
decision_tree(X_ros, X_test, y_ros, y_test)

Decision Tree Classifier 

Accuracy Score, Training Set: 0.5155758011667866
Accuracy Score, Test Set: 0.5152316986321553
Confusion Matrix: 
 [[ 6061 10715  7331]
 [24240 75149 32693]
 [ 1538  2444  2713]]

Classification Report 

              precision    recall  f1-score   support

        Fair       0.19      0.25      0.22     24107
        Good       0.85      0.57      0.68    132082
        Poor       0.06      0.41      0.11      6695

    accuracy                           0.52    162884
   macro avg       0.37      0.41      0.34    162884
weighted avg       0.72      0.52      0.59    162884



## Random Forest Classifier

In [21]:
random_forest(X_ros, X_test, y_ros, y_test)

Random Forest Classifier 

Accuracy Score, Training Set: 0.5155505642552798
Accuracy Score, Test Set: 0.5160727879963655
Confusion Matrix: 
 [[ 5995 10772  7340]
 [23613 75324 33145]
 [ 1509  2445  2741]]

Classification Report 

              precision    recall  f1-score   support

        Fair       0.19      0.25      0.22     24107
        Good       0.85      0.57      0.68    132082
        Poor       0.06      0.41      0.11      6695

    accuracy                           0.52    162884
   macro avg       0.37      0.41      0.34    162884
weighted avg       0.72      0.52      0.59    162884



## Gaussian Naive Bayes

In [22]:
gaussian(X_ros, X_test, y_ros, y_test)

Gaussian Naive Bayes 

Accuracy Score, Training Set: 0.39171051580040966
Accuracy Score, Test Set: 0.7216792318459763
Confusion Matrix: 
 [[  1145  18142   4820]
 [  4922 114657  12503]
 [   230   4717   1748]]

Classification Report 

              precision    recall  f1-score   support

        Fair       0.18      0.05      0.08     24107
        Good       0.83      0.87      0.85    132082
        Poor       0.09      0.26      0.14      6695

    accuracy                           0.72    162884
   macro avg       0.37      0.39      0.35    162884
weighted avg       0.71      0.72      0.71    162884



## Categorical Naive Bayes

In [23]:
categorical_naive_bayes(X_ros, X_test, y_ros, y_test)

Categorical Naive Bayes 

Accuracy Score, Training Set: 0.4151657013548015
Accuracy Score, Test Set: 0.5525834336091943
Confusion Matrix: 
 [[ 6533 11698  5876]
 [29881 81087 21114]
 [ 1416  2892  2387]]

Classification Report 

              precision    recall  f1-score   support

        Fair       0.17      0.27      0.21     24107
        Good       0.85      0.61      0.71    132082
        Poor       0.08      0.36      0.13      6695

    accuracy                           0.55    162884
   macro avg       0.37      0.41      0.35    162884
weighted avg       0.72      0.55      0.61    162884



# SMOTE - Synthetic Minority Over-sampling Technique

Documentation: https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.over_sampling.SMOTE.html

In [24]:
sm = SMOTE(random_state=42)
X_sm, y_sm = sm.fit_resample(X_train, y_train)
print('Resampled dataset shape %s' % Counter(y_sm))

Resampled dataset shape Counter({'Good': 396245, 'Fair': 396245, 'Poor': 396245})


In [25]:
# train test split
# X_train_sm, X_test_sm, y_train_sm, y_test_sm = train_test_split(X_sm, y_sm, test_size=0.25, random_state=42)

## Logistic Regression

In [26]:
logreg(X_sm, X_test, y_sm, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Logistic Regression 

Accuracy Score, Training Set:  0.4246093536406348
Accuracy Score, Test Set:  0.47189410869084747

Confusion Matrix: 
 [[ 4927 10137  9043]
 [20547 68597 42938]
 [ 1159  2196  3340]]

Classification Report 

              precision    recall  f1-score   support

        Fair       0.18      0.20      0.19     24107
        Good       0.85      0.52      0.64    132082
        Poor       0.06      0.50      0.11      6695

    accuracy                           0.47    162884
   macro avg       0.36      0.41      0.32    162884
weighted avg       0.72      0.47      0.56    162884



## KNN Classifier

In [27]:
knn(X_sm, X_test, y_sm, y_test)

KNN Classifier 

Accuracy Score, Training Set:  0.4400568671739286
Accuracy Score, Test Set:  0.4453169126494929

Confusion Matrix: 
 [[11462  9397  3248]
 [59605 59884 12593]
 [ 3090  2416  1189]]

Classification Report 

              precision    recall  f1-score   support

        Fair       0.15      0.48      0.23     24107
        Good       0.84      0.45      0.59    132082
        Poor       0.07      0.18      0.10      6695

    accuracy                           0.45    162884
   macro avg       0.35      0.37      0.31    162884
weighted avg       0.70      0.45      0.52    162884



## Decision Tree Classifier

In [28]:
decision_tree(X_sm, X_test, y_sm, y_test)

Decision Tree Classifier 

Accuracy Score, Training Set: 0.5106360963545281
Accuracy Score, Test Set: 0.4982809852410304
Confusion Matrix: 
 [[ 5987 10361  7759]
 [23806 72353 35923]
 [ 1539  2334  2822]]

Classification Report 

              precision    recall  f1-score   support

        Fair       0.19      0.25      0.22     24107
        Good       0.85      0.55      0.67    132082
        Poor       0.06      0.42      0.11      6695

    accuracy                           0.50    162884
   macro avg       0.37      0.41      0.33    162884
weighted avg       0.72      0.50      0.58    162884



## Random Forest Classifier

In [29]:
random_forest(X_sm, X_test, y_sm, y_test)

Random Forest Classifier 

Accuracy Score, Training Set: 0.5106226366683911
Accuracy Score, Test Set: 0.5019830063112399
Confusion Matrix: 
 [[ 5818 10535  7754]
 [22995 73124 35963]
 [ 1496  2376  2823]]

Classification Report 

              precision    recall  f1-score   support

        Fair       0.19      0.24      0.21     24107
        Good       0.85      0.55      0.67    132082
        Poor       0.06      0.42      0.11      6695

    accuracy                           0.50    162884
   macro avg       0.37      0.41      0.33    162884
weighted avg       0.72      0.50      0.58    162884



## Gaussian Naive Bayes

In [30]:
gaussian(X_sm, X_test, y_sm, y_test)

Gaussian Naive Bayes 

Accuracy Score, Training Set: 0.3710759757220911
Accuracy Score, Test Set: 0.1569890228628963
Confusion Matrix: 
 [[  2414   2248  19445]
 [ 10362  17508 104212]
 [   525    521   5649]]

Classification Report 

              precision    recall  f1-score   support

        Fair       0.18      0.10      0.13     24107
        Good       0.86      0.13      0.23    132082
        Poor       0.04      0.84      0.08      6695

    accuracy                           0.16    162884
   macro avg       0.36      0.36      0.15    162884
weighted avg       0.73      0.16      0.21    162884



## Categorical Naive Bayes

In [31]:
categorical_naive_bayes(X_sm, X_test, y_sm, y_test)

Categorical Naive Bayes 

Accuracy Score, Training Set: 0.4187173760341876
Accuracy Score, Test Set: 0.5069804277891015
Confusion Matrix: 
 [[ 6955 10432  6720]
 [30959 72918 28205]
 [ 1519  2470  2706]]

Classification Report 

              precision    recall  f1-score   support

        Fair       0.18      0.29      0.22     24107
        Good       0.85      0.55      0.67    132082
        Poor       0.07      0.40      0.12      6695

    accuracy                           0.51    162884
   macro avg       0.37      0.41      0.34    162884
weighted avg       0.72      0.51      0.58    162884



# ADASYN - Adaptive Synthetic

In [32]:
ada = ADASYN(random_state=42)
X_ada, y_ada = ada.fit_resample(X_train, y_train)

print('Resampled dataset shape %s' % Counter(y_ada))

Resampled dataset shape Counter({'Poor': 399940, 'Good': 396245, 'Fair': 387605})


In [33]:
# train test split
# X_train_ada, X_test_ada, y_train_ada, y_test_ada = train_test_split(X_ada, y_ada, test_size=0.25, random_state=42)

## Logistic Regression

In [34]:
logreg(X_ada, X_test, y_ada, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Logistic Regression 

Accuracy Score, Training Set:  0.41523158668344895
Accuracy Score, Test Set:  0.444408290562609

Confusion Matrix: 
 [[ 3997  9697 10413]
 [17588 64639 49855]
 [  840  2104  3751]]

Classification Report 

              precision    recall  f1-score   support

        Fair       0.18      0.17      0.17     24107
        Good       0.85      0.49      0.62    132082
        Poor       0.06      0.56      0.11      6695

    accuracy                           0.44    162884
   macro avg       0.36      0.41      0.30    162884
weighted avg       0.71      0.44      0.53    162884



## KNN Classifier

In [35]:
knn(X_ada, X_test, y_ada, y_test)

KNN Classifier 

Accuracy Score, Training Set:  0.41389266677366765
Accuracy Score, Test Set:  0.5410230593551238

Confusion Matrix: 
 [[ 9252 12409  2446]
 [44857 77947  9278]
 [ 2436  3334   925]]

Classification Report 

              precision    recall  f1-score   support

        Fair       0.16      0.38      0.23     24107
        Good       0.83      0.59      0.69    132082
        Poor       0.07      0.14      0.10      6695

    accuracy                           0.54    162884
   macro avg       0.36      0.37      0.34    162884
weighted avg       0.70      0.54      0.60    162884



## Decision Tree Classifier

In [36]:
decision_tree(X_ada, X_test, y_ada, y_test)

Decision Tree Classifier 

Accuracy Score, Training Set: 0.5006301793392409
Accuracy Score, Test Set: 0.44144299010338645
Confusion Matrix: 
 [[ 7087  8760  8260]
 [31630 61904 38548]
 [ 1811  1971  2913]]

Classification Report 

              precision    recall  f1-score   support

        Fair       0.17      0.29      0.22     24107
        Good       0.85      0.47      0.60    132082
        Poor       0.06      0.44      0.10      6695

    accuracy                           0.44    162884
   macro avg       0.36      0.40      0.31    162884
weighted avg       0.72      0.44      0.53    162884



## Random Forest Classifier

In [37]:
random_forest(X_ada, X_test, y_ada, y_test)

Random Forest Classifier 

Accuracy Score, Training Set: 0.5006031475177185
Accuracy Score, Test Set: 0.4452984946342182
Confusion Matrix: 
 [[ 6824  8983  8300]
 [30505 62750 38827]
 [ 1718  2019  2958]]

Classification Report 

              precision    recall  f1-score   support

        Fair       0.17      0.28      0.22     24107
        Good       0.85      0.48      0.61    132082
        Poor       0.06      0.44      0.10      6695

    accuracy                           0.45    162884
   macro avg       0.36      0.40      0.31    162884
weighted avg       0.72      0.45      0.53    162884



## Gaussian Naive Bayes

In [38]:
gaussian(X_ada, X_test, y_ada, y_test)

Gaussian Naive Bayes 

Accuracy Score, Training Set: 0.3654981035487713
Accuracy Score, Test Set: 0.11891898529014513
Confusion Matrix: 
 [[  2043   1717  20347]
 [ 10956  11456 109670]
 [   393    431   5871]]

Classification Report 

              precision    recall  f1-score   support

        Fair       0.15      0.08      0.11     24107
        Good       0.84      0.09      0.16    132082
        Poor       0.04      0.88      0.08      6695

    accuracy                           0.12    162884
   macro avg       0.35      0.35      0.12    162884
weighted avg       0.71      0.12      0.15    162884



## Categorical Naive Bayes

In [39]:
categorical_naive_bayes(X_ada, X_test, y_ada, y_test)

Categorical Naive Bayes 

Accuracy Score, Training Set: 0.4102484393346793
Accuracy Score, Test Set: 0.44108690847474274
Confusion Matrix: 
 [[ 6313  8881  8913]
 [28892 62220 40970]
 [ 1359  2023  3313]]

Classification Report 

              precision    recall  f1-score   support

        Fair       0.17      0.26      0.21     24107
        Good       0.85      0.47      0.61    132082
        Poor       0.06      0.49      0.11      6695

    accuracy                           0.44    162884
   macro avg       0.36      0.41      0.31    162884
weighted avg       0.72      0.44      0.53    162884



# Combination Over-sampling and Under-sampling

In [40]:
from imblearn.combine import SMOTETomek

In [41]:
smt = SMOTETomek(random_state=42)
X_smt, y_smt = smt.fit_sample(X_train, y_train)

print('Resampled dataset shape %s' % Counter(y_smt))

Resampled dataset shape Counter({'Poor': 396226, 'Fair': 396145, 'Good': 396134})


In [42]:
# train test split
# X_train_smt, X_test_smt, y_train_smt, y_test_smt = train_test_split(X_smt, y_smt, test_size=0.25, random_state=42)

## Logistic Regression

In [43]:
logreg(X_smt, X_test, y_smt, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Logistic Regression 

Accuracy Score, Training Set:  0.424885044656943
Accuracy Score, Test Set:  0.4710898553571867

Confusion Matrix: 
 [[ 5531 10022  8554]
 [22401 68038 41643]
 [ 1345  2186  3164]]

Classification Report 

              precision    recall  f1-score   support

        Fair       0.19      0.23      0.21     24107
        Good       0.85      0.52      0.64    132082
        Poor       0.06      0.47      0.11      6695

    accuracy                           0.47    162884
   macro avg       0.37      0.41      0.32    162884
weighted avg       0.72      0.47      0.55    162884



## KNN Classifier

In [44]:
knn(X_smt, X_test, y_smt, y_test)

KNN Classifier 

Accuracy Score, Training Set:  0.4417112254470953
Accuracy Score, Test Set:  0.4511185874610152

Confusion Matrix: 
 [[11136  9567  3404]
 [57817 61126 13139]
 [ 2981  2496  1218]]

Classification Report 

              precision    recall  f1-score   support

        Fair       0.15      0.46      0.23     24107
        Good       0.84      0.46      0.60    132082
        Poor       0.07      0.18      0.10      6695

    accuracy                           0.45    162884
   macro avg       0.35      0.37      0.31    162884
weighted avg       0.70      0.45      0.52    162884



## Decision Tree Classifier

In [45]:
decision_tree(X_smt, X_test, y_smt, y_test)

Decision Tree Classifier 

Accuracy Score, Training Set: 0.5105413944409153
Accuracy Score, Test Set: 0.4983792146558287
Confusion Matrix: 
 [[ 5991 10370  7746]
 [23796 72363 35923]
 [ 1534  2337  2824]]

Classification Report 

              precision    recall  f1-score   support

        Fair       0.19      0.25      0.22     24107
        Good       0.85      0.55      0.67    132082
        Poor       0.06      0.42      0.11      6695

    accuracy                           0.50    162884
   macro avg       0.37      0.41      0.33    162884
weighted avg       0.72      0.50      0.58    162884



## Random Forest Classifier

In [46]:
random_forest(X_smt, X_test, y_smt, y_test)

Random Forest Classifier 

Accuracy Score, Training Set: 0.510518676825087
Accuracy Score, Test Set: 0.503051251197171
Confusion Matrix: 
 [[ 5802 10545  7760]
 [22795 73308 35979]
 [ 1477  2389  2829]]

Classification Report 

              precision    recall  f1-score   support

        Fair       0.19      0.24      0.21     24107
        Good       0.85      0.56      0.67    132082
        Poor       0.06      0.42      0.11      6695

    accuracy                           0.50    162884
   macro avg       0.37      0.41      0.33    162884
weighted avg       0.72      0.50      0.58    162884



## Gaussian Naive Bayes

In [47]:
gaussian(X_smt, X_test, y_smt, y_test)

Gaussian Naive Bayes 

Accuracy Score, Training Set: 0.37112591028224534
Accuracy Score, Test Set: 0.15695832617077185
Confusion Matrix: 
 [[  2408   2248  19451]
 [ 10351  17509 104222]
 [   525    521   5649]]

Classification Report 

              precision    recall  f1-score   support

        Fair       0.18      0.10      0.13     24107
        Good       0.86      0.13      0.23    132082
        Poor       0.04      0.84      0.08      6695

    accuracy                           0.16    162884
   macro avg       0.36      0.36      0.15    162884
weighted avg       0.73      0.16      0.21    162884



## Categorical Naive Bayes

In [48]:
categorical_naive_bayes(X_smt, X_test, y_smt, y_test)

Categorical Naive Bayes 

Accuracy Score, Training Set: 0.4183305917938923
Accuracy Score, Test Set: 0.5074838535399425
Confusion Matrix: 
 [[ 6949 10459  6699]
 [30941 73010 28131]
 [ 1519  2474  2702]]

Classification Report 

              precision    recall  f1-score   support

        Fair       0.18      0.29      0.22     24107
        Good       0.85      0.55      0.67    132082
        Poor       0.07      0.40      0.12      6695

    accuracy                           0.51    162884
   macro avg       0.37      0.41      0.34    162884
weighted avg       0.72      0.51      0.58    162884



# SMOTE Extensions - Borderline SMOTE

In [49]:
from imblearn.over_sampling import BorderlineSMOTE

In [50]:
bsmt = BorderlineSMOTE(random_state=42)
X_bsmt, y_bsmt = bsmt.fit_resample(X_train, y_train)

print('Resampled dataset shape %s' % Counter(y_bsmt))

Resampled dataset shape Counter({'Good': 396245, 'Fair': 396245, 'Poor': 396245})


In [51]:
# train test split
# X_train_bsmt, X_test_bsmt, y_train_bsmt, y_test_bsmt = train_test_split(X_bsmt, y_bsmt, test_size=0.25, random_state=42)

## Logistic Regression

In [52]:
logreg(X_bsmt, X_test, y_bsmt, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Logistic Regression 

Accuracy Score, Training Set:  0.4651709590446988
Accuracy Score, Test Set:  0.5187863755801675

Confusion Matrix: 
 [[ 4632 11385  8090]
 [19686 76755 35641]
 [ 1040  2540  3115]]

Classification Report 

              precision    recall  f1-score   support

        Fair       0.18      0.19      0.19     24107
        Good       0.85      0.58      0.69    132082
        Poor       0.07      0.47      0.12      6695

    accuracy                           0.52    162884
   macro avg       0.37      0.41      0.33    162884
weighted avg       0.72      0.52      0.59    162884



## KNN Classifier

In [53]:
knn(X_bsmt, X_test, y_bsmt, y_test)

KNN Classifier 

Accuracy Score, Training Set:  0.5120880599965509
Accuracy Score, Test Set:  0.5333795830161342

Confusion Matrix: 
 [[ 9439 11769  2899]
 [44991 76375 10716]
 [ 2529  3101  1065]]

Classification Report 

              precision    recall  f1-score   support

        Fair       0.17      0.39      0.23     24107
        Good       0.84      0.58      0.68    132082
        Poor       0.07      0.16      0.10      6695

    accuracy                           0.53    162884
   macro avg       0.36      0.38      0.34    162884
weighted avg       0.71      0.53      0.59    162884



## Decision Tree Classifier

In [54]:
decision_tree(X_bsmt, X_test, y_bsmt, y_test)

Decision Tree Classifier 

Accuracy Score, Training Set: 0.6194711184578564
Accuracy Score, Test Set: 0.4364087325949756
Confusion Matrix: 
 [[ 8589  8531  6987]
 [39808 59952 32322]
 [ 2205  1947  2543]]

Classification Report 

              precision    recall  f1-score   support

        Fair       0.17      0.36      0.23     24107
        Good       0.85      0.45      0.59    132082
        Poor       0.06      0.38      0.10      6695

    accuracy                           0.44    162884
   macro avg       0.36      0.40      0.31    162884
weighted avg       0.72      0.44      0.52    162884



## Random Forest Classifier

In [55]:
random_forest(X_bsmt, X_test, y_bsmt, y_test)

Random Forest Classifier 

Accuracy Score, Training Set: 0.6194652298451715
Accuracy Score, Test Set: 0.4409088676604209
Confusion Matrix: 
 [[ 8485  8693  6929]
 [39098 60798 32186]
 [ 2167  1994  2534]]

Classification Report 

              precision    recall  f1-score   support

        Fair       0.17      0.35      0.23     24107
        Good       0.85      0.46      0.60    132082
        Poor       0.06      0.38      0.10      6695

    accuracy                           0.44    162884
   macro avg       0.36      0.40      0.31    162884
weighted avg       0.72      0.44      0.52    162884



## Gaussian Naive Bayes

In [56]:
gaussian(X_bsmt, X_test, y_bsmt, y_test)

Gaussian Naive Bayes 

Accuracy Score, Training Set: 0.4091778234846286
Accuracy Score, Test Set: 0.2671901475896957
Confusion Matrix: 
 [[ 2137  4713 17257]
 [ 9983 36127 85972]
 [  456   982  5257]]

Classification Report 

              precision    recall  f1-score   support

        Fair       0.17      0.09      0.12     24107
        Good       0.86      0.27      0.42    132082
        Poor       0.05      0.79      0.09      6695

    accuracy                           0.27    162884
   macro avg       0.36      0.38      0.21    162884
weighted avg       0.73      0.27      0.36    162884



## Categorical Naive Bayes

In [57]:
categorical_naive_bayes(X_bsmt, X_test, y_bsmt, y_test)

Categorical Naive Bayes 

Accuracy Score, Training Set: 0.4647520263136864
Accuracy Score, Test Set: 0.5686746396208344
Confusion Matrix: 
 [[ 6453 12181  5473]
 [29629 83903 18550]
 [ 1405  3018  2272]]

Classification Report 

              precision    recall  f1-score   support

        Fair       0.17      0.27      0.21     24107
        Good       0.85      0.64      0.73    132082
        Poor       0.09      0.34      0.14      6695

    accuracy                           0.57    162884
   macro avg       0.37      0.41      0.36    162884
weighted avg       0.72      0.57      0.63    162884



# SVM-SMOTE

In [58]:
from imblearn.over_sampling import SVMSMOTE

In [59]:
svmsm = SVMSMOTE(random_state=42)
X_svmsm, y_svmsm = svmsm.fit_resample(X_train, y_train)

print('Resampled dataset shape %s' % Counter(y_svmsm))

Resampled dataset shape Counter({'Good': 396245, 'Fair': 396245, 'Poor': 396245})


In [60]:
# train test split
# X_train_svmsm, X_test_svmsm, y_train_svmsm, y_test_svmsm = train_test_split(X_svmsm, y_svmsm, test_size=0.25, random_state=42)

## Logistic Regression

In [61]:
logreg(X_svmsm, X_test, y_svmsm, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Logistic Regression 

Accuracy Score, Training Set:  0.5867228608562884
Accuracy Score, Test Set:  0.720960929250264

Confusion Matrix: 
 [[  4934  17359   1814]
 [ 17179 111605   3298]
 [  1393   4408    894]]

Classification Report 

              precision    recall  f1-score   support

        Fair       0.21      0.20      0.21     24107
        Good       0.84      0.84      0.84    132082
        Poor       0.15      0.13      0.14      6695

    accuracy                           0.72    162884
   macro avg       0.40      0.39      0.40    162884
weighted avg       0.72      0.72      0.72    162884



## KNN Classifier

In [62]:
knn(X_svmsm, X_test, y_svmsm, y_test)

KNN Classifier 

Accuracy Score, Training Set:  0.6602253656197554
Accuracy Score, Test Set:  0.6128410402495027

Confusion Matrix: 
 [[ 7140 14528  2439]
 [31771 91649  8662]
 [ 1828  3834  1033]]

Classification Report 

              precision    recall  f1-score   support

        Fair       0.18      0.30      0.22     24107
        Good       0.83      0.69      0.76    132082
        Poor       0.09      0.15      0.11      6695

    accuracy                           0.61    162884
   macro avg       0.36      0.38      0.36    162884
weighted avg       0.70      0.61      0.65    162884



## Decision Tree Classifier

In [63]:
decision_tree(X_svmsm, X_test, y_svmsm, y_test)

Decision Tree Classifier 

Accuracy Score, Training Set: 0.7173137831392195
Accuracy Score, Test Set: 0.6405969892684364
Confusion Matrix: 
 [[ 4383 14831  4893]
 [13744 97989 20349]
 [ 1230  3494  1971]]

Classification Report 

              precision    recall  f1-score   support

        Fair       0.23      0.18      0.20     24107
        Good       0.84      0.74      0.79    132082
        Poor       0.07      0.29      0.12      6695

    accuracy                           0.64    162884
   macro avg       0.38      0.41      0.37    162884
weighted avg       0.72      0.64      0.67    162884



## Random Forest Classifier

In [64]:
random_forest(X_svmsm, X_test, y_svmsm, y_test)

Random Forest Classifier 

Accuracy Score, Training Set: 0.7173087357569181
Accuracy Score, Test Set: 0.645367255224577
Confusion Matrix: 
 [[ 4158 15106  4843]
 [12784 98998 20300]
 [ 1162  3569  1964]]

Classification Report 

              precision    recall  f1-score   support

        Fair       0.23      0.17      0.20     24107
        Good       0.84      0.75      0.79    132082
        Poor       0.07      0.29      0.12      6695

    accuracy                           0.65    162884
   macro avg       0.38      0.41      0.37    162884
weighted avg       0.72      0.65      0.68    162884



## Gaussian Naive Bayes

In [65]:
gaussian(X_svmsm, X_test, y_svmsm, y_test)

Gaussian Naive Bayes 

Accuracy Score, Training Set: 0.5074781174946477
Accuracy Score, Test Set: 0.587117212249208
Confusion Matrix: 
 [[ 2710 14429  6968]
 [ 9700 90571 31811]
 [  745  3599  2351]]

Classification Report 

              precision    recall  f1-score   support

        Fair       0.21      0.11      0.15     24107
        Good       0.83      0.69      0.75    132082
        Poor       0.06      0.35      0.10      6695

    accuracy                           0.59    162884
   macro avg       0.37      0.38      0.33    162884
weighted avg       0.71      0.59      0.64    162884



## Categorical Naive Bayes

In [66]:
categorical_naive_bayes(X_svmsm, X_test, y_svmsm, y_test)

Categorical Naive Bayes 

Accuracy Score, Training Set: 0.5874126697708068
Accuracy Score, Test Set: 0.699663564254316
Confusion Matrix: 
 [[  6182  16464   1461]
 [ 22407 107017   2658]
 [  1576   4354    765]]

Classification Report 

              precision    recall  f1-score   support

        Fair       0.20      0.26      0.23     24107
        Good       0.84      0.81      0.82    132082
        Poor       0.16      0.11      0.13      6695

    accuracy                           0.70    162884
   macro avg       0.40      0.39      0.39    162884
weighted avg       0.72      0.70      0.71    162884

