# General Overview - Machine Learning

We are using random forest, decision trees, logistic regression, and SVC to model our data.

In [1]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import sklearn
import seaborn as sns

from sklearn import datasets
from sklearn import metrics
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import classification_report
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

# from sklearn.metrics import plot_precision_recall_curve ## does not work?

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import CategoricalNB
from sklearn.naive_bayes import GaussianNB

In [2]:
## Unbalanced data set - handle this in the machine learning
## Oversampling or undersampling or random sampling

## Statistical analysis is not for target variables

## Stratified sampling - happens during machine learning
## Python has lots of APIs to apply stratified sampling

In [3]:
data = pd.read_csv('tree_ml.csv', index_col=0) # import data
tree = data.copy() # save a copy of data as tree

In [4]:
tree.head()

Unnamed: 0,health,health_labels,num_problems,tree_dbh,root_stone_labels,root_grate_labels,root_other_labels,trunk_wire_labels,trnk_light_labels,trnk_other_labels,...,Helpful,None.1,Unsure,Damage,NoDamage,Bronx,Brooklyn,Manhattan,Queens,Staten Island
0,Fair,1,0,3,0,0,0,0,0,0,...,0,1,0,0,1,0,0,0,1,0
1,Fair,1,1,21,1,0,0,0,0,0,...,0,1,0,1,0,0,0,0,1,0
2,Good,2,0,3,0,0,0,0,0,0,...,0,1,0,1,0,0,1,0,0,0
3,Good,2,1,10,1,0,0,0,0,0,...,0,1,0,1,0,0,1,0,0,0
4,Good,2,1,21,1,0,0,0,0,0,...,0,1,0,1,0,0,1,0,0,0


In [5]:
tree.shape

(651535, 30)

# Machine Learning Models

Description goes here.

## Target and Response Variable, Train_Test_Split

In [6]:
tree_ml = tree.drop(columns='health') # keep the numerical health column: health_labels

In [7]:
# create targe and response variable
y = tree_ml['health_labels'].values
X = tree_ml.drop('health_labels', axis=1).values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state=23456789, stratify=y)

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(488651, 28) (488651,)
(162884, 28) (162884,)


## Baseline - DummyClassifier

We start by using the DummyClassifier to make predictions using simple rules.

In [8]:
dummy_clf = DummyClassifier(strategy='stratified')
dummy_clf.fit(X, y)
dc_pred = dummy_clf.predict(X)

print('Accuracy Score: ', dummy_clf.score(X, y))

Accuracy Score:  0.6816978366473021


In [9]:
dummy_clf_freq = DummyClassifier(strategy='most_frequent')
dummy_clf_freq.fit(X, y)
dc_pred_freq = dummy_clf_freq.predict(X)

print('Accuracy Score: ', dummy_clf.score(X, y))

Accuracy Score:  0.6813954737657992


In [10]:
## add precision and recall score to this part

## Logistic Regression

In [11]:
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred_logreg = logreg.predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [12]:
print('Accuracy Score, Training Set: ', logreg.score(X_train, y_train))
print('Accuracy Score, Test Set: ', logreg.score(X_test, y_test))

Accuracy Score, Training Set:  0.8106992516131145
Accuracy Score, Test Set:  0.8109267945286216


In [13]:
# similar scores, no overfitting
# look into precision and recall

# need to know how the model performs for each category

In [14]:
# classification report

print(classification_report(y_test, y_pred_logreg))

              precision    recall  f1-score   support

           0       0.36      0.00      0.00      6695
           1       0.37      0.02      0.03     24107
           2       0.81      1.00      0.90    132082

    accuracy                           0.81    162884
   macro avg       0.51      0.34      0.31    162884
weighted avg       0.73      0.81      0.73    162884



In [15]:
## rebalance data - results consistently horrible

## oversampling or undersampling to obtain better results

## send article about sampling

In [16]:
# cross validation - 5-fold

cv_scores = cross_val_score(logreg, X, y, cv=5)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

In [17]:
print(cv_scores)
print("Average 5-Fold CV Score: {}".format(np.mean(cv_scores)))

[0.80919674 0.8115297  0.81120738 0.81205921 0.80682542]
Average 5-Fold CV Score: 0.8101636903619912


In [18]:
# model performance - confusion matrix

cm = confusion_matrix(y_test, y_pred_logreg)
print ("Confusion Matrix: \n", cm) ## why is the matrix 3x3?

Confusion Matrix: 
 [[     5    301   6389]
 [     7    418  23682]
 [     2    416 131664]]


In [19]:
## add labels, calculate scores

## KNN Classifier

This one takes a long time to run.

In [20]:
# response and target variables are X and y, respectively
# split with train_test_split

# create KNN with 7 neighbbors
knn = KNeighborsClassifier(n_neighbors=7)

# fit to training data
knn.fit(X_train, y_train)

# accuracy scoring
print(knn.score(X_test, y_test))

0.7934542373713809


In [21]:
# could increasing the number of neighbors increase accuracy scores?

# create KNN with 7 neighbbors
knn = KNeighborsClassifier(n_neighbors=9)

# fit to training data
knn.fit(X_train, y_train)

# accuracy scoring
print(knn.score(X_test, y_test))

0.7985069128950665


In [22]:
# increase to 15!

# create KNN with 7 neighbbors
knn = KNeighborsClassifier(n_neighbors=15)

# fit to training data
knn.fit(X_train, y_train)

# predict
y_pred = knn.predict(X_test)

# accuracy scoring
print(knn.score(X_test, y_test))

0.8062179219567299


In [23]:
## use GridSearch

In [24]:
## look at test score and training score

In [25]:
# cross validation
cv_scores_knn = cross_val_score(knn, X, y, cv=5)

print('CV scores: ', cv_scores_knn)
print("Average 5-Fold CV Score: {}".format(np.mean(cv_scores_knn)))

CV scores:  [0.79946588 0.79934309 0.8065645  0.80951138 0.79018779]
Average 5-Fold CV Score: 0.8010145272318449


In [26]:
# confusion matrix

cm_knn = confusion_matrix(y_test, y_pred)
print ("Confusion Matrix: \n", cm_knn) ## why is the matrix 9x9?

Confusion Matrix: 
 [[   181    370   6144]
 [   172    927  23008]
 [   129   1741 130212]]


In [27]:
# classification report

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.38      0.03      0.05      6695
           1       0.31      0.04      0.07     24107
           2       0.82      0.99      0.89    132082

    accuracy                           0.81    162884
   macro avg       0.50      0.35      0.34    162884
weighted avg       0.72      0.81      0.74    162884



## Decision Tree Classifier

In [28]:
decision_tree = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

y_pred_decision_tree = decision_tree.predict(X_test)

In [29]:
# accuracy score
print(decision_tree.score(X_test, y_test))

0.7996795265342207


In [30]:
print('Accuracy Score, Training Set:', decision_tree.score(X_train, y_train))
print('Accuracy Score, Test Set:', decision_tree.score(X_test, y_test))

Accuracy Score, Training Set: 0.8255851313104854
Accuracy Score, Test Set: 0.7996795265342207


In [31]:
# confusion matrix

cm_decision_tree = confusion_matrix(y_test, y_pred_decision_tree)
print ("Confusion Matrix: \n", cm_decision_tree) ## why is the matrix 9x9?

Confusion Matrix: 
 [[   305    461   5929]
 [   477   1401  22229]
 [   720   2813 128549]]


In [32]:
# classification report

print(classification_report(y_test, y_pred_decision_tree))

              precision    recall  f1-score   support

           0       0.20      0.05      0.07      6695
           1       0.30      0.06      0.10     24107
           2       0.82      0.97      0.89    132082

    accuracy                           0.80    162884
   macro avg       0.44      0.36      0.35    162884
weighted avg       0.72      0.80      0.74    162884



In [33]:
# cross validation

cv_scores_dtc = cross_val_score(decision_tree, X, y, cv=5)

print('CV scores: ', cv_scores_dtc)
print("Average 5-Fold CV Score: {}".format(np.mean(cv_scores_dtc)))

CV scores:  [0.78996524 0.80413178 0.80542104 0.80261997 0.77589846]
Average 5-Fold CV Score: 0.7956072966149171


## Random Forest Classifier

In [34]:
forest = RandomForestClassifier(n_estimators=100).fit(X_train, y_train)

y_pred_forest = forest.predict(X_test)

In [35]:
# accuracy score
print('Accuracy:',metrics.accuracy_score(y_test, y_pred_forest))

Accuracy: 0.8044375138135115


In [36]:
## do GridSearch for optimal parameters
## training and test scores

In [37]:
# confusion matrix

cm_forest = confusion_matrix(y_test, y_pred_forest)
print ('Confusion Matrix: \n', cm_forest) ## why is the matrix 9x9?

Confusion Matrix: 
 [[   237    399   6059]
 [   310   1156  22641]
 [   390   2055 129637]]


In [38]:
# classification report

print(classification_report(y_test, y_pred_forest))

              precision    recall  f1-score   support

           0       0.25      0.04      0.06      6695
           1       0.32      0.05      0.08     24107
           2       0.82      0.98      0.89    132082

    accuracy                           0.80    162884
   macro avg       0.46      0.35      0.35    162884
weighted avg       0.72      0.80      0.74    162884



In [39]:
# cross validation - 5-fold

cv_scores_forest = cross_val_score(forest, X, y, cv=5)

print(cv_scores_forest)
print("Average 5-Fold CV Score: {}".format(np.mean(cv_scores_forest)))

[0.79714827 0.80763121 0.80816073 0.80621149 0.78605908]
Average 5-Fold CV Score: 0.8010421542971597


## Gaussian Naive Bayes

In [40]:
gaussian = GaussianNB().fit(X_train, y_train)

y_pred_gaussian = gaussian.predict(X_test)

In [41]:
# accuracy score

print('Accuracy:',metrics.accuracy_score(y_test, y_pred_gaussian))

Accuracy: 0.7326870656418064


In [42]:
# confusion matrix

cm_gaussian = confusion_matrix(y_test, y_pred_gaussian)
print ('Confusion Matrix: \n', cm_gaussian) ## why is the matrix 9x9?

Confusion Matrix: 
 [[  1236    707   4752]
 [  3080   2865  18162]
 [  6732  10108 115242]]


In [43]:
# classification report

print(classification_report(y_test, y_pred_gaussian))

              precision    recall  f1-score   support

           0       0.11      0.18      0.14      6695
           1       0.21      0.12      0.15     24107
           2       0.83      0.87      0.85    132082

    accuracy                           0.73    162884
   macro avg       0.39      0.39      0.38    162884
weighted avg       0.71      0.73      0.72    162884



In [44]:
# cross validation - 5-fold

cv_scores_gaussian = cross_val_score(knn, X, y, cv=5)

print(cv_scores_gaussian)
print("Average 5-Fold CV Score: {}".format(np.mean(cv_scores_gaussian)))

[0.79946588 0.79934309 0.8065645  0.80951138 0.79018779]
Average 5-Fold CV Score: 0.8010145272318449


## Categorical Naive Bayes

In [45]:
categorical = CategoricalNB().fit(X_train, y_train)

y_pred_cate = categorical.predict(X_test)

In [46]:
# accuracy score

print('Accuracy:', metrics.accuracy_score(y_test, y_pred_cate))

Accuracy: 0.8000049114707399


In [47]:
# confusion matrix

cm_categorical = confusion_matrix(y_test, y_pred_cate)
print ('Confusion Matrix: \n', cm_categorical) ## why is the matrix 9x9?

Confusion Matrix: 
 [[   244    548   5903]
 [   265   1542  22300]
 [   290   3270 128522]]


In [48]:
# classification report

print(classification_report(y_test, y_pred_cate))

              precision    recall  f1-score   support

           0       0.31      0.04      0.07      6695
           1       0.29      0.06      0.10     24107
           2       0.82      0.97      0.89    132082

    accuracy                           0.80    162884
   macro avg       0.47      0.36      0.35    162884
weighted avg       0.72      0.80      0.74    162884

