# General Overview - Machine Learning

We are using random forest, decision trees, logistic regression, and SVC to model our data.

In [1]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import sklearn
import seaborn as sns

from sklearn import datasets
from sklearn import metrics
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import classification_report
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

# from sklearn.metrics import plot_precision_recall_curve ## does not work?

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.naive_bayes import BernoulliNB

# Categorical Naive Bayes
# from sklearn.naive_bayes import CategoricalNB
## that one doesn't load?

In [2]:
## Unbalanced data set - handle this in the machine learning
## Oversampling or undersampling or random sampling

## Statistical analysis is not for target variables

## Stratified sampling - happens during machine learning
## Python has lots of APIs to apply stratified sampling

In [3]:
data = pd.read_csv('tree_ml.csv', index_col=0) # import data
tree = data.copy() # save a copy of data as tree

In [4]:
tree.head()

Unnamed: 0,health,health_labels,num_problems,tree_dbh,root_stone_labels,root_grate_labels,root_other_labels,trunk_wire_labels,trnk_light_labels,trnk_other_labels,...,Helpful,None.1,Unsure,Damage,NoDamage,Bronx,Brooklyn,Manhattan,Queens,Staten Island
0,Fair,1,0,3,0,0,0,0,0,0,...,0,1,0,0,1,0,0,0,1,0
1,Fair,1,1,21,1,0,0,0,0,0,...,0,1,0,1,0,0,0,0,1,0
2,Good,2,0,3,0,0,0,0,0,0,...,0,1,0,1,0,0,1,0,0,0
3,Good,2,1,10,1,0,0,0,0,0,...,0,1,0,1,0,0,1,0,0,0
4,Good,2,1,21,1,0,0,0,0,0,...,0,1,0,1,0,0,1,0,0,0


In [5]:
tree.shape

# take a subsample and run experiment, then run on entire dataset
# knn and svc takes a long time - try a smaller set

(651535, 30)

In [6]:
# measure model performance - does this look okay?

def cross_validate(model, nfolds, feats, targets):
    score = -1 * (cross_val_score(model, feats, targets, cv=nfolds, scoring='neg_mean_absolute_error'))
    return np.mean(score)

# Machine Learning Models

Description goes here.

## Target and Response Variable, Train_Test_Split

In [7]:
tree_ml = tree.drop(columns='health') # keep the numerical health column: health_labels

In [8]:
# create targe and response variable
y = tree_ml['health_labels'].values
X = tree_ml.drop('health_labels', axis=1).values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state=23456789, stratify=y)

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(521228, 28) (521228,)
(130307, 28) (130307,)


## Baseline - DummyClassifier

We start by using the DummyClassifier to make predictions using simple rules.

In [9]:
dummy_clf = DummyClassifier(strategy='stratified')
dummy_clf.fit(X, y)
dc_pred = dummy_clf.predict(X)

print('Accuracy Score: ', dummy_clf.score(X, y))

Accuracy Score:  0.6804991289800241


In [10]:
dummy_clf_freq = DummyClassifier(strategy='most_frequent')
dummy_clf_freq.fit(X, y)
dc_pred_freq = dummy_clf_freq.predict(X)

print('Accuracy Score: ', dummy_clf.score(X, y))

Accuracy Score:  0.6801369074570054


## Logistic Regression

In [11]:
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred_logreg = logreg.predict(X_test)

print('Accuracy Score, Training Set: ', logreg.score(X_train, y_train))
print('Accuracy Score, Test Set: ', logreg.score(X_test, y_test))

Accuracy Score, Training Set:  0.8106605938284206
Accuracy Score, Test Set:  0.8109925023214409


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [12]:
# similar scores, no overfitting
# look into precision and recall

# need to know how the model performs for each category

In [13]:
# classification report

print(classification_report(y_test, y_pred_logreg))

              precision    recall  f1-score   support

           0       0.45      0.00      0.00      5356
           1       0.37      0.02      0.04     19285
           2       0.81      1.00      0.90    105666

    accuracy                           0.81    130307
   macro avg       0.55      0.34      0.31    130307
weighted avg       0.73      0.81      0.73    130307



In [14]:
# cross validation - 5-fold

cv_scores = cross_val_score(logreg, X, y, cv=5)

print(cv_scores)
print("Average 5-Fold CV Score: {}".format(np.mean(cv_scores)))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

[0.80919674 0.8115297  0.81120738 0.81205921 0.80682542]
Average 5-Fold CV Score: 0.8101636903619912


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [15]:
# mean absolute error

print("MAE Score: ", cross_validate(logreg, 10, X_train, y_train))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

MAE Score:  0.2283396131156333


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [16]:
# model performance - confusion matrix

cm = confusion_matrix(y_test, y_pred)
print ("Confusion Matrix: \n", cm) ## why is the matrix 9x9?

Confusion Matrix: 
 [[     5    264   5087]
 [     4    366  18915]
 [     2    357 105307]]


In [17]:
# model performance - accuracy score

print ("Accuracy: ", accuracy_score(y_test, y_pred))

Accuracy:  0.8109925023214409


In [18]:
# precision score - error message

# average_precision = average_precision_score(y_test, y_pred)

# print('Average precision-recall score: {0:0.2f}'.format(average_precision))

## KNN Classifier

This one takes a long time to run.

In [20]:
# response and target variables are X and y, respectively
# split with train_test_split

# create KNN with 7 neighbbors
knn = KNeighborsClassifier(n_neighbors=7)

# fit to training data
knn.fit(X_train, y_train)

# accuracy scoring
print(knn.score(X_test, y_test))

0.7981305685803526


In [21]:
# could increasing the number of neighbors increase accuracy scores?

# create KNN with 7 neighbbors
knn = KNeighborsClassifier(n_neighbors=9)

# fit to training data
knn.fit(X_train, y_train)

# accuracy scoring
print(knn.score(X_test, y_test))

0.7974475661322876


In [22]:
# increase to 15!

# create KNN with 7 neighbbors
knn = KNeighborsClassifier(n_neighbors=15)

# fit to training data
knn.fit(X_train, y_train)

# predict
y_pred = knn.predict(X_test)

# accuracy scoring
print(knn.score(X_test, y_test))

0.80529058300782


In [23]:
# cross validation
cv_scores_knn = cross_val_score(knn, X, y, cv=5)

print('CV scores: ', cv_scores_knn)
print("Average 5-Fold CV Score: {}".format(np.mean(cv_scores_knn)))

CV scores:  [0.79946588 0.79934309 0.8065645  0.80951138 0.79018779]
Average 5-Fold CV Score: 0.8010145272318449


In [24]:
# mean absolute error - this one takes too long to run so skip for now

# print("MAE Score: ", cross_validate(knn, 10, X_train, y_train))

In [25]:
# confusion matrix

cm_knn = confusion_matrix(y_test, y_pred)
print ("Confusion Matrix: \n", cm_knn) ## why is the matrix 9x9?

Confusion Matrix: 
 [[   156    359   4841]
 [   159    833  18293]
 [   155   1565 103946]]


In [26]:
# classification report

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.33      0.03      0.05      5356
           1       0.30      0.04      0.08     19285
           2       0.82      0.98      0.89    105666

    accuracy                           0.81    130307
   macro avg       0.48      0.35      0.34    130307
weighted avg       0.72      0.81      0.74    130307



In [52]:
# cross validation - 5-fold

cv_scores_knn = cross_val_score(knn, X, y, cv=5)

print(cv_scores_knn)
print("Average 5-Fold CV Score: {}".format(np.mean(cv_scores_knn)))

[0.79946588 0.79934309 0.8065645  0.80951138 0.79018779]
Average 5-Fold CV Score: 0.8010145272318449


In [27]:
# f1 score - micro

f1_score(y_test, y_pred, average='micro')

0.8052905830078199

In [28]:
# f1 score - macro

f1_score(y_test, y_pred, average='macro')

0.34078335924573483

In [29]:
# f1 score - None

f1_score(y_test, y_pred, average=None) ## what does this mean?

array([0.05355304, 0.07558298, 0.89321406])

## Decision Tree Classifier

In [31]:
decision_tree = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

y_pred_decision_tree = decision_tree.predict(X_test)

In [32]:
# accuracy score
print(decision_tree.score(X_test, y_test))

0.8003100370663127


In [33]:
print('Accuracy Score, Training Set:', decision_tree.score(X_train, y_train))
print('Accuracy Score, Test Set:', decision_tree.score(X_test, y_test))

Accuracy Score, Training Set: 0.8252089296814445
Accuracy Score, Test Set: 0.8003100370663127


In [34]:
# recall score
recall_score(y_test, y_pred_decision_tree, average='micro')

0.8003100370663127

In [35]:
recall_score(y_test, y_pred_decision_tree, average=None)

array([0.04219567, 0.05781696, 0.97424905])

In [36]:
f1_score(y_test, y_pred_decision_tree, average='micro')

0.8003100370663128

In [37]:
f1_score(y_test, y_pred_decision_tree, average=None)

array([0.06951707, 0.09718893, 0.89065481])

In [38]:
# confusion matrix

cm_decision_tree = confusion_matrix(y_test, y_pred_decision_tree)
print ("Confusion Matrix: \n", cm_decision_tree) ## why is the matrix 9x9?

Confusion Matrix: 
 [[   226    376   4754]
 [   368   1115  17802]
 [   552   2169 102945]]


In [39]:
# classification report

print(classification_report(y_test, y_pred_decision_tree))

              precision    recall  f1-score   support

           0       0.20      0.04      0.07      5356
           1       0.30      0.06      0.10     19285
           2       0.82      0.97      0.89    105666

    accuracy                           0.80    130307
   macro avg       0.44      0.36      0.35    130307
weighted avg       0.72      0.80      0.74    130307



In [51]:
# cross validation

cv_scores_dtc = cross_val_score(decision_tree, X, y, cv=5)

print('CV scores: ', cv_scores_dtc)
print("Average 5-Fold CV Score: {}".format(np.mean(cv_scores_dtc)))

CV scores:  [0.78996524 0.80413178 0.80542104 0.80261997 0.77589846]
Average 5-Fold CV Score: 0.7956072966149171


## Random Forest Classifier

In [41]:
forest = RandomForestClassifier(n_estimators=100).fit(X_train, y_train)

y_pred_forest = forest.predict(X_test)

In [42]:
# accuracy score
print('Accuracy:',metrics.accuracy_score(y_test, y_pred_forest))

Accuracy: 0.8047917609951882


In [43]:
# confusion matrix

cm_forest = confusion_matrix(y_test, y_pred_forest)
print ('Confusion Matrix: \n', cm_forest) ## why is the matrix 9x9?

Confusion Matrix: 
 [[   188    326   4842]
 [   232    931  18122]
 [   294   1621 103751]]


In [44]:
# classification report

print(classification_report(y_test, y_pred_forest))

              precision    recall  f1-score   support

           0       0.26      0.04      0.06      5356
           1       0.32      0.05      0.08     19285
           2       0.82      0.98      0.89    105666

    accuracy                           0.80    130307
   macro avg       0.47      0.36      0.35    130307
weighted avg       0.72      0.80      0.74    130307



In [53]:
# cross validation - 5-fold

cv_scores_forest = cross_val_score(forest, X, y, cv=5)

print(cv_scores_forest)
print("Average 5-Fold CV Score: {}".format(np.mean(cv_scores_forest)))

[0.79741687 0.80746238 0.80807631 0.80618079 0.78578281]
Average 5-Fold CV Score: 0.8009838304926058


## Bernoulli Naive Bayes

In [45]:
bernoulli = BernoulliNB().fit(X_train, y_train)

y_pred_bernoulli = bernoulli.predict(X_test)

In [46]:
# accuracy score

print('Accuracy:',metrics.accuracy_score(y_test, y_pred_bernoulli))

Accuracy: 0.8038248137091637


In [47]:
# confusion matrix

cm_bernoulli = confusion_matrix(y_test, y_pred_bernoulli)
print ('Confusion Matrix: \n', cm_bernoulli) ## why is the matrix 9x9?

Confusion Matrix: 
 [[    63    481   4812]
 [    57   1096  18132]
 [    69   2012 103585]]


In [50]:
# classification report

print(classification_report(y_test, y_pred_bernoulli))

              precision    recall  f1-score   support

           0       0.33      0.01      0.02      5356
           1       0.31      0.06      0.10     19285
           2       0.82      0.98      0.89    105666

    accuracy                           0.80    130307
   macro avg       0.49      0.35      0.34    130307
weighted avg       0.72      0.80      0.74    130307



In [49]:
# recall score

recall_score(y_test, y_pred_bernoulli, average='micro')

0.8038248137091637

In [54]:
# cross validation - 5-fold

cv_scores_bernoulli = cross_val_score(knn, X, y, cv=5)

print(cv_scores_bernoulli)
print("Average 5-Fold CV Score: {}".format(np.mean(cv_scores_bernoulli)))

[0.79946588 0.79934309 0.8065645  0.80951138 0.79018779]
Average 5-Fold CV Score: 0.8010145272318449


## Support Vector Machine - SVM - takes a long time

In [None]:
# create SVM classifier
svm_svc = svm.SVC(gamma='auto')

# train the data
svm_svc.fit(X_train, y_train)

# predict results using test data
y_pred = svm_svc.predict(X_test)

In [None]:
# precision score
print("Precision:",metrics.precision_score(y_test, y_pred))

# recall score
print("Recall:",metrics.recall_score(y_test, y_pred))