- Environment Setup

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# ignore warnings
import warnings
warnings.filterwarnings("ignore")

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import RidgeClassifier, RidgeClassifierCV
from sklearn.metrics import classification_report, accuracy_score
from sklearn.tree import export_graphviz

import graphviz
from graphviz import Graph

import wrangle

from wrangle import get_311_data, clean_311

import model

from model import split_separate_scale, model_df

pd.set_option("display.max_rows", None, "display.max_columns", None)

# Acquire and Prep

In [2]:
# Acquire and prep data
df = model_df()

In [3]:
df.head()

Unnamed: 0,open_date,dept,call_reason,source_id,council_district,resolution_days_due,level_of_delay,district_1,district_2,district_3,district_4,district_5,district_6,district_7,district_8,district_9,district_10,animal_care_services,code_enforcement_services,customer_services,development_services,metro_health,parks_and_rec,solid_waste_management,trans_and_cap_improvements,unknown_dept,buildings,business,cleanup,code,customer_service,field,land,license,misc,storm,streets,trades,traffic,waste,web_portal,311_mobile_app,constituent_call,internal_services_requests,open_month,open_year,open_week
551,2017-01-27,Customer Service,customer_service,Constituent Call,2,8,Very Late Response,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,2017,4
552,2017-01-27,Customer Service,customer_service,Constituent Call,4,8,Very Late Response,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,2017,4
553,2017-02-03,Customer Service,customer_service,Constituent Call,2,8,Very Late Response,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,2,2017,5
554,2017-02-03,Customer Service,customer_service,Constituent Call,1,8,Very Late Response,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,2,2017,5
555,2017-02-28,Customer Service,customer_service,Constituent Call,6,8,Very Late Response,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,2,2017,9


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 356804 entries, 551 to 399979
Data columns (total 47 columns):
 #   Column                      Non-Null Count   Dtype         
---  ------                      --------------   -----         
 0   open_date                   356804 non-null  datetime64[ns]
 1   dept                        356804 non-null  object        
 2   call_reason                 356804 non-null  object        
 3   source_id                   356804 non-null  object        
 4   council_district            356804 non-null  int64         
 5   resolution_days_due         356804 non-null  int64         
 6   level_of_delay              356804 non-null  category      
 7   district_1                  356804 non-null  uint8         
 8   district_2                  356804 non-null  uint8         
 9   district_3                  356804 non-null  uint8         
 10  district_4                  356804 non-null  uint8         
 11  district_5                  356804 no

In [5]:
df.describe()

Unnamed: 0,council_district,resolution_days_due,district_1,district_2,district_3,district_4,district_5,district_6,district_7,district_8,district_9,district_10,animal_care_services,code_enforcement_services,customer_services,development_services,metro_health,parks_and_rec,solid_waste_management,trans_and_cap_improvements,unknown_dept,buildings,business,cleanup,code,customer_service,field,land,license,misc,storm,streets,trades,traffic,waste,web_portal,311_mobile_app,constituent_call,internal_services_requests,open_month,open_year,open_week
count,356804.0,356804.0,356804.0,356804.0,356804.0,356804.0,356804.0,356804.0,356804.0,356804.0,356804.0,356804.0,356804.0,356804.0,356804.0,356804.0,356804.0,356804.0,356804.0,356804.0,356804.0,356804.0,356804.0,356804.0,356804.0,356804.0,356804.0,356804.0,356804.0,356804.0,356804.0,356804.0,356804.0,356804.0,356804.0,356804.0,356804.0,356804.0,356804.0,356804.0,356804.0,356804.0
mean,4.802188,32.784952,0.129236,0.132218,0.128648,0.09744,0.140629,0.085532,0.094158,0.054646,0.054728,0.082765,0.160419,0.004762,0.007803,0.262183,0.006603,6e-06,0.473114,0.030731,0.05438,0.000499,0.003756,0.000177,0.264523,0.015353,0.160606,0.004863,0.002214,0.057398,0.009591,0.028301,0.02519,0.017007,0.410522,0.108858,0.084085,0.001256,0.805801,6.345714,2020.316185,26.108614
std,2.780184,45.078669,0.335462,0.338728,0.33481,0.296556,0.347639,0.279671,0.292049,0.227289,0.227448,0.275528,0.366994,0.068841,0.087987,0.439822,0.080991,0.002368,0.499277,0.172589,0.226766,0.02233,0.061168,0.013287,0.44108,0.122952,0.367168,0.069563,0.047002,0.232603,0.097462,0.165832,0.156703,0.129296,0.491929,0.311462,0.277516,0.035412,0.395583,3.227659,0.516766,14.388064
min,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2017.0,1.0
25%,2.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4.0,2020.0,14.0
50%,5.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,6.0,2020.0,26.0
75%,7.0,65.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,9.0,2021.0,38.0
max,10.0,934.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,12.0,2021.0,53.0


In [6]:
df.isnull().sum()

open_date                     0
dept                          0
call_reason                   0
source_id                     0
council_district              0
resolution_days_due           0
level_of_delay                0
district_1                    0
district_2                    0
district_3                    0
district_4                    0
district_5                    0
district_6                    0
district_7                    0
district_8                    0
district_9                    0
district_10                   0
animal_care_services          0
code_enforcement_services     0
customer_services             0
development_services          0
metro_health                  0
parks_and_rec                 0
solid_waste_management        0
trans_and_cap_improvements    0
unknown_dept                  0
buildings                     0
business                      0
cleanup                       0
code                          0
customer_service              0
field   

# Modeling

## Split Data 

In [7]:
# Run split function
train, validate, test, X_train, y_train, X_validate, y_validate, X_test, y_test, train_scaled, validate_scaled, test_scaled = split_separate_scale(df)

## Establish Baseline

In [8]:
# look at values of target variable
y_train.value_counts()

Very Early Response    114280
Early Response          36971
On Time Response        24417
Late Response           20935
Very Late Response       3207
Name: level_of_delay, dtype: int64

In [9]:
# set up as dataframes
y_train = pd.DataFrame(dict(actual=y_train))
y_validate = pd.DataFrame(dict(actual=y_validate))
y_test = pd.DataFrame(dict(actual=y_test))

In [10]:
# 'Extremely Early Response' is by far the most frequent value so that will be our baseline
y_train['baseline'] = 'Very Early Response'

In [11]:
# calculate accuracy of baseline
baseline_accuracy = accuracy_score(y_train.actual, y_train.baseline)
print(' Baseline Accuracy: {:.2%}'.format(accuracy_score(y_train.actual, y_train.baseline)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(y_train.baseline, y_train.actual))
print('---')
print(classification_report(y_train.actual, y_train.baseline))

 Baseline Accuracy: 57.19%
---
Confusion Matrix
actual               Very Early Response  Early Response  On Time Response  \
baseline                                                                     
Very Early Response               114280           36971             24417   

actual               Late Response  Very Late Response  
baseline                                                
Very Early Response          20935                3207  
---
                     precision    recall  f1-score   support

     Early Response       0.00      0.00      0.00     36971
      Late Response       0.00      0.00      0.00     20935
   On Time Response       0.00      0.00      0.00     24417
Very Early Response       0.57      1.00      0.73    114280
 Very Late Response       0.00      0.00      0.00      3207

           accuracy                           0.57    199810
          macro avg       0.11      0.20      0.15    199810
       weighted avg       0.33      0.57      0.42  

### Takeaways

- Baseline accuracy is 57.19%, which means we can assume approx 1/2 of cases are resolved very early

- Our goal will be to outperform this baseline

## Logistic Regression - Default Parameters / Best Features

- The first model we looked at was logistic regression

- Various iterations were made, but the logistic regression model that worked the best turned out to be a model using default parameters and the top 8 features chosed by select k best

In [12]:
log_features = ['resolution_days_due', 'customer_services', 'development_services', 'solid_waste_management', 'customer_service', 'field', '311_mobile_app', 'open_year']
# make logistic regression object
log = LogisticRegression(random_state=123)
# fit logistic regression object to data
log.fit(train_scaled[log_features], y_train.actual)
# calculate probability
log_pred_proba = log.predict_proba(train_scaled[log_features])
# make predictions
y_train['log_pred'] = log.predict(train_scaled[log_features])
y_validate['log_pred'] = log.predict(validate_scaled[log_features])
y_test['log_pred'] = log.predict(test_scaled[log_features])

In [13]:
# calculate accuracy of log on train
log_accuracy_train = accuracy_score(y_train.actual, y_train.log_pred)                                     
print(' log Accuracy: {:.5%}'.format(accuracy_score(y_train.actual, y_train.log_pred)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(y_train.log_pred, y_train.actual))
print('---')
print(classification_report(y_train.actual, y_train.log_pred))

 log Accuracy: 58.02813%
---
Confusion Matrix
actual               Very Early Response  Early Response  On Time Response  \
log_pred                                                                     
Very Early Response               114280           36971             24417   
Very Late Response                     0               0                 0   

actual               Late Response  Very Late Response  
log_pred                                                
Very Early Response          20926                1541  
Very Late Response               9                1666  
---
                     precision    recall  f1-score   support

     Early Response       0.00      0.00      0.00     36971
      Late Response       0.00      0.00      0.00     20935
   On Time Response       0.00      0.00      0.00     24417
Very Early Response       0.58      1.00      0.73    114280
 Very Late Response       0.99      0.52      0.68      3207

           accuracy                      

In [14]:
# calculate accuracy of log on validate
log_accuracy_validate = accuracy_score(y_validate.actual, y_validate.log_pred)
print(' Log Accuracy: {:.5%}'.format(accuracy_score(y_validate.actual, y_validate.log_pred)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(y_validate.log_pred, y_validate.actual))
print('---')
print(classification_report(y_validate.actual, y_validate.log_pred))

 Log Accuracy: 58.01969%
---
Confusion Matrix
actual               Very Early Response  Early Response  On Time Response  \
log_pred                                                                     
Very Early Response                48978           15845             10464   
Very Late Response                     0               0                 0   

actual               Late Response  Very Late Response  
log_pred                                                
Very Early Response           8965                 668  
Very Late Response               7                 706  
---
                     precision    recall  f1-score   support

     Early Response       0.00      0.00      0.00     15845
      Late Response       0.00      0.00      0.00      8972
   On Time Response       0.00      0.00      0.00     10464
Very Early Response       0.58      1.00      0.73     48978
 Very Late Response       0.99      0.51      0.68      1374

           accuracy                      

### Takeaways

- This model only outperforms the baseline by about 4%

- an improvement, but not by much

## Decision Tree - Deeper Max Depth - Selected Features

- Next we looked at the Decision Tree model

- The best iteration of this model was found by doubling the max depth parameter as well as only incorporating features we found to be significant

In [15]:
# make seperate df by removing incompatible features 
dt_train = X_train.drop(columns = ['dept', 'call_reason', 
        'source_id', 'council_district', 'open_date'])
dt_validate = X_validate.drop(columns = ['dept', 'call_reason', 
        'source_id', 'council_district', 'open_date'])
dt_test = X_test.drop(columns = ['dept', 'call_reason', 
        'source_id', 'council_district', 'open_date'])

In [16]:
#make the thing
clf = DecisionTreeClassifier(max_depth=6, random_state=123)
#fit the thing
clf = clf.fit(dt_train, y_train.actual)
#predicitons
y_train['dt_pred'] = clf.predict(dt_train)
y_validate['dt_pred'] = clf.predict(dt_validate)
y_test['dt_pred'] = clf.predict(dt_test)
# make visualization of Decision Tree
dot_data = export_graphviz(clf, feature_names= dt_train.columns, class_names=clf.classes_, rounded=True, filled=True, out_file=None)
graph = graphviz.Source(dot_data) 

graph.render('311_decision_tree', view=True)

'311_decision_tree.pdf'

In [17]:
# calculate accuracy of dt on train
dt_accuracy_train = accuracy_score(y_train.actual, y_train.dt_pred)                                     
print(' DT Accuracy: {:.5%}'.format(accuracy_score(y_train.actual, y_train.dt_pred)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(y_train.dt_pred, y_train.actual))
print('---')
print(classification_report(y_train.actual, y_train.dt_pred))

 DT Accuracy: 67.24638%
---
Confusion Matrix
actual               Very Early Response  Early Response  On Time Response  \
dt_pred                                                                      
Early Response                      1464           10565              4489   
Late Response                        202             355               697   
On Time Response                       0               0              4473   
Very Early Response               112614           26051             14758   
Very Late Response                     0               0                 0   

actual               Late Response  Very Late Response  
dt_pred                                                 
Early Response                1627                 347  
Late Response                 4649                 411  
On Time Response               581                 114  
Very Early Response          14077                 271  
Very Late Response               1                2064  
---
     

In [18]:
# calculate accuracy of dt on validate
dt_accuracy_validate = accuracy_score(y_validate.actual, y_validate.dt_pred)                                     
print(' DT Accuracy: {:.5%}'.format(accuracy_score(y_validate.actual, y_validate.dt_pred)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(y_validate.dt_pred, y_validate.actual))
print('---')
print(classification_report(y_validate.actual, y_validate.dt_pred))

 DT Accuracy: 67.33853%
---
Confusion Matrix
actual               Very Early Response  Early Response  On Time Response  \
dt_pred                                                                      
Early Response                       655            4659              1930   
Late Response                         79             161               308   
On Time Response                       0               0              1892   
Very Early Response                48244           11025              6334   
Very Late Response                     0               0                 0   

actual               Late Response  Very Late Response  
dt_pred                                                 
Early Response                 737                 149  
Late Response                 1995                 175  
On Time Response               253                  44  
Very Early Response           5987                 132  
Very Late Response               0                 874  
---
     

### Takeaways

- This model improves on the baseline by about 10%

- Much better but can still be improved 

## K Nearest Neighbors - Uniform Weight / Best Features

- Next we looked at KNN Models

- The model that performed the best was a model using uniform weight and selecting features that were found to be significant according to select k best

In [19]:
#Features I want to train on
features1 = ['resolution_days_due', 'customer_services', 'development_services', 'solid_waste_management', 'customer_service', 'field', '311_mobile_app', 'open_year']
#make model
#weights = ['uniform', 'density']
knn = KNeighborsClassifier(n_neighbors=6, weights='uniform')
#fit model
knn.fit(X_train[features1], y_train.actual)
#predicitons
y_train['knn_pred'] = knn.predict(X_train[features1])
y_validate['knn_pred'] = knn.predict(X_validate[features1])
y_test['knn_pred'] = knn.predict(X_test[features1])
#estimate probability
knn_pred_proba = knn.predict_proba(X_train[features1])

In [20]:
# calculate accuracy of knn on train
knn_accuracy_train = accuracy_score(y_train.actual, y_train.knn_pred)                                     
print(' KNN Accuracy: {:.5%}'.format(accuracy_score(y_train.actual, y_train.knn_pred)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(y_train.knn_pred, y_train.actual))
print('---')
print(classification_report(y_train.actual, y_train.knn_pred))

 KNN Accuracy: 59.51204%
---
Confusion Matrix
actual               Very Early Response  Early Response  On Time Response  \
knn_pred                                                                     
Early Response                      4836           11540              5180   
Late Response                       4597             747               687   
On Time Response                    9610            3460              7404   
Very Early Response                95178           21212             11144   
Very Late Response                    59              12                 2   

actual               Late Response  Very Late Response  
knn_pred                                                
Early Response                1974                 321  
Late Response                 2782                  23  
On Time Response              3131                 432  
Very Early Response          13025                 424  
Very Late Response              23                2007  
---
    

In [21]:
# calculate accuracy of knn on validate
knn_accuracy_validate = accuracy_score(y_validate.actual, y_validate.knn_pred)                                     
print(' KNN Accuracy: {:.5%}'.format(accuracy_score(y_validate.actual, y_validate.knn_pred)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(y_validate.knn_pred, y_validate.actual))
print('---')
print(classification_report(y_validate.actual, y_validate.knn_pred))

 KNN Accuracy: 59.84842%
---
Confusion Matrix
actual               Very Early Response  Early Response  On Time Response  \
knn_pred                                                                     
Early Response                      2079            5137              2200   
Late Response                       1916             313               278   
On Time Response                    4096            1501              3183   
Very Early Response                40853            8894              4802   
Very Late Response                    34               0                 1   

actual               Late Response  Very Late Response  
knn_pred                                                
Early Response                 907                 130  
Late Response                 1220                   8  
On Time Response              1318                 170  
Very Early Response           5514                 209  
Very Late Response              13                 857  
---
    

### Takeaways

- This model performs similarly compared to the baseline

- This model should not be used in future iterations

## Random Forest - Increased Max Depth & Min Samples / Best Features

- Next we moved on to Random Forest modeling

- The best version of this model we found was when we increased the max depth and min samples leaf paramaters and used feature engineering to determine the best features 

In [22]:
# make seperate df by removing incompatible features 
rf_train = X_train.drop(columns = ['dept', 'call_reason', 
        'source_id', 'council_district', 'open_date'])
rf_validate = X_validate.drop(columns = ['dept', 'call_reason', 
        'source_id', 'council_district', 'open_date'])
rf_test = X_test.drop(columns = ['dept', 'call_reason', 
        'source_id', 'council_district', 'open_date'])

# make list of features to use
features = ['resolution_days_due', 'code_enforcement_services','customer_services',
                                       'development_services','solid_waste_management',
                                       'open_week']

In [23]:
#make the thing
rf = RandomForestClassifier(max_depth = 10, min_samples_leaf = 3, random_state=123)
#fit the thing
rf = rf.fit(rf_train[features], y_train.actual)
#predicitons
y_train['rf_pred'] = rf.predict(rf_train[features])
y_validate['rf_pred'] = rf.predict(rf_validate[features])
y_test['rf_pred'] = rf.predict(rf_test[features])

In [24]:
# calculate accuracy of rf on train
rf_accuracy_train = accuracy_score(y_train.actual, y_train.rf_pred)                                     
print(' RF Accuracy: {:.5%}'.format(accuracy_score(y_train.actual, y_train.rf_pred)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(y_train.rf_pred, y_train.actual))
print('---')
print(classification_report(y_train.actual, y_train.rf_pred))

 RF Accuracy: 65.77699%
---
Confusion Matrix
actual               Very Early Response  Early Response  On Time Response  \
rf_pred                                                                      
Early Response                      1295           10237              4126   
Late Response                        285             124               226   
On Time Response                     354             467              5602   
Very Early Response               112343           26143             14462   
Very Late Response                     3               0                 1   

actual               Late Response  Very Late Response  
rf_pred                                                 
Early Response                1400                 328  
Late Response                 1640                  12  
On Time Response               991                 352  
Very Early Response          16902                 908  
Very Late Response               2                1607  
---
     

In [25]:
# calculate accuracy of rf on validate
rf_accuracy_validate = accuracy_score(y_validate.actual, y_validate.rf_pred)                                     
print(' RF Accuracy: {:.5%}'.format(accuracy_score(y_validate.actual, y_validate.rf_pred)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(y_validate.rf_pred, y_validate.actual))
print('---')
print(classification_report(y_validate.actual, y_validate.rf_pred))

 RF Accuracy: 65.70014%
---
Confusion Matrix
actual               Very Early Response  Early Response  On Time Response  \
rf_pred                                                                      
Early Response                       616            4514              1782   
Late Response                        126              51                83   
On Time Response                     192             225              2332   
Very Early Response                48043           11055              6267   
Very Late Response                     1               0                 0   

actual               Late Response  Very Late Response  
rf_pred                                                 
Early Response                 653                 137  
Late Response                  698                   6  
On Time Response               444                 137  
Very Early Response           7177                 420  
Very Late Response               0                 674  
---
     

### Takeaways

- This model appears to improve on the baseline by about 9%

## Ridge Classifier - Default Parameters / Selected Features 

- Next we looked into the Ridge Classifier model

- The best version of this model we found was using default parameters and our selected features 

In [26]:
#make the thing
rc = RidgeClassifier(random_state=123)
#fit the thing
rc = rc.fit(train_scaled, y_train.actual)
#predicitons
y_train['rc_pred'] = rc.predict(train_scaled)
y_validate['rc_pred'] = rc.predict(validate_scaled)
y_test['rc_pred'] = rc.predict(test_scaled)

In [27]:
# calculate accuracy of rc on train
rc_accuracy_train = accuracy_score(y_train.actual, y_train.rc_pred)                                     
print(' RC Accuracy: {:.5%}'.format(accuracy_score(y_train.actual, y_train.rc_pred)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(y_train.rc_pred, y_train.actual))
print('---')
print(classification_report(y_train.actual, y_train.rc_pred))

 RC Accuracy: 58.30989%
---
Confusion Matrix
actual               Very Early Response  Early Response  On Time Response  \
rc_pred                                                                      
Early Response                        82              57                37   
Late Response                        197              52                34   
On Time Response                      88             113               208   
Very Early Response               113910           36749             24138   
Very Late Response                     3               0                 0   

actual               Late Response  Very Late Response  
rc_pred                                                 
Early Response                  25                   6  
Late Response                  727                 327  
On Time Response               123                  50  
Very Early Response          20056                1217  
Very Late Response               4                1607  
---
     

In [28]:
# calculate accuracy of rc on validate
rc_accuracy_validate = accuracy_score(y_validate.actual, y_validate.rc_pred)                                     
print(' RC Accuracy: {:.5%}'.format(accuracy_score(y_validate.actual, y_validate.rc_pred)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(y_validate.rc_pred, y_validate.actual))
print('---')
print(classification_report(y_validate.actual, y_validate.rc_pred))

 RC Accuracy: 58.37936%
---
Confusion Matrix
actual               Very Early Response  Early Response  On Time Response  \
rc_pred                                                                      
Early Response                        30              22                19   
Late Response                         61              17                 8   
On Time Response                      45              52               102   
Very Early Response                48842           15754             10335   
Very Late Response                     0               0                 0   

actual               Late Response  Very Late Response  
rc_pred                                                 
Early Response                  13                   1  
Late Response                  342                 144  
On Time Response                55                  27  
Very Early Response           8562                 518  
Very Late Response               0                 684  
---
     

### Takeaways

- This model only improves on the baseline by about 2%

## SGD Classifier - Penalty Adjusted to l1 - Selected Features 

- Lastly we attempted to used an SGD Classifier model

- The version of this model we found to be the best was adjusting the penalty to l1 and using our selected features 

In [29]:
#make the thing
sgd = SGDClassifier(max_iter=1000, tol=1e-3, random_state=123, penalty='l1')
#fit the thing
sgd = sgd.fit(train_scaled, y_train.actual)
#predicitons
y_train['sgd_pred'] = sgd.predict(train_scaled)
y_validate['sgd_pred'] = sgd.predict(validate_scaled)
y_test['sgd_pred'] = sgd.predict(test_scaled)

In [30]:
# calculate accuracy of sgd on train
sgd_accuracy_train = accuracy_score(y_train.actual, y_train.sgd_pred)                                     
print(' SGD Accuracy: {:.5%}'.format(accuracy_score(y_train.actual, y_train.sgd_pred)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(y_train.sgd_pred, y_train.actual))
print('---')
print(classification_report(y_train.actual, y_train.sgd_pred))

 SGD Accuracy: 58.83990%
---
Confusion Matrix
actual               Very Early Response  Early Response  On Time Response  \
sgd_pred                                                                     
Early Response                       136             107               108   
Late Response                        498             275               366   
On Time Response                     578             564              1177   
Very Early Response               112871           35794             22463   
Very Late Response                   197             231               303   

actual               Late Response  Very Late Response  
sgd_pred                                                
Early Response                  36                   1  
Late Response                  765                  37  
On Time Response               546                  44  
Very Early Response          18985                 477  
Very Late Response             603                2648  
---
    

In [31]:
# calculate accuracy of sgd on validate
sgd_accuracy_validate = accuracy_score(y_validate.actual, y_validate.sgd_pred)                                     
print(' SGD Accuracy: {:.5%}'.format(accuracy_score(y_validate.actual, y_validate.sgd_pred)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(y_validate.sgd_pred, y_validate.actual))
print('---')
print(classification_report(y_validate.actual, y_validate.sgd_pred))

 SGD Accuracy: 58.97376%
---
Confusion Matrix
actual               Very Early Response  Early Response  On Time Response  \
sgd_pred                                                                     
Early Response                        48              48                49   
Late Response                        171              96               139   
On Time Response                     236             241               506   
Very Early Response                48452           15352              9644   
Very Late Response                    71             108               126   

actual               Late Response  Very Late Response  
sgd_pred                                                
Early Response                  26                   2  
Late Response                  358                  13  
On Time Response               247                  10  
Very Early Response           8089                 212  
Very Late Response             252                1137  
---
    

### Takeaways

- This model only improved on the baseline by about 1%

# Evaluation 

- Train Accuracy

In [32]:
print(' Baseline Accuracy: {:.5%}'.format(baseline_accuracy))
print(' Log Accuracy: {:.5%}'.format(log_accuracy_train))
print(' DT Accuracy: {:.5%}'.format(dt_accuracy_train))
print(' KNN Accuracy: {:.5%}'.format(knn_accuracy_train))
print(' RF Accuracy: {:.5%}'.format(rf_accuracy_train))
print(' RC Accuracy: {:.5%}'.format(rc_accuracy_train))
print(' SGD Accuracy: {:.5%}'.format(sgd_accuracy_train))

 Baseline Accuracy: 57.19433%
 Log Accuracy: 58.02813%
 DT Accuracy: 67.24638%
 KNN Accuracy: 59.51204%
 RF Accuracy: 65.77699%
 RC Accuracy: 58.30989%
 SGD Accuracy: 58.83990%


- Validate Accuracy

In [33]:
print(' Baseline Accuracy: {:.5%}'.format(baseline_accuracy))
print(' Log Accuracy: {:.5%}'.format(log_accuracy_validate))
print(' DT Accuracy: {:.5%}'.format(dt_accuracy_validate))
print(' KNN Accuracy: {:.5%}'.format(knn_accuracy_validate))
print(' RF Accuracy: {:.5%}'.format(rf_accuracy_validate))
print(' RC Accuracy: {:.5%}'.format(rc_accuracy_validate))
print(' SGD Accuracy: {:.5%}'.format(sgd_accuracy_validate))

 Baseline Accuracy: 57.19433%
 Log Accuracy: 58.01969%
 DT Accuracy: 67.33853%
 KNN Accuracy: 59.84842%
 RF Accuracy: 65.70014%
 RC Accuracy: 58.37936%
 SGD Accuracy: 58.97376%


### Takeaways

- The decision tree model performed best on unseed validate data, so we will use it on test data

In [34]:
# calculate accuracy of dt on validate
dt_accuracy_test = accuracy_score(y_test.actual, y_test.dt_pred)                                     
print(' DT Accuracy: {:.5%}'.format(accuracy_score(y_test.actual, y_test.dt_pred)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(y_test.dt_pred, y_test.actual))
print('---')
print(classification_report(y_test.actual, y_test.dt_pred))

 DT Accuracy: 67.23140%
---
Confusion Matrix
actual               Very Early Response  Early Response  On Time Response  \
dt_pred                                                                      
Early Response                       552            3865              1641   
Late Response                         84             135               230   
On Time Response                       0               0              1578   
Very Early Response                40179            9204              5271   
Very Late Response                     0               0                 0   

actual               Late Response  Very Late Response  
dt_pred                                                 
Early Response                 592                 116  
Late Response                 1614                 139  
On Time Response               198                  48  
Very Early Response           5072                 101  
Very Late Response               1                 741  
---
     

In [35]:
print(' Baseline Accuracy: {:.5%}'.format(baseline_accuracy))
print(' DT Accuracy: {:.5%}'.format(dt_accuracy_test))

 Baseline Accuracy: 57.19433%
 DT Accuracy: 67.23140%


### Takeaway

- The final model performs better than baseline by about 10% 

- While this is an improvement there is still room for improvement in future iterations

### Cache data

In [36]:
dt_report = classification_report(y_test.actual, y_test.dt_pred, output_dict=True)

In [37]:
dt_report

{'Early Response': {'precision': 0.5712385456695241,
  'recall': 0.2927143289912148,
  'f1-score': 0.3870806209313971,
  'support': 13204},
 'Late Response': {'precision': 0.7329700272479565,
  'recall': 0.21586197672863447,
  'f1-score': 0.33350552743051964,
  'support': 7477},
 'On Time Response': {'precision': 0.8651315789473685,
  'recall': 0.18096330275229358,
  'f1-score': 0.29931714719271624,
  'support': 8720},
 'Very Early Response': {'precision': 0.671586407474886,
  'recall': 0.984417493568541,
  'f1-score': 0.7984539257963873,
  'support': 40815},
 'Very Late Response': {'precision': 0.9986522911051213,
  'recall': 0.6471615720524018,
  'f1-score': 0.7853736089030207,
  'support': 1145},
 'accuracy': 0.6723140090525638,
 'macro avg': {'precision': 0.7679157700889713,
  'recall': 0.4642237348186171,
  'f1-score': 0.5207461660508083,
  'support': 71361},
 'weighted avg': {'precision': 0.6883487354938588,
  'recall': 0.6723140090525638,
  'f1-score': 0.6124189492085335,
  'sup

In [38]:
report_df = pd.DataFrame(dt_report).transpose()

In [39]:
report_df

Unnamed: 0,precision,recall,f1-score,support
Early Response,0.571239,0.292714,0.387081,13204.0
Late Response,0.73297,0.215862,0.333506,7477.0
On Time Response,0.865132,0.180963,0.299317,8720.0
Very Early Response,0.671586,0.984417,0.798454,40815.0
Very Late Response,0.998652,0.647162,0.785374,1145.0
accuracy,0.672314,0.672314,0.672314,0.672314
macro avg,0.767916,0.464224,0.520746,71361.0
weighted avg,0.688349,0.672314,0.612419,71361.0


In [40]:
report_df.to_csv('dt_test_report.csv')

In [41]:
dt_report = classification_report(y_validate.actual, y_validate.dt_pred, output_dict=True)

report_df = pd.DataFrame(dt_report).transpose()

report_df.to_csv('dt_validate_report.csv')

In [42]:
rf_report = classification_report(y_validate.actual, y_validate.rf_pred, output_dict=True)

report_df = pd.DataFrame(rf_report).transpose()

report_df.to_csv('rf_validate_report.csv')