# Environment Setup

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# ignore warnings
import warnings
warnings.filterwarnings("ignore")

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import RidgeClassifier, RidgeClassifierCV
from sklearn.metrics import classification_report, accuracy_score

import wrangle

from wrangle import get_311_data, clean_311

import model

from model import split_separate_scale, model_df

# Lift pandas' display truncation: None for max_rows / max_columns means
# every row and column of a displayed frame is rendered.
# NOTE(review): with ~357k rows, displaying a full frame by accident will
# flood the notebook; prefer .head()/.sample() when inspecting.
pd.set_option("display.max_rows", None, "display.max_columns", None)

# Acquire and Prep

In [2]:
# Acquire and prep data: model_df() (imported from the project's model.py)
# returns the modeling-ready DataFrame used by every cell below.
# NOTE(review): presumably this wraps get_311_data/clean_311 from wrangle.py
# and adds the one-hot columns — confirm in model.py.
df = model_df()

In [3]:
# Preview the first five rows to sanity-check the prepared columns
df.head()

Unnamed: 0,open_date,dept,call_reason,source_id,council_district,resolution_days_due,level_of_delay,district_1,district_2,district_3,district_4,district_5,district_6,district_7,district_8,district_9,district_10,animal_care_services,code_enforcement_services,customer_services,development_services,metro_health,parks_and_rec,solid_waste_management,trans_and_cap_improvements,unknown_dept,buildings,business,cleanup,code,customer_service,field,land,license,misc,storm,streets,trades,traffic,waste,web_portal,311_mobile_app,constituent_call,internal_services_requests,open_month,open_year,open_week
551,2017-01-27,Customer Service,customer_service,Constituent Call,2,8,Very Late Response,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,2017,4
552,2017-01-27,Customer Service,customer_service,Constituent Call,4,8,Very Late Response,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,2017,4
553,2017-02-03,Customer Service,customer_service,Constituent Call,2,8,Very Late Response,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,2,2017,5
554,2017-02-03,Customer Service,customer_service,Constituent Call,1,8,Very Late Response,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,2,2017,5
555,2017-02-28,Customer Service,customer_service,Constituent Call,6,8,Very Late Response,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,2,2017,9


In [4]:
# Column dtypes and non-null counts for the prepared frame
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 356804 entries, 551 to 399979
Data columns (total 47 columns):
 #   Column                      Non-Null Count   Dtype         
---  ------                      --------------   -----         
 0   open_date                   356804 non-null  datetime64[ns]
 1   dept                        356804 non-null  object        
 2   call_reason                 356804 non-null  object        
 3   source_id                   356804 non-null  object        
 4   council_district            356804 non-null  int64         
 5   resolution_days_due         356804 non-null  int64         
 6   level_of_delay              356804 non-null  category      
 7   district_1                  356804 non-null  uint8         
 8   district_2                  356804 non-null  uint8         
 9   district_3                  356804 non-null  uint8         
 10  district_4                  356804 non-null  uint8         
 11  district_5                  356804 no

In [5]:
# Summary statistics for the numeric columns (the 0/1 dummy columns report
# their share of rows as the mean)
df.describe()

Unnamed: 0,council_district,resolution_days_due,district_1,district_2,district_3,district_4,district_5,district_6,district_7,district_8,district_9,district_10,animal_care_services,code_enforcement_services,customer_services,development_services,metro_health,parks_and_rec,solid_waste_management,trans_and_cap_improvements,unknown_dept,buildings,business,cleanup,code,customer_service,field,land,license,misc,storm,streets,trades,traffic,waste,web_portal,311_mobile_app,constituent_call,internal_services_requests,open_month,open_year,open_week
count,356804.0,356804.0,356804.0,356804.0,356804.0,356804.0,356804.0,356804.0,356804.0,356804.0,356804.0,356804.0,356804.0,356804.0,356804.0,356804.0,356804.0,356804.0,356804.0,356804.0,356804.0,356804.0,356804.0,356804.0,356804.0,356804.0,356804.0,356804.0,356804.0,356804.0,356804.0,356804.0,356804.0,356804.0,356804.0,356804.0,356804.0,356804.0,356804.0,356804.0,356804.0,356804.0
mean,4.802188,32.784952,0.129236,0.132218,0.128648,0.09744,0.140629,0.085532,0.094158,0.054646,0.054728,0.082765,0.160419,0.004762,0.007803,0.262183,0.006603,6e-06,0.473114,0.030731,0.05438,0.000499,0.003756,0.000177,0.264523,0.015353,0.160606,0.004863,0.002214,0.057398,0.009591,0.028301,0.02519,0.017007,0.410522,0.108858,0.084085,0.001256,0.805801,6.345714,2020.316185,26.108614
std,2.780184,45.078669,0.335462,0.338728,0.33481,0.296556,0.347639,0.279671,0.292049,0.227289,0.227448,0.275528,0.366994,0.068841,0.087987,0.439822,0.080991,0.002368,0.499277,0.172589,0.226766,0.02233,0.061168,0.013287,0.44108,0.122952,0.367168,0.069563,0.047002,0.232603,0.097462,0.165832,0.156703,0.129296,0.491929,0.311462,0.277516,0.035412,0.395583,3.227659,0.516766,14.388064
min,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2017.0,1.0
25%,2.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4.0,2020.0,14.0
50%,5.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,6.0,2020.0,26.0
75%,7.0,65.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,9.0,2021.0,38.0
max,10.0,934.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,12.0,2021.0,53.0


In [6]:
# Count missing values per column; the models below need zero nulls
df.isnull().sum()

open_date                     0
dept                          0
call_reason                   0
source_id                     0
council_district              0
resolution_days_due           0
level_of_delay                0
district_1                    0
district_2                    0
district_3                    0
district_4                    0
district_5                    0
district_6                    0
district_7                    0
district_8                    0
district_9                    0
district_10                   0
animal_care_services          0
code_enforcement_services     0
customer_services             0
development_services          0
metro_health                  0
parks_and_rec                 0
solid_waste_management        0
trans_and_cap_improvements    0
unknown_dept                  0
buildings                     0
business                      0
cleanup                       0
code                          0
customer_service              0
field   

# Modeling

## Split Data 

In [7]:
# Run the project's split helper once; it returns twelve objects, unpacked
# here in the order the helper returns them:
#   full splits, feature/target pairs per split, then the scaled feature sets.
(
    train, validate, test,
    X_train, y_train,
    X_validate, y_validate,
    X_test, y_test,
    train_scaled, validate_scaled, test_scaled,
) = split_separate_scale(df)

## Establish Baseline

In [8]:
# Class frequencies of the target (level_of_delay) in the training split;
# the most frequent class becomes the baseline prediction below.
y_train.value_counts()

Very Early Response    154252
Late Response           17032
Early Response          13609
On Time Response        11761
Very Late Response       3156
Name: level_of_delay, dtype: int64

In [9]:
# Wrap each target Series in a DataFrame whose 'actual' column holds the true
# label, so every model can add its own '<model>_pred' column next to it.
def _as_actual_frame(target):
    """Return `target` as a DataFrame with an 'actual' column.

    A target that is already wrapped (a DataFrame that has an 'actual'
    column) is returned unchanged, so re-running this cell does not try to
    wrap the frame a second time.
    """
    if isinstance(target, pd.DataFrame) and 'actual' in target.columns:
        return target
    return pd.DataFrame(dict(actual=target))


y_train = _as_actual_frame(y_train)
y_validate = _as_actual_frame(y_validate)
y_test = _as_actual_frame(y_test)

In [10]:
# 'Very Early Response' is by far the most frequent class in the training
# target, so predicting it for every row is our baseline
y_train['baseline'] = 'Very Early Response'

In [11]:
# Score the majority-class baseline on the training split. The accuracy is
# computed once and the stored value is what gets printed, so the reported
# number and the value reused in the evaluation summary cannot diverge.
baseline_accuracy = accuracy_score(y_train.actual, y_train.baseline)
print(' Baseline Accuracy: {:.2%}'.format(baseline_accuracy))
print('---')
print('Confusion Matrix')
# rows = predicted label (always the baseline class), columns = actual label
print(pd.crosstab(y_train.baseline, y_train.actual))
print('---')
# classification_report takes (y_true, y_pred)
print(classification_report(y_train.actual, y_train.baseline))

 Baseline Accuracy: 77.20%
---
Confusion Matrix
actual               Very Early Response  Early Response  On Time Response  \
baseline                                                                     
Very Early Response               154252           13609             11761   

actual               Late Response  Very Late Response  
baseline                                                
Very Early Response          17032                3156  
---
                     precision    recall  f1-score   support

     Early Response       0.00      0.00      0.00     13609
      Late Response       0.00      0.00      0.00     17032
   On Time Response       0.00      0.00      0.00     11761
Very Early Response       0.77      1.00      0.87    154252
 Very Late Response       0.00      0.00      0.00      3156

           accuracy                           0.77    199810
          macro avg       0.15      0.20      0.17    199810
       weighted avg       0.60      0.77      0.67  

### Takeaways

- Baseline accuracy is 77.2%: always predicting "Very Early Response" is correct for roughly three quarters of the training cases

- Our goal will be to outperform this baseline

## Logistic Regression - Default Parameters / All Features

- The first model we looked at was logistic regression

- Various iterations were made, but the logistic regression model that worked the best turned out to be a model using default parameters and all features

In [12]:
# Default-parameter logistic regression (seeded), fit on the scaled
# training features against the true training labels
log = LogisticRegression(random_state=123)
log.fit(train_scaled, y_train.actual)

# per-class probabilities for the training rows
log_pred_proba = log.predict_proba(train_scaled)

# store the predicted label for each split beside its actual label
for target_df, feature_df in (
    (y_train, train_scaled),
    (y_validate, validate_scaled),
    (y_test, test_scaled),
):
    target_df['log_pred'] = log.predict(feature_df)

In [13]:
# Logistic regression on the training split: compute accuracy once and print
# the stored value (it is reused in the evaluation summary).
log_accuracy_train = accuracy_score(y_train.actual, y_train.log_pred)
print(' log Accuracy: {:.5%}'.format(log_accuracy_train))
print('---')
print('Confusion Matrix')
# rows = predicted label, columns = actual label
print(pd.crosstab(y_train.log_pred, y_train.actual))
print('---')
print(classification_report(y_train.actual, y_train.log_pred))

 log Accuracy: 78.81838%
---
Confusion Matrix
actual               Very Early Response  Early Response  On Time Response  \
log_pred                                                                     
Early Response                        30              33                18   
Late Response                        169              84                71   
Very Early Response               154050           13492             11672   
Very Late Response                     3               0                 0   

actual               Late Response  Very Late Response  
log_pred                                                
Early Response                  11                   0  
Late Response                 1663                  80  
Very Early Response          15248                1335  
Very Late Response             110                1741  
---
                     precision    recall  f1-score   support

     Early Response       0.36      0.00      0.00     13609
      Late Respo

In [14]:
# Logistic regression on the validate split: compute accuracy once and print
# the stored value (it is reused in the evaluation summary).
log_accuracy_validate = accuracy_score(y_validate.actual, y_validate.log_pred)
print(' Log Accuracy: {:.5%}'.format(log_accuracy_validate))
print('---')
print('Confusion Matrix')
# rows = predicted label, columns = actual label
print(pd.crosstab(y_validate.log_pred, y_validate.actual))
print('---')
print(classification_report(y_validate.actual, y_validate.log_pred))

 Log Accuracy: 78.79789%
---
Confusion Matrix
actual               Very Early Response  Early Response  On Time Response  \
log_pred                                                                     
Early Response                        14               9                10   
Late Response                         71              22                33   
Very Early Response                66021            5802              4997   
Very Late Response                     2               0                 0   

actual               Late Response  Very Late Response  
log_pred                                                
Early Response                   4                   0  
Late Response                  702                  37  
Very Early Response           6543                 570  
Very Late Response              51                 745  
---
                     precision    recall  f1-score   support

     Early Response       0.24      0.00      0.00      5833
      Late Respo

### Takeaways

- 

## Decision Tree - Deeper Max Depth - Selected Features

- Next we looked at the Decision Tree model

- The best iteration of this model was found by doubling the max depth parameter as well as only incorporating features we found to be significant

In [15]:
# Build tree-ready copies of each feature split: drop the raw text columns,
# the datetime column, and council_district (its one-hot district_* columns
# stay in the frame).
dt_train, dt_validate, dt_test = (
    split.drop(columns=['dept', 'call_reason', 'source_id', 'council_district', 'open_date'])
    for split in (X_train, X_validate, X_test)
)

In [16]:
# Decision tree limited to depth 6 (seeded), trained on the training split
clf = DecisionTreeClassifier(max_depth=6, random_state=123).fit(dt_train, y_train.actual)

# store the predicted label for each split beside its actual label
for target_df, feature_df in (
    (y_train, dt_train),
    (y_validate, dt_validate),
    (y_test, dt_test),
):
    target_df['dt_pred'] = clf.predict(feature_df)

In [17]:
# Decision tree on the training split: compute accuracy once and print the
# stored value (it is reused in the evaluation summary).
dt_accuracy_train = accuracy_score(y_train.actual, y_train.dt_pred)
print(' DT Accuracy: {:.5%}'.format(dt_accuracy_train))
print('---')
print('Confusion Matrix')
# rows = predicted label, columns = actual label
print(pd.crosstab(y_train.dt_pred, y_train.actual))
print('---')
print(classification_report(y_train.actual, y_train.dt_pred))

 DT Accuracy: 80.50598%
---
Confusion Matrix
actual               Very Early Response  Early Response  On Time Response  \
dt_pred                                                                      
Late Response                        367             404               393   
On Time Response                      37               1                48   
Very Early Response               153848           13204             11320   
Very Late Response                     0               0                 0   

actual               Late Response  Very Late Response  
dt_pred                                                 
Late Response                 4261                  87  
On Time Response                14                   0  
Very Early Response          12722                 367  
Very Late Response              35                2702  
---
                     precision    recall  f1-score   support

     Early Response       0.00      0.00      0.00     13609
      Late Respon

In [18]:
# Decision tree on the validate split: compute accuracy once and print the
# stored value (it is reused in the evaluation summary).
dt_accuracy_validate = accuracy_score(y_validate.actual, y_validate.dt_pred)
print(' DT Accuracy: {:.5%}'.format(dt_accuracy_validate))
print('---')
print('Confusion Matrix')
# rows = predicted label, columns = actual label
print(pd.crosstab(y_validate.dt_pred, y_validate.actual))
print('---')
print(classification_report(y_validate.actual, y_validate.dt_pred))

 DT Accuracy: 80.44329%
---
Confusion Matrix
actual               Very Early Response  Early Response  On Time Response  \
dt_pred                                                                      
Late Response                        161             173               175   
On Time Response                      13               1                14   
Very Early Response                65934            5659              4851   
Very Late Response                     0               0                 0   

actual               Late Response  Very Late Response  
dt_pred                                                 
Late Response                 1788                  39  
On Time Response                 3                   0  
Very Early Response           5496                 163  
Very Late Response              13                1150  
---
                     precision    recall  f1-score   support

     Early Response       0.00      0.00      0.00      5833
      Late Respon

### Takeaways

- 

## K Nearest Neighbors - Uniform Weight / Chosen Features

- Next we looked at KNN Models

- The model that performed the best was a model using uniform weight and selecting features that were found to be significant

In [19]:
# Columns the KNN model is trained on: the numeric district / due-days
# columns plus every one-hot district, department, call-reason and source flag
features1 = [
    'council_district', 'resolution_days_due',
    'district_1', 'district_2', 'district_3', 'district_4', 'district_5',
    'district_6', 'district_7', 'district_8', 'district_9', 'district_10',
    'animal_care_services', 'code_enforcement_services', 'customer_services',
    'development_services', 'metro_health', 'parks_and_rec',
    'solid_waste_management', 'trans_and_cap_improvements', 'unknown_dept',
    'buildings', 'business', 'cleanup', 'code', 'customer_service', 'field',
    'land', 'license', 'misc', 'storm', 'streets', 'trades', 'traffic', 'waste',
    'web_portal', '311_mobile_app', 'constituent_call',
    'internal_services_requests',
]

# 5-nearest-neighbours with equal vote per neighbour
# (scikit-learn's built-in weights options are 'uniform' and 'distance')
# NOTE(review): these are the unscaled X_* frames, so resolution_days_due
# (range 1-934) will dominate the distance over the 0/1 flags — confirm
# whether the scaled splits were intended here.
knn = KNeighborsClassifier(n_neighbors=5, weights='uniform')
knn.fit(X_train[features1], y_train.actual)

# store the predicted label for each split beside its actual label
for target_df, feature_df in (
    (y_train, X_train),
    (y_validate, X_validate),
    (y_test, X_test),
):
    target_df['knn_pred'] = knn.predict(feature_df[features1])

# per-class probabilities for the training rows
knn_pred_proba = knn.predict_proba(X_train[features1])

In [20]:
# KNN on the training split: compute accuracy once and print the stored
# value (it is reused in the evaluation summary).
knn_accuracy_train = accuracy_score(y_train.actual, y_train.knn_pred)
print(' KNN Accuracy: {:.5%}'.format(knn_accuracy_train))
print('---')
print('Confusion Matrix')
# rows = predicted label, columns = actual label
print(pd.crosstab(y_train.knn_pred, y_train.actual))
print('---')
print(classification_report(y_train.actual, y_train.knn_pred))

 KNN Accuracy: 75.78249%
---
Confusion Matrix
actual               Very Early Response  Early Response  On Time Response  \
knn_pred                                                                     
Early Response                      2761            1049               425   
Late Response                       4760             483               701   
On Time Response                    2359             473              1353   
Very Early Response               144279           11585              9272   
Very Late Response                    93              19                10   

actual               Late Response  Very Late Response  
knn_pred                                                
Early Response                 723                  56  
Late Response                 3009                  94  
On Time Response               558                   9  
Very Early Response          12719                1266  
Very Late Response              23                1731  
---
    

In [21]:
# KNN on the validate split: compute accuracy once and print the stored
# value (it is reused in the evaluation summary).
knn_accuracy_validate = accuracy_score(y_validate.actual, y_validate.knn_pred)
print(' KNN Accuracy: {:.5%}'.format(knn_accuracy_validate))
print('---')
print('Confusion Matrix')
# rows = predicted label, columns = actual label
print(pd.crosstab(y_validate.knn_pred, y_validate.actual))
print('---')
print(classification_report(y_validate.actual, y_validate.knn_pred))

 KNN Accuracy: 75.14393%
---
Confusion Matrix
actual               Very Early Response  Early Response  On Time Response  \
knn_pred                                                                     
Early Response                      1265             369               213   
Late Response                       2191             233               301   
On Time Response                    1071             214               512   
Very Early Response                61531            5011              4004   
Very Late Response                    50               6                10   

actual               Late Response  Very Late Response  
knn_pred                                                
Early Response                 309                  23  
Late Response                 1219                  43  
On Time Response               243                   4  
Very Early Response           5510                 565  
Very Late Response              19                 717  
---
    

### Takeaways

- 

## Random Forest - Increased Max Depth & Min Samples / Best Features

- Next we moved on to Random Forest modeling

- The best version of this model came from increasing the max depth and min samples leaf parameters and using feature engineering to determine the best features

In [22]:
# Build forest-ready copies of each feature split: drop the raw text columns,
# the datetime column, and council_district (its one-hot district_* columns
# stay in the frame).
rf_train, rf_validate, rf_test = (
    split.drop(columns=['dept', 'call_reason', 'source_id', 'council_district', 'open_date'])
    for split in (X_train, X_validate, X_test)
)

# the subset of columns the random forest is actually trained on
features = [
    'resolution_days_due',
    'code_enforcement_services',
    'customer_services',
    'development_services',
    'solid_waste_management',
    'open_week',
]

In [23]:
# Random forest (depth <= 10, at least 3 samples per leaf, seeded), trained
# on the selected feature subset of the training split
rf = RandomForestClassifier(max_depth=10, min_samples_leaf=3, random_state=123)
rf = rf.fit(rf_train[features], y_train.actual)

# store the predicted label for each split beside its actual label
for target_df, feature_df in (
    (y_train, rf_train),
    (y_validate, rf_validate),
    (y_test, rf_test),
):
    target_df['rf_pred'] = rf.predict(feature_df[features])

In [24]:
# Random forest on the training split: compute accuracy once and print the
# stored value (it is reused in the evaluation summary).
rf_accuracy_train = accuracy_score(y_train.actual, y_train.rf_pred)
print(' RF Accuracy: {:.5%}'.format(rf_accuracy_train))
print('---')
print('Confusion Matrix')
# rows = predicted label, columns = actual label
print(pd.crosstab(y_train.rf_pred, y_train.actual))
print('---')
print(classification_report(y_train.actual, y_train.rf_pred))

 RF Accuracy: 78.60517%
---
Confusion Matrix
actual               Very Early Response  Early Response  On Time Response  \
rf_pred                                                                      
Early Response                         0              20                 1   
Late Response                        224              60                59   
On Time Response                     433             177               779   
Very Early Response               153593           13352             10922   
Very Late Response                     2               0                 0   

actual               Late Response  Very Late Response  
rf_pred                                                 
Early Response                   5                   0  
Late Response                 1082                   3  
On Time Response               164                   4  
Very Early Response          15779                1562  
Very Late Response               2                1587  
---
     

In [25]:
# Random forest on the validate split: compute accuracy once and print the
# stored value (it is reused in the evaluation summary).
rf_accuracy_validate = accuracy_score(y_validate.actual, y_validate.rf_pred)
print(' RF Accuracy: {:.5%}'.format(rf_accuracy_validate))
print('---')
print('Confusion Matrix')
# rows = predicted label, columns = actual label
print(pd.crosstab(y_validate.rf_pred, y_validate.actual))
print('---')
print(classification_report(y_validate.actual, y_validate.rf_pred))

 RF Accuracy: 78.54916%
---
Confusion Matrix
actual               Very Early Response  Early Response  On Time Response  \
rf_pred                                                                      
Early Response                         2               4                 1   
Late Response                        107              24                21   
On Time Response                     198              74               344   
Very Early Response                65801            5731              4674   
Very Late Response                     0               0                 0   

actual               Late Response  Very Late Response  
rf_pred                                                 
Early Response                   1                   0  
Late Response                  446                   1  
On Time Response                69                   2  
Very Early Response           6783                 680  
Very Late Response               1                 669  
---
     

### Takeaways

- 

## Ridge Classifier - Default Parameters / Selected Features 

- Next we looked into the Ridge Classifier model

- The best version of this model we found used `RidgeClassifierCV` with its default parameters and our selected features

In [26]:
# Ridge classifier with built-in cross-validation; no arguments are passed,
# so the library defaults apply. Fit on the scaled training features.
rc_cv = RidgeClassifierCV().fit(train_scaled, y_train.actual)

# store the predicted label for each split beside its actual label
for target_df, feature_df in (
    (y_train, train_scaled),
    (y_validate, validate_scaled),
    (y_test, test_scaled),
):
    target_df['rc_cv_pred'] = rc_cv.predict(feature_df)

In [27]:
# Ridge classifier on the training split: compute accuracy once and print
# the stored value (it is reused in the evaluation summary).
rc_cv_accuracy_train = accuracy_score(y_train.actual, y_train.rc_cv_pred)
print(' RC_CV Accuracy: {:.5%}'.format(rc_cv_accuracy_train))
print('---')
print('Confusion Matrix')
# rows = predicted label, columns = actual label
print(pd.crosstab(y_train.rc_cv_pred, y_train.actual))
print('---')
print(classification_report(y_train.actual, y_train.rc_cv_pred))

 RC_CV Accuracy: 78.22331%
---
Confusion Matrix
actual               Very Early Response  Early Response  On Time Response  \
rc_cv_pred                                                                   
Early Response                        21              21                11   
Late Response                        203              53                28   
Very Early Response               154028           13535             11722   
Very Late Response                     0               0                 0   

actual               Late Response  Very Late Response  
rc_cv_pred                                              
Early Response                   6                   0  
Late Response                  694                  42  
Very Early Response          16332                1559  
Very Late Response               0                1555  
---
                     precision    recall  f1-score   support

     Early Response       0.36      0.00      0.00     13609
      Late Res

In [28]:
# Ridge classifier on the validate split: compute accuracy once and print
# the stored value (it is reused in the evaluation summary).
rc_cv_accuracy_validate = accuracy_score(y_validate.actual, y_validate.rc_cv_pred)
print(' RC_CV Accuracy: {:.5%}'.format(rc_cv_accuracy_validate))
print('---')
print('Confusion Matrix')
# rows = predicted label, columns = actual label
print(pd.crosstab(y_validate.rc_cv_pred, y_validate.actual))
print('---')
print(classification_report(y_validate.actual, y_validate.rc_cv_pred))

 RC_CV Accuracy: 78.20817%
---
Confusion Matrix
actual               Very Early Response  Early Response  On Time Response  \
rc_cv_pred                                                                   
Early Response                         9               6                 6   
Late Response                         83              11                12   
Very Early Response                66016            5816              5022   
Very Late Response                     0               0                 0   

actual               Late Response  Very Late Response  
rc_cv_pred                                              
Early Response                   2                   0  
Late Response                  292                  18  
Very Early Response           7006                 676  
Very Late Response               0                 658  
---
                     precision    recall  f1-score   support

     Early Response       0.26      0.00      0.00      5833
      Late Res

### Takeaways

- 

## SGD Classifier - Penalty Adjusted to Elastic - Selected Features 

- Lastly, we tried an SGD Classifier model

- The best version of this model we found set the penalty to elastic net and used our selected features

In [None]:
# SGD-trained linear classifier with an elastic-net penalty (seeded),
# fit on the scaled training features
sgd = SGDClassifier(max_iter=1000, tol=1e-3, random_state=123, penalty='elasticnet')
sgd = sgd.fit(train_scaled, y_train.actual)

# store the predicted label for each split beside its actual label
for target_df, feature_df in (
    (y_train, train_scaled),
    (y_validate, validate_scaled),
    (y_test, test_scaled),
):
    target_df['sgd_pred'] = sgd.predict(feature_df)

In [None]:
# SGD classifier on the training split: compute accuracy once and print the
# stored value (it is reused in the evaluation summary).
sgd_accuracy_train = accuracy_score(y_train.actual, y_train.sgd_pred)
print(' SGD Accuracy: {:.5%}'.format(sgd_accuracy_train))
print('---')
print('Confusion Matrix')
# rows = predicted label, columns = actual label
print(pd.crosstab(y_train.sgd_pred, y_train.actual))
print('---')
print(classification_report(y_train.actual, y_train.sgd_pred))

In [None]:
# SGD classifier on the validate split: compute accuracy once and print the
# stored value (it is reused in the evaluation summary).
sgd_accuracy_validate = accuracy_score(y_validate.actual, y_validate.sgd_pred)
print(' SGD Accuracy: {:.5%}'.format(sgd_accuracy_validate))
print('---')
print('Confusion Matrix')
# rows = predicted label, columns = actual label
print(pd.crosstab(y_validate.sgd_pred, y_validate.actual))
print('---')
print(classification_report(y_validate.actual, y_validate.sgd_pred))

### Takeaways

- 

# Evaluation 

- Train Accuracy

In [None]:
# Side-by-side training accuracy for the baseline and every fitted model;
# each row pairs its print template with the stored accuracy value.
for template, score in (
    (' Baseline Accuracy: {:.5%}', baseline_accuracy),
    (' Log Accuracy: {:.5%}', log_accuracy_train),
    (' DT Accuracy: {:.5%}', dt_accuracy_train),
    (' KNN Accuracy: {:.5%}', knn_accuracy_train),
    (' RF Accuracy: {:.5%}', rf_accuracy_train),
    (' RC_CV Accuracy: {:.5%}', rc_cv_accuracy_train),
    (' SGD Accuracy: {:.5%}', sgd_accuracy_train),
):
    print(template.format(score))

- Validate Accuracy

In [None]:
# Validate-split comparison. The baseline has to be scored on the same split
# as the models it is compared with: here it is the share of validate rows
# whose actual label is the training-majority class ('Very Early Response'),
# i.e. the accuracy of always predicting that class on validate.
baseline_accuracy_validate = (y_validate.actual == 'Very Early Response').mean()
print(' Baseline Accuracy: {:.5%}'.format(baseline_accuracy_validate))
print(' Log Accuracy: {:.5%}'.format(log_accuracy_validate))
print(' DT Accuracy: {:.5%}'.format(dt_accuracy_validate))
print(' KNN Accuracy: {:.5%}'.format(knn_accuracy_validate))
print(' RF Accuracy: {:.5%}'.format(rf_accuracy_validate))
print(' RC_CV Accuracy: {:.5%}'.format(rc_cv_accuracy_validate))
print(' SGD Accuracy: {:.5%}'.format(sgd_accuracy_validate))