- environment setup

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# ignore warnings
import warnings
warnings.filterwarnings("ignore")

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score

In [3]:
import wrangle

from wrangle import clean_311

import model

from model import split_separate_scale

In [4]:
def keep_info(df):
    df.drop(df.columns.difference(['dept','call_reason', 'source_id',
                                   'council_district', 'resolution_days_due', 'level_of_delay']), 1, inplace=True)
    return df

# Model Prep

def dummy_dept(df):
    # dummy dept feature
    dummy_df =  pd.get_dummies(df['dept'])
    # Name the new columns
    dummy_df.columns = ['animal_care_services', 'code_enforcement_services', 
                        'customer_services', 'development_services', 
                        'metro_health', 'parks_and_rec',
                        'solid_waste_management', 'trans_and_cap_improvements', 
                        'unknown_dept']
    # add the dummies to the data frame
    df = pd.concat([df, dummy_df], axis=1)
    return df
    
def dummy_call_reason(df):
    # dummy dept feature
    dummy_df =  pd.get_dummies(df['call_reason'])
    # Name the new columns
    dummy_df.columns = ['buildings', 'business', 'cleanup', 'code',
                        'customer_service', 'field', 'land',
                        'license', 'misc', 'storm', 'streets', 'trades', 
                        'traffic', 'waste']
    # add the dummies to the data frame
    df = pd.concat([df, dummy_df], axis=1)
    return df

def make_source_id_dummies(df):
    '''This function takes in the cleaned dataframe, makes dummy variables of the source id column, readds the names of the
    dummy columns and returns the concatenated dummy dataframe to the original dataframe.'''
    #make dummies
    dummy_df = pd.get_dummies(df['source_id'])
    #add back column names
    dummy_df.columns = ['web_portal', '311_mobile_app', 
                        'constituent_call', 'interal_services_requests']
    # concatenate dummies to the cleaned data frame
    df = pd.concat([df, dummy_df], axis=1)
    return df

def create_dummies(df):
    '''This function creates dummy variables for Council Districts'''
    # set what we are going to create these dummies from
    dummy_df =  pd.get_dummies(df['council_district'])
    # Name the new columns
    dummy_df.columns = ['district_0', 'district_1', 'district_2', 
                        'district_3', 'district_4', 'district_5',
                        'district_6', 'district_7', 'district_8',
                        'district_9', 'district_10']
    # add the dummies to the data frame
    df = pd.concat([df, dummy_df], axis=1)
    return df

def make_source_id_dummies(df):
    '''This function takes in the cleaned dataframe, makes dummy variables of the source id column, readds the names of the
    dummy columns and returns the concatenated dummy dataframe to the original dataframe.'''
    #make dummies
    dummy_df = pd.get_dummies(df['source_id'])
    #add back column names
    dummy_df.columns = ['web_portal', '311_mobile_app', 'constituent_call', 'internal_services_requests']
    # concatenate dummies to the cleaned data frame
    df = pd.concat([df, dummy_df], axis=1)
    return df


def model_df():
    '''This function reads in the clean 311 dataframe, applies all of the above functions to prepare it for modeling. 
    The function then returns a cleaned dataframe ready for modeling.'''
    df= wrangle.clean_311(wrangle.get_311_data())
    df= keep_info(df)
    df= dummy_dept(df)
    df= dummy_call_reason(df)
    df= make_source_id_dummies(df)

    return df

In [5]:
pd.set_option("display.max_rows", None, "display.max_columns", None)

- Wrangle 

In [6]:
# acquire data
df = pd.read_csv('allservicecalls.csv')

In [7]:
df.head()

Unnamed: 0,Category,CASEID,OPENEDDATETIME,SLA_Date,CLOSEDDATETIME,Late (Yes/No),Dept,REASONNAME,TYPENAME,CaseStatus,SourceID,OBJECTDESC,Council District,XCOORD,YCOORD,Report Starting Date,Report Ending Date
0,Graffiti,1010444245,2012-08-15T00:00:00,2012-08-30T00:00:00,,YES,Code Enforcement Services,Graffiti,Graffiti Public Property,Open,Web Portal,"600 NOGALITOS ST, San Antonio, 78204",5,2125683.0,13695548.0,2020-05-15T00:00:00,2021-05-15T00:00:00
1,Property Maintenance,1010888252,2013-06-06T00:00:00,2013-08-09T00:00:00,,YES,Code Enforcement Services,Code Enforcement (IntExp),Alley-Way Maintenance,Open,Web Portal,"6043 CASTLE QUEEN, San Antonio, 78218",2,2169702.0,13725769.0,2020-05-15T00:00:00,2021-05-15T00:00:00
2,Property Maintenance,1010966128,2013-07-19T00:00:00,2013-09-23T00:00:00,,YES,Code Enforcement Services,Code Enforcement (IntExp),Junk Vehicle On Private Property,Open,Web Portal,"842 KIRK PL, San Antonio, 78226",5,2116192.0,13692260.0,2020-05-15T00:00:00,2021-05-15T00:00:00
3,Property Maintenance,1011052825,2013-09-16T00:00:00,2013-09-30T00:00:00,,YES,Code Enforcement Services,Code Enforcement,Right Of Way/Sidewalk Obstruction,Open,Internal Services Requests,"54 KENROCK RIDGE, San Antonio, 78254",7,2082242.0,13737817.0,2020-05-15T00:00:00,2021-05-15T00:00:00
4,Property Maintenance,1011052826,2013-09-16T00:00:00,2013-09-30T00:00:00,,YES,Code Enforcement Services,Code Enforcement,Right Of Way/Sidewalk Obstruction,Open,Internal Services Requests,"74 KENROCK RIDGE, San Antonio, 78254",7,2082389.0,13737877.0,2020-05-15T00:00:00,2021-05-15T00:00:00


In [8]:
df.shape

(495440, 17)

In [9]:
# clean data
df = clean_311(df)

In [10]:
model_df = keep_info(df)

In [11]:
model_df = dummy_dept(model_df)

In [12]:
model_df = make_source_id_dummies(model_df)

In [13]:
model_f = dummy_call_reason(model_df)

In [14]:
model_df = create_dummies(model_df)

In [15]:
model_df = model_df.drop(columns = ['dept', 'call_reason', 'council_district', 'source_id'])

In [16]:
def create_model_df(df):
    
    '''
    This function will take in a cleaned dataframe and return a dataframe prepared for numeric based modeling
    '''
    
    # only keep chosen variable
    model_df = keep_info(df)
    
    # make dummies of chosen variables
    model_df = dummy_dept(model_df)
    model_df = make_source_id_dummies(model_df)
    model_f = dummy_call_reason(model_df)
    model_df = create_dummies(model_df)
    
    # drop categorical columns once they've been dummied
    model_df = model_df.drop(columns = ['dept', 'call_reason', 'council_district', 'source_id'])
    
    return model_df

In [42]:
model_df2 = create_model_df(df)

In [43]:
model_df2.head()

Unnamed: 0,resolution_days_due,level_of_delay,animal_care_services,code_enforcement_services,customer_services,development_services,metro_health,parks_and_rec,solid_waste_management,trans_and_cap_improvements,unknown_dept,web_portal,311_mobile_app,constituent_call,internal_services_requests,district_0,district_1,district_2,district_3,district_4,district_5,district_6,district_7,district_8,district_9,district_10
2497,131,Extremely Late Response,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0
2498,131,Extremely Late Response,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0
2499,131,Extremely Late Response,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0
2556,933,Very Early Response,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0
2657,131,Extremely Late Response,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0


In [17]:
model_df.head()

Unnamed: 0,resolution_days_due,level_of_delay,animal_care_services,code_enforcement_services,customer_services,development_services,metro_health,parks_and_rec,solid_waste_management,trans_and_cap_improvements,unknown_dept,web_portal,311_mobile_app,constituent_call,internal_services_requests,district_0,district_1,district_2,district_3,district_4,district_5,district_6,district_7,district_8,district_9,district_10
2497,131,Extremely Late Response,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0
2498,131,Extremely Late Response,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0
2499,131,Extremely Late Response,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0
2556,933,Very Early Response,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0
2657,131,Extremely Late Response,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0


In [18]:
model_df.shape

(428870, 26)

In [19]:
model_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 428870 entries, 2497 to 495432
Data columns (total 26 columns):
 #   Column                      Non-Null Count   Dtype   
---  ------                      --------------   -----   
 0   resolution_days_due         428870 non-null  int64   
 1   level_of_delay              428870 non-null  category
 2   animal_care_services        428870 non-null  uint8   
 3   code_enforcement_services   428870 non-null  uint8   
 4   customer_services           428870 non-null  uint8   
 5   development_services        428870 non-null  uint8   
 6   metro_health                428870 non-null  uint8   
 7   parks_and_rec               428870 non-null  uint8   
 8   solid_waste_management      428870 non-null  uint8   
 9   trans_and_cap_improvements  428870 non-null  uint8   
 10  unknown_dept                428870 non-null  uint8   
 11  web_portal                  428870 non-null  uint8   
 12  311_mobile_app              428870 non-null  uint8   
 

In [20]:
train, validate, test, X_train, y_train, X_validate, y_validate, X_test, y_test, train_scaled, validate_scaled, test_scaled = split_separate_scale(model_df)

In [21]:
train.head()

Unnamed: 0,resolution_days_due,level_of_delay,animal_care_services,code_enforcement_services,customer_services,development_services,metro_health,parks_and_rec,solid_waste_management,trans_and_cap_improvements,unknown_dept,web_portal,311_mobile_app,constituent_call,internal_services_requests,district_0,district_1,district_2,district_3,district_4,district_5,district_6,district_7,district_8,district_9,district_10
260102,5,Early Response,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0
202138,7,Early Response,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1
210324,2,Early Response,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0
182502,64,Early Response,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0
219960,4,Early Response,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0


In [22]:
train.shape

(240167, 26)

In [23]:
train_scaled.shape

(240167, 25)

In [24]:
y_train.shape

(240167,)

# Establish Baseline

In [25]:
# look at values of target variable

y_train.value_counts()

Early Response              181108
On Time Response             33203
Very Early Response          17279
Late Response                 5839
Very Late Response            2021
Extremely Late Response        635
Extremely Early Response        82
Name: level_of_delay, dtype: int64

In [26]:
# set up as dataframes
y_train = pd.DataFrame(dict(actual=y_train))
y_validate = pd.DataFrame(dict(actual=y_validate))
y_test = pd.DataFrame(dict(actual=y_test))

In [27]:
# 'Early Response' is by far the most frequent value so that will be our baseline
y_train['baseline'] = 'Early Response'

In [28]:
y_train.head()

Unnamed: 0,actual,baseline
260102,Early Response,Early Response
202138,Early Response,Early Response
210324,Early Response,Early Response
182502,Early Response,Early Response
219960,Early Response,Early Response


In [29]:
# calculate accuracy of baseline

print(' Baseline Accuracy: {:.2%}'.format(accuracy_score(y_train.actual, y_train.baseline)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(y_train.baseline, y_train.actual))
print('---')
print(classification_report(y_train.actual, y_train.baseline))

 Baseline Accuracy: 75.41%
---
Confusion Matrix
actual          Extremely Late Response  Very Late Response  Late Response  \
baseline                                                                     
Early Response                      635                2021           5839   

actual          On Time Response  Early Response  Very Early Response  \
baseline                                                                
Early Response             33203          181108                17279   

actual          Extremely Early Response  
baseline                                  
Early Response                        82  
---
                          precision    recall  f1-score   support

          Early Response       0.75      1.00      0.86    181108
Extremely Early Response       0.00      0.00      0.00        82
 Extremely Late Response       0.00      0.00      0.00       635
           Late Response       0.00      0.00      0.00      5839
        On Time Response       0.

#### Takeaways

- Baseline of 'Early Response' is correct about 75% of the time

- This could be a difficult baseline to beat 

# Modeling

In [30]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 240167 entries, 260102 to 33841
Data columns (total 26 columns):
 #   Column                      Non-Null Count   Dtype   
---  ------                      --------------   -----   
 0   resolution_days_due         240167 non-null  int64   
 1   level_of_delay              240167 non-null  category
 2   animal_care_services        240167 non-null  uint8   
 3   code_enforcement_services   240167 non-null  uint8   
 4   customer_services           240167 non-null  uint8   
 5   development_services        240167 non-null  uint8   
 6   metro_health                240167 non-null  uint8   
 7   parks_and_rec               240167 non-null  uint8   
 8   solid_waste_management      240167 non-null  uint8   
 9   trans_and_cap_improvements  240167 non-null  uint8   
 10  unknown_dept                240167 non-null  uint8   
 11  web_portal                  240167 non-null  uint8   
 12  311_mobile_app              240167 non-null  uint8   


In [31]:
train_scaled.head()

Unnamed: 0,resolution_days_due,animal_care_services,code_enforcement_services,customer_services,development_services,metro_health,parks_and_rec,solid_waste_management,trans_and_cap_improvements,unknown_dept,web_portal,311_mobile_app,constituent_call,internal_services_requests,district_0,district_1,district_2,district_3,district_4,district_5,district_6,district_7,district_8,district_9,district_10
0,0.009381,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.013133,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.003752,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0.120075,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.007505,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [32]:
train.head()

Unnamed: 0,resolution_days_due,level_of_delay,animal_care_services,code_enforcement_services,customer_services,development_services,metro_health,parks_and_rec,solid_waste_management,trans_and_cap_improvements,unknown_dept,web_portal,311_mobile_app,constituent_call,internal_services_requests,district_0,district_1,district_2,district_3,district_4,district_5,district_6,district_7,district_8,district_9,district_10
260102,5,Early Response,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0
202138,7,Early Response,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1
210324,2,Early Response,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0
182502,64,Early Response,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0
219960,4,Early Response,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0


In [33]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 240167 entries, 260102 to 33841
Data columns (total 26 columns):
 #   Column                      Non-Null Count   Dtype   
---  ------                      --------------   -----   
 0   resolution_days_due         240167 non-null  int64   
 1   level_of_delay              240167 non-null  category
 2   animal_care_services        240167 non-null  uint8   
 3   code_enforcement_services   240167 non-null  uint8   
 4   customer_services           240167 non-null  uint8   
 5   development_services        240167 non-null  uint8   
 6   metro_health                240167 non-null  uint8   
 7   parks_and_rec               240167 non-null  uint8   
 8   solid_waste_management      240167 non-null  uint8   
 9   trans_and_cap_improvements  240167 non-null  uint8   
 10  unknown_dept                240167 non-null  uint8   
 11  web_portal                  240167 non-null  uint8   
 12  311_mobile_app              240167 non-null  uint8   


In [34]:
train.shape

(240167, 26)

### Model 1: Logistic Regression - Default Parameters / Chosen Features

In [35]:
# make logistic regression object
log1 = LogisticRegression(random_state=123)

In [36]:
# fit logistic regression object to data
log1.fit(train_scaled, y_train.actual)

LogisticRegression(random_state=123)

In [37]:
# evaluate feature importance
print('Coefficient: \n', log1.coef_)
print('Intercept: \n', log1.intercept_)

Coefficient: 
 [[-1.23543497e+01  2.39244641e+00 -4.47192771e-01 -6.94894069e+00
   1.08975441e+00  1.72548008e+00 -2.98826524e-01  2.59754941e+00
   1.61009391e+00  9.91813388e-01  1.51115278e+00  1.34613243e+00
  -1.93105784e+00  1.78595025e+00 -4.22683347e-01  5.48818836e-01
   1.29659887e-01  1.49214015e-01  1.51411960e-01  1.93774954e-01
   1.20848965e-01  4.98079090e-01  5.38282167e-01  3.81474911e-01
   4.23296185e-01]
 [ 9.56475406e+00 -9.43619012e-01 -1.97230802e-01 -8.23381718e-02
  -2.69045033e+00 -4.99823372e-02 -1.78063064e-04 -1.20778724e+00
  -5.45824051e-01  2.50901767e+00 -1.52975826e+00 -1.60208227e+00
  -9.86730820e-02  2.21212850e-02 -1.91920954e-01 -6.62795984e-01
  -2.11416337e-01 -9.91546650e-01 -7.61323673e-02  3.97422605e-01
   2.77173000e-01 -3.73705339e-01 -3.64319327e-01 -6.72520680e-01
  -3.38630297e-01]
 [-5.63426717e-01 -1.56000554e+00 -7.48216151e-02  5.30719821e+00
  -2.59337610e+00 -7.55040258e-02 -4.29706853e-03  2.03509371e+00
  -1.02615396e+00 -2.47

In [38]:
# make predictions
y_train['log1_pred'] = log1.predict(train_scaled)
y_validate['log1_pred'] = log1.predict(validate_scaled)
y_test['log1_pred'] = log1.predict(test_scaled)

In [39]:
y_train.head()

Unnamed: 0,actual,baseline,log1_pred
260102,Early Response,Early Response,Early Response
202138,Early Response,Early Response,Early Response
210324,Early Response,Early Response,Early Response
182502,Early Response,Early Response,Early Response
219960,Early Response,Early Response,Early Response


- Evaluate Model

In [40]:
# calculate accuracy of log1 on train

print(' Log1 Accuracy: {:.5%}'.format(accuracy_score(y_train.actual, y_train.log1_pred)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(y_train.log1_pred, y_train.actual))
print('---')
print(classification_report(y_train.actual, y_train.log1_pred))

 Log1 Accuracy: 81.76352%
---
Confusion Matrix
actual                   Extremely Late Response  Very Late Response  \
log1_pred                                                              
Early Response                                56                1208   
Extremely Late Response                      475                 288   
Late Response                                 46                  87   
On Time Response                               0                   2   
Very Early Response                           16                 377   
Very Late Response                            42                  59   

actual                   Late Response  On Time Response  Early Response  \
log1_pred                                                                  
Early Response                    4206             32415          178745   
Extremely Late Response              0                 0               0   
Late Response                      195                51              78

In [41]:
# calculate accuracy of log1 on validate

print(' Log1 Accuracy: {:.5%}'.format(accuracy_score(y_validate.actual, y_validate.log1_pred)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(y_validate.log1_pred, y_validate.actual))
print('---')
print(classification_report(y_validate.actual, y_validate.log1_pred))

 Log1 Accuracy: 81.74178%
---
Confusion Matrix
actual                   Extremely Late Response  Very Late Response  \
log1_pred                                                              
Early Response                                26                 517   
Extremely Late Response                      202                 127   
Late Response                                 23                  41   
On Time Response                               0                   3   
Very Early Response                            4                 160   
Very Late Response                            17                  18   

actual                   Late Response  On Time Response  Early Response  \
log1_pred                                                                  
Early Response                    1828             13884           76611   
Extremely Late Response              0                 0               0   
Late Response                       68                13              24

#### Takeaways

- This model performs moderately better than baseline

- performs worse on validate than on train, but the difference in negligable 

### Model 2: Logistic Regresson - Balanced Weight / Chosen Features 

In [44]:
# make logistic regression object
log2 = LogisticRegression(class_weight='balanced', random_state=123)

In [45]:
# fit logistic regression object to data
log2.fit(train_scaled, y_train.actual)

LogisticRegression(class_weight='balanced', random_state=123)

In [47]:
#evaluate feature importance
print('Coefficient: \n', log2.coef_)
print('Intercept: \n', log2.intercept_)

Coefficient: 
 [[-1.44761479e+01  2.63502423e+00 -1.50951159e+00 -3.88871377e+00
   3.49913142e-01  2.48129071e+00 -2.83231453e-01  2.26071990e+00
   7.93079131e-01 -6.75782354e-02  2.20139840e+00  8.89487579e-01
  -2.14976254e+00  1.82986864e+00 -5.94731307e-01  3.88821391e-01
   2.21906175e-01  2.32955611e-01  1.95147884e-01  1.70927811e-01
   2.51167881e-01  5.74110413e-01  5.28594121e-01  5.05216593e-01
   2.96875500e-01]
 [ 3.31933500e+01 -2.52106862e+00 -1.36019099e+00 -1.77125679e+00
  -6.60511643e+00 -8.52016468e-02 -2.51868182e-03 -3.40151506e+00
  -1.03239215e+00  6.60035643e+00 -2.44587678e+00 -3.73041950e+00
  -8.70643966e-01 -3.13196369e+00 -4.69165465e-01 -4.75886534e-01
  -1.20843541e+00 -1.38650023e+00 -8.52169089e-01 -5.25787655e-02
  -7.97143396e-01 -1.07748665e+00 -1.05415177e+00 -2.05622032e+00
  -7.49166300e-01]
 [-2.69846035e+00 -4.75270016e+00  1.16058100e+00  9.06131504e+00
  -5.79107370e-01 -2.85971865e-01 -9.96969126e-02  2.23786311e+00
   1.16711921e-01 -5.74

In [48]:
# make predictions
y_train['log2_pred'] = log2.predict(train_scaled)
y_validate['log2_pred'] = log2.predict(validate_scaled)
y_test['log2_pred'] = log2.predict(test_scaled)

In [49]:
y_train.head()

Unnamed: 0,actual,baseline,log1_pred,log2_pred
260102,Early Response,Early Response,Early Response,Early Response
202138,Early Response,Early Response,Early Response,Early Response
210324,Early Response,Early Response,Early Response,Early Response
182502,Early Response,Early Response,Early Response,Late Response
219960,Early Response,Early Response,Early Response,Early Response


- Evaluate Model

In [51]:
# calculate accuracy of log2 on train

print(' Log2 Accuracy: {:.5%}'.format(accuracy_score(y_train.actual, y_train.log2_pred)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(y_train.log2_pred, y_train.actual))
print('---')
print(classification_report(y_train.actual, y_train.log2_pred))

 Log2 Accuracy: 58.57716%
---
Confusion Matrix
actual                    Extremely Late Response  Very Late Response  \
log2_pred                                                               
Early Response                                 11                  78   
Extremely Early Response                        0                   0   
Extremely Late Response                       594                 430   
Late Response                                   0                 527   
On Time Response                                0                  40   
Very Early Response                            14                 318   
Very Late Response                             16                 628   

actual                    Late Response  On Time Response  Early Response  \
log2_pred                                                                   
Early Response                      630             13006          105333   
Extremely Early Response              0                 0       

In [52]:
# calculate accuracy of log2 on validate

print(' Log2 Accuracy: {:.5%}'.format(accuracy_score(y_validate.actual, y_validate.log2_pred)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(y_validate.log2_pred, y_validate.actual))
print('---')
print(classification_report(y_validate.actual, y_validate.log2_pred))

 Log2 Accuracy: 58.64139%
---
Confusion Matrix
actual                    Extremely Late Response  Very Late Response  \
log2_pred                                                               
Early Response                                  7                  35   
Extremely Early Response                        0                   0   
Extremely Late Response                       254                 179   
Late Response                                   0                 239   
On Time Response                                0                  24   
Very Early Response                             2                 137   
Very Late Response                              9                 252   

actual                    Late Response  On Time Response  Early Response  \
log2_pred                                                                   
Early Response                      264              5677           45251   
Extremely Early Response              0                 0       

#### Takeaways 

- This model did very poorly compared to default parameters and does not beat the baseline prediction

### Model 3: Logistic Regression - Lower C / Chosen Features

In [53]:
# make logistic regression object
log3 = LogisticRegression(C = .1, random_state=123)

In [54]:
# fit logistic regression object to data
log3.fit(train_scaled, y_train.actual)

LogisticRegression(C=0.1, random_state=123)

In [55]:
#evaluate feature importance
print('Coefficient: \n', log3.coef_)
print('Intercept: \n', log3.intercept_)

Coefficient: 
 [[-9.23956832e+00  9.28261318e-01 -8.83674352e-01 -3.05745083e+00
   4.26855196e-01  7.97767346e-01 -1.57464460e-01  1.50030036e+00
   1.21508913e+00  6.03524042e-01  7.04854552e-01  7.34809404e-01
  -1.15264585e+00  1.08618965e+00 -4.67909532e-01  3.92942942e-01
  -9.93098800e-03  3.02229746e-02  6.97797699e-02  1.55946889e-01
   1.05349379e-01  3.01474329e-01  3.08308515e-01  2.30807559e-01
   2.56215917e-01]
 [ 4.68883289e+00 -4.52441757e-01 -1.52778628e-01 -4.07236505e-02
  -1.36618392e+00 -3.08597641e-02 -1.09898011e-04 -5.78276190e-01
  -3.61159384e-01  1.76197127e+00 -6.53868025e-01 -7.04070245e-01
  -7.01267449e-02  2.07503090e-01 -7.88053009e-02 -3.23148697e-01
  -1.19452638e-01 -4.89992250e-01 -5.23639133e-03  2.30662636e-01
   2.83939749e-01 -1.59726260e-01 -1.43648310e-01 -2.88972785e-01
  -1.26181679e-01]
 [-2.63343590e-01 -9.20416205e-01  4.70219723e-01  3.24091229e+00
  -1.80532390e+00 -4.13150224e-02 -1.54659691e-03  5.11206810e-01
  -5.11771776e-01 -1.20

In [56]:
# make predictions
y_train['log3_pred'] = log3.predict(train_scaled)
y_validate['log3_pred'] = log3.predict(validate_scaled)
y_test['log3_pred'] = log3.predict(test_scaled)

In [57]:
y_train.head()

Unnamed: 0,actual,baseline,log1_pred,log2_pred,log3_pred
260102,Early Response,Early Response,Early Response,Early Response,Early Response
202138,Early Response,Early Response,Early Response,Early Response,Early Response
210324,Early Response,Early Response,Early Response,Early Response,Early Response
182502,Early Response,Early Response,Early Response,Late Response,Early Response
219960,Early Response,Early Response,Early Response,Early Response,Early Response


- Evaluate Model

In [58]:
# calculate accuracy of log3 on train

print(' Log3 Accuracy: {:.5%}'.format(accuracy_score(y_train.actual, y_train.log3_pred)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(y_train.log3_pred, y_train.actual))
print('---')
print(classification_report(y_train.actual, y_train.log3_pred))

 Log3 Accuracy: 81.75436%
---
Confusion Matrix
actual                   Extremely Late Response  Very Late Response  \
log3_pred                                                              
Early Response                                56                1249   
Extremely Late Response                      516                 324   
Late Response                                 46                  65   
Very Early Response                           16                 377   
Very Late Response                             1                   6   

actual                   Late Response  On Time Response  Early Response  \
log3_pred                                                                  
Early Response                    4255             32451          178831   
Extremely Late Response              0                 0               0   
Late Response                      152                24              19   
Very Early Response               1419               724            

In [59]:
# calculate accuracy of log3 on validate

print(' Log3 Accuracy: {:.5%}'.format(accuracy_score(y_validate.actual, y_validate.log3_pred)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(y_validate.log3_pred, y_validate.actual))
print('---')
print(classification_report(y_validate.actual, y_validate.log3_pred))

 Log3 Accuracy: 81.73110%
---
Confusion Matrix
actual                   Extremely Late Response  Very Late Response  \
log3_pred                                                              
Early Response                                26                 532   
Extremely Late Response                      219                 136   
Late Response                                 23                  37   
Very Early Response                            4                 159   
Very Late Response                             0                   2   

actual                   Late Response  On Time Response  Early Response  \
log3_pred                                                                  
Early Response                    1846             13895           76641   
Extremely Late Response              0                 0               0   
Late Response                       53                 7               1   
Very Early Response                602               327            

#### Takeaways

- This model beats the baseline 

- Log1 beats this model by mere decimals on validate however

### Model 4: Logistic Regression - Lower C + Balanced Weight / Chosen Features 

In [60]:
# make logistic regression object
log4 = LogisticRegression(class_weight='balanced', C = .1, random_state=123)

In [61]:
# fit logistic regression object to data
log4.fit(train_scaled, y_train.actual)

LogisticRegression(C=0.1, class_weight='balanced', random_state=123)

In [62]:
#evaluate feature importance
print('Coefficient: \n', log4.coef_)
print('Intercept: \n', log4.intercept_)

Coefficient: 
 [[-9.76352868e+00  1.04710179e+00 -1.61665740e+00 -1.97988647e+00
  -1.65125855e-01  1.36885336e+00 -2.46241743e-01  1.37111887e+00
   8.68115500e-01  3.77057088e-01  9.61521684e-01  7.77769820e-02
  -1.00875067e+00  9.93787153e-01 -5.77135942e-01  2.75480414e-01
  -2.66426420e-03  4.98741348e-02  4.81514416e-02  6.03711812e-02
   7.63970455e-02  3.68626281e-01  3.33480479e-01  2.50581705e-01
   1.41172669e-01]
 [ 2.16693345e+01 -8.97359651e-01 -5.49616328e-01 -6.35958184e-01
  -2.48807598e+00 -2.98792480e-02 -8.91709993e-04 -1.21432781e+00
  -4.75909932e-01  3.04812239e+00 -9.23786013e-01 -1.53631831e+00
  -3.64671026e-01 -4.19121100e-01 -2.85549430e-01 -2.37815295e-01
  -3.76512821e-01 -7.13761620e-01 -1.70832053e-01  4.24497345e-01
   2.89313466e-02 -3.20697636e-01 -3.49558030e-01 -1.08727106e+00
  -1.55327201e-01]
 [-1.05943592e+00 -2.22261219e+00  1.13216475e+00  4.57279598e+00
  -1.03531012e+00 -1.97137671e-01 -6.83179706e-02  9.52148293e-01
   1.37172501e-02 -2.93

In [63]:
# make predictions
y_train['log4_pred'] = log4.predict(train_scaled)
y_validate['log4_pred'] = log4.predict(validate_scaled)
y_test['log4_pred'] = log4.predict(test_scaled)

In [64]:
y_train.head()

Unnamed: 0,actual,baseline,log1_pred,log2_pred,log3_pred,log4_pred
260102,Early Response,Early Response,Early Response,Early Response,Early Response,Early Response
202138,Early Response,Early Response,Early Response,Early Response,Early Response,Early Response
210324,Early Response,Early Response,Early Response,Early Response,Early Response,Early Response
182502,Early Response,Early Response,Early Response,Late Response,Early Response,Late Response
219960,Early Response,Early Response,Early Response,Early Response,Early Response,Early Response


- Evaluate Model

In [65]:
# calculate accuracy of log34on train

print(' Log4 Accuracy: {:.5%}'.format(accuracy_score(y_train.actual, y_train.log4_pred)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(y_train.log4_pred, y_train.actual))
print('---')
print(classification_report(y_train.actual, y_train.log4_pred))

 Log4 Accuracy: 58.78451%
---
Confusion Matrix
actual                    Extremely Late Response  Very Late Response  \
log4_pred                                                               
Early Response                                 11                  79   
Extremely Early Response                        0                   0   
Extremely Late Response                       597                 432   
Late Response                                   0                 520   
On Time Response                                0                  39   
Very Early Response                            13                 327   
Very Late Response                             14                 624   

actual                    Late Response  On Time Response  Early Response  \
log4_pred                                                                   
Early Response                      678             12865          105738   
Extremely Early Response              6                39       

In [66]:
# calculate accuracy of log4 on validate

print(' Log4 Accuracy: {:.5%}'.format(accuracy_score(y_validate.actual, y_validate.log4_pred)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(y_validate.log4_pred, y_validate.actual))
print('---')
print(classification_report(y_validate.actual, y_validate.log4_pred))

 Log4 Accuracy: 58.81821%
---
Confusion Matrix
actual                    Extremely Late Response  Very Late Response  \
log4_pred                                                               
Early Response                                  7                  36   
Extremely Early Response                        0                   0   
Extremely Late Response                       255                 181   
Late Response                                   0                 238   
On Time Response                                0                  23   
Very Early Response                             2                 139   
Very Late Response                              8                 249   

actual                    Late Response  On Time Response  Early Response  \
log4_pred                                                                   
Early Response                      290              5606           45416   
Extremely Early Response              2                19       

#### Takeaways

- This model performed very poorly, did not beat baseline

### Model 5: Logistic Regression - Adjusted Weight / Chosen Features

In [69]:
# make logistic regression object
log5 = LogisticRegression(C = 1, class_weight = {0:1, 1:99}, random_state=123)

In [72]:
y_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 240167 entries, 260102 to 33841
Data columns (total 6 columns):
 #   Column     Non-Null Count   Dtype   
---  ------     --------------   -----   
 0   actual     240167 non-null  category
 1   baseline   240167 non-null  object  
 2   log1_pred  240167 non-null  object  
 3   log2_pred  240167 non-null  object  
 4   log3_pred  240167 non-null  object  
 5   log4_pred  240167 non-null  object  
dtypes: category(1), object(5)
memory usage: 11.2+ MB


In [73]:
# fit logistic regression object to data
log5.fit(train_scaled, y_train.actual)

TypeError: '<' not supported between instances of 'str' and 'int'