- environment setup

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# ignore warnings
import warnings
warnings.filterwarnings("ignore")

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score

In [3]:
import wrangle

from wrangle import clean_311, split_separate_scale

In [4]:
def keep_info(df):
    df.drop(df.columns.difference(['dept','call_reason', 'source_id',
                                   'council_district', 'resolution_days_due']), 1, inplace=True)
    return df

# Model Prep

def dummy_dept(df):
    # dummy dept feature
    dummy_df =  pd.get_dummies(df['dept'])
    # Name the new columns
    dummy_df.columns = ['animal_care_services', 'code_enforcement_services', 
                        'customer_services', 'development_services', 
                        'metro_health', 'parks_and_rec',
                        'solid_waste_management', 'trans_and_cap_improvements', 
                        'unknown_dept']
    # add the dummies to the data frame
    df = pd.concat([df, dummy_df], axis=1)
    return df
    
def dummy_call_reason(df):
    # dummy dept feature
    dummy_df =  pd.get_dummies(df['call_reason'])
    # Name the new columns
    dummy_df.columns = ['buildings', 'business', 'cleanup', 'code',
                        'customer_service', 'field', 'land',
                        'license', 'misc', 'storm', 'streets', 'trades', 
                        'traffic', 'waste']
    # add the dummies to the data frame
    df = pd.concat([df, dummy_df], axis=1)
    return df

def make_source_id_dummies(df):
    '''This function takes in the cleaned dataframe, makes dummy variables of the source id column, readds the names of the
    dummy columns and returns the concatenated dummy dataframe to the original dataframe.'''
    #make dummies
    dummy_df = pd.get_dummies(df['source_id'])
    #add back column names
    dummy_df.columns = ['web_portal', '311_mobile_app', 
                        'constituent_call', 'interal_services_requests']
    # concatenate dummies to the cleaned data frame
    df = pd.concat([df, dummy_df], axis=1)
    return df

def create_dummies(df):
    '''This function creates dummy variables for Council Districts'''
    # set what we are going to create these dummies from
    dummy_df =  pd.get_dummies(df['council_district'])
    # Name the new columns
    dummy_df.columns = ['district_0', 'district_1', 'district_2', 
                        'district_3', 'district_4', 'district_5',
                        'district_6', 'district_7', 'district_8',
                        'district_9', 'district_10']
    # add the dummies to the data frame
    df = pd.concat([df, dummy_df], axis=1)
    return df

def make_source_id_dummies(df):
    '''This function takes in the cleaned dataframe, makes dummy variables of the source id column, readds the names of the
    dummy columns and returns the concatenated dummy dataframe to the original dataframe.'''
    #make dummies
    dummy_df = pd.get_dummies(df['source_id'])
    #add back column names
    dummy_df.columns = ['web_portal', '311_mobile_app', 'constituent_call', 'internal_services_requests']
    # concatenate dummies to the cleaned data frame
    df = pd.concat([df, dummy_df], axis=1)
    return df


def model_df():
    '''This function reads in the clean 311 dataframe, applies all of the above functions to prepare it for modeling. 
    The function then returns a cleaned dataframe ready for modeling.'''
    df= wrangle.clean_311(wrangle.get_311_data())
    df= keep_info(df)
    df= dummy_dept(df)
    df= dummy_call_reason(df)
    df= make_source_id_dummies(df)

    return df

In [5]:
pd.set_option("display.max_rows", None, "display.max_columns", None)

- Wrangle 

In [6]:
# acquire data
df = pd.read_csv('allservicecalls.csv')

In [7]:
df.head()

Unnamed: 0,Category,CASEID,OPENEDDATETIME,SLA_Date,CLOSEDDATETIME,Late (Yes/No),Dept,REASONNAME,TYPENAME,CaseStatus,SourceID,OBJECTDESC,Council District,XCOORD,YCOORD,Report Starting Date,Report Ending Date
0,Graffiti,1010444245,2012-08-15T00:00:00,2012-08-30T00:00:00,,YES,Code Enforcement Services,Graffiti,Graffiti Public Property,Open,Web Portal,"600 NOGALITOS ST, San Antonio, 78204",5,2125683.0,13695548.0,2020-05-15T00:00:00,2021-05-15T00:00:00
1,Property Maintenance,1010888252,2013-06-06T00:00:00,2013-08-09T00:00:00,,YES,Code Enforcement Services,Code Enforcement (IntExp),Alley-Way Maintenance,Open,Web Portal,"6043 CASTLE QUEEN, San Antonio, 78218",2,2169702.0,13725769.0,2020-05-15T00:00:00,2021-05-15T00:00:00
2,Property Maintenance,1010966128,2013-07-19T00:00:00,2013-09-23T00:00:00,,YES,Code Enforcement Services,Code Enforcement (IntExp),Junk Vehicle On Private Property,Open,Web Portal,"842 KIRK PL, San Antonio, 78226",5,2116192.0,13692260.0,2020-05-15T00:00:00,2021-05-15T00:00:00
3,Property Maintenance,1011052825,2013-09-16T00:00:00,2013-09-30T00:00:00,,YES,Code Enforcement Services,Code Enforcement,Right Of Way/Sidewalk Obstruction,Open,Internal Services Requests,"54 KENROCK RIDGE, San Antonio, 78254",7,2082242.0,13737817.0,2020-05-15T00:00:00,2021-05-15T00:00:00
4,Property Maintenance,1011052826,2013-09-16T00:00:00,2013-09-30T00:00:00,,YES,Code Enforcement Services,Code Enforcement,Right Of Way/Sidewalk Obstruction,Open,Internal Services Requests,"74 KENROCK RIDGE, San Antonio, 78254",7,2082389.0,13737877.0,2020-05-15T00:00:00,2021-05-15T00:00:00


In [8]:
df.shape

(495440, 17)

In [9]:
# clean data
df = clean_311(df)

In [10]:
df = dummy_dept(df)

In [11]:
df = make_source_id_dummies(df)

In [12]:
df = dummy_call_reason(df)

In [13]:
df.head()

Unnamed: 0,case_id,open_date,due_date,closed_date,is_late,dept,call_reason,case_type,case_status,source_id,address,council_district,longitude,latitude,days_open,resolution_days_due,days_before_or_after_due,level_of_delay,district_0,district_1,district_2,district_3,district_4,district_5,district_6,district_7,district_8,district_9,district_10,zipcode,animal_care_services,code_enforcement_services,customer_services,development_services,metro_health,parks_and_rec,solid_waste_management,trans_and_cap_improvements,unknown_dept,web_portal,311_mobile_app,constituent_call,internal_services_requests,buildings,business,cleanup,code,customer_service,field,land,license,misc,storm,streets,trades,traffic,waste
2497,1014281655,2018-02-28,2018-07-09,2020-06-06,YES,Code Enforcement Services,code,Certificates of Occupancy,Closed,Internal Services Requests,"5550 EASTERLING, San Antonio, 78251",6,2067118.0,13723868.0,829.0,131,-698.0,Extremely Late Response,0,0,0,0,0,0,1,0,0,0,0,78251,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2498,1014281665,2018-02-28,2018-07-09,2020-06-06,YES,Code Enforcement Services,code,Zoning: District Requirement,Closed,Internal Services Requests,"5550 EASTERLING, San Antonio, 78251",6,2067118.0,13723868.0,829.0,131,-698.0,Extremely Late Response,0,0,0,0,0,0,1,0,0,0,0,78251,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2499,1014281677,2018-02-28,2018-07-09,2020-06-06,YES,Code Enforcement Services,code,Certificates of Occupancy,Closed,Internal Services Requests,"5550 EASTERLING, San Antonio, 78251",6,2067118.0,13723868.0,829.0,131,-698.0,Extremely Late Response,0,0,0,0,0,0,1,0,0,0,0,78251,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2556,1014300321,2018-03-06,2020-09-24,2020-05-18,NO,Development Services,buildings,Street Light Existing Res Sub Div,Closed,Constituent Call,"1018 KING AVE, San Antonio, 78211",4,2108015.0,13680344.0,804.0,933,129.0,Very Early Response,0,0,0,0,1,0,0,0,0,0,0,78211,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2657,1014326068,2018-03-14,2018-07-23,2020-05-27,YES,Code Enforcement Services,code,Zoning: District Requirement,Closed,Internal Services Requests,"11800 FISCHER RD, San Antonio, 78073",4,2091672.0,13658660.0,805.0,131,-674.0,Extremely Late Response,0,0,0,0,1,0,0,0,0,0,0,78073,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0


In [14]:
df.shape

(428870, 57)

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 428870 entries, 2497 to 495432
Data columns (total 57 columns):
 #   Column                      Non-Null Count   Dtype         
---  ------                      --------------   -----         
 0   case_id                     428870 non-null  int64         
 1   open_date                   428870 non-null  datetime64[ns]
 2   due_date                    428870 non-null  datetime64[ns]
 3   closed_date                 428870 non-null  datetime64[ns]
 4   is_late                     428870 non-null  object        
 5   dept                        428870 non-null  object        
 6   call_reason                 428870 non-null  object        
 7   case_type                   428870 non-null  object        
 8   case_status                 428870 non-null  object        
 9   source_id                   428870 non-null  object        
 10  address                     428870 non-null  object        
 11  council_district            428870 n

In [16]:
train, validate, test, X_train, y_train, X_validate, y_validate, X_test, y_test, train_scaled, validate_scaled, test_scaled = split_separate_scale(df)

In [17]:
train.head()

Unnamed: 0,case_id,open_date,due_date,closed_date,is_late,dept,call_reason,case_type,case_status,source_id,address,council_district,longitude,latitude,days_open,resolution_days_due,days_before_or_after_due,level_of_delay,district_0,district_1,district_2,district_3,district_4,district_5,district_6,district_7,district_8,district_9,district_10,zipcode,animal_care_services,code_enforcement_services,customer_services,development_services,metro_health,parks_and_rec,solid_waste_management,trans_and_cap_improvements,unknown_dept,web_portal,311_mobile_app,constituent_call,internal_services_requests,buildings,business,cleanup,code,customer_service,field,land,license,misc,storm,streets,trades,traffic,waste
260102,1016484102,2020-10-30,2020-11-04,2020-10-31,NO,Solid Waste Management,waste,No Pickup,Closed,Web Portal,"1302 W HARDING BLVD, SAN ANTONIO, 78221",3,2122015.0,13675610.0,1.0,5,4.0,Early Response,0,0,0,1,0,0,0,0,0,0,0,78221.0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1
202138,1016390561,2020-09-10,2020-09-17,2020-09-14,NO,Solid Waste Management,customer_service,Customer Service(Complaint),Closed,Web Portal,"3418 LONE VALLEY, SAN ANTONIO, 78247",10,2149891.0,13757711.0,4.0,7,3.0,Early Response,0,0,0,0,0,0,0,0,0,0,1,78247.0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0
210324,1016401694,2020-09-17,2020-09-19,2020-09-17,NO,Solid Waste Management,misc,Dead Animal Pick Up,Closed,Web Portal,"1903 SAUNDERS, SAN ANTONIO, 78207",5,2121436.0,13702209.0,0.0,2,2.0,Early Response,0,0,0,0,0,1,0,0,0,0,0,78207.0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0
182502,1016364931,2020-08-25,2020-10-28,2020-08-31,NO,Development Services,code,Overgrown Yard/Trash,Closed,Web Portal,"615 CLEARCREST, SAN ANTONIO, 78227",6,2081335.0,13704368.0,6.0,64,58.0,Early Response,0,0,0,0,0,0,1,0,0,0,0,78227.0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
219960,1016416005,2020-09-25,2020-09-29,2020-09-25,NO,Unknown,traffic,Traffic Signals (Maintenance_Emergency),Closed,Web Portal,BANDERA RD and MAINLAND DR,7,2084733.0,13735038.0,0.0,4,4.0,Early Response,0,0,0,0,0,0,0,1,0,0,0,,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0


In [18]:
train.shape

(240167, 57)

In [19]:
train_scaled.shape

(240167, 45)

In [20]:
y_train.shape

(240167,)

# Establish Baseline

In [21]:
# look at values of target variable

y_train.value_counts()

Early Response              181108
On Time Response             33203
Very Early Response          17279
Late Response                 5839
Very Late Response            2021
Extremely Late Response        635
Extremely Early Response        82
Name: level_of_delay, dtype: int64

In [22]:
# set up as dataframes
y_train = pd.DataFrame(dict(actual=y_train))
y_validate = pd.DataFrame(dict(actual=y_validate))
y_test = pd.DataFrame(dict(actual=y_test))

In [23]:
# 'Early Response' is by far the most frequent value so that will be our baseline
y_train['baseline'] = 'Early Response'

In [24]:
y_train.head()

Unnamed: 0,actual,baseline
260102,Early Response,Early Response
202138,Early Response,Early Response
210324,Early Response,Early Response
182502,Early Response,Early Response
219960,Early Response,Early Response


In [25]:
# calculate accuracy of baseline

print(' Baseline Accuracy: {:.2%}'.format(accuracy_score(y_train.actual, y_train.baseline)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(y_train.baseline, y_train.actual))
print('---')
print(classification_report(y_train.actual, y_train.baseline))

 Baseline Accuracy: 75.41%
---
Confusion Matrix
actual          Extremely Late Response  Very Late Response  Late Response  \
baseline                                                                     
Early Response                      635                2021           5839   

actual          On Time Response  Early Response  Very Early Response  \
baseline                                                                
Early Response             33203          181108                17279   

actual          Extremely Early Response  
baseline                                  
Early Response                        82  
---
                          precision    recall  f1-score   support

          Early Response       0.75      1.00      0.86    181108
Extremely Early Response       0.00      0.00      0.00        82
 Extremely Late Response       0.00      0.00      0.00       635
           Late Response       0.00      0.00      0.00      5839
        On Time Response       0.

#### Takeaways

- Baseline of 'Early Response' is correct about 75% of the time

- This could be a difficult baseline to beat 

# Modeling

In [26]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 240167 entries, 260102 to 33841
Data columns (total 57 columns):
 #   Column                      Non-Null Count   Dtype         
---  ------                      --------------   -----         
 0   case_id                     240167 non-null  int64         
 1   open_date                   240167 non-null  datetime64[ns]
 2   due_date                    240167 non-null  datetime64[ns]
 3   closed_date                 240167 non-null  datetime64[ns]
 4   is_late                     240167 non-null  object        
 5   dept                        240167 non-null  object        
 6   call_reason                 240167 non-null  object        
 7   case_type                   240167 non-null  object        
 8   case_status                 240167 non-null  object        
 9   source_id                   240167 non-null  object        
 10  address                     240167 non-null  object        
 11  council_district            240167 

In [27]:
# drop columns that contain the answer to prevent answer leaking

train = train.drop(columns = ['closed_date', 'is_late', 'days_before_or_after_due', 'days_open', 'case_status'])
validate = validate.drop(columns = ['closed_date', 'is_late', 'days_before_or_after_due', 'days_open', 'case_status'])
test = test.drop(columns = ['closed_date', 'is_late', 'days_before_or_after_due', 'days_open', 'case_status'])

X_train = X_train.drop(columns = ['closed_date', 'is_late', 'days_before_or_after_due', 'days_open', 'case_status'])
X_validate = X_validate.drop(columns = ['closed_date', 'is_late', 'days_before_or_after_due', 'days_open', 'case_status'])
X_test = X_test.drop(columns = ['closed_date', 'is_late', 'days_before_or_after_due', 'days_open', 'case_status'])

train_scaled = train_scaled.drop(columns = ['days_before_or_after_due', 'days_open'])
validate_scaled = validate_scaled.drop(columns = ['days_before_or_after_due', 'days_open'])
test_scaled = test_scaled.drop(columns = ['days_before_or_after_due', 'days_open'])

In [28]:
train_scaled.head()

Unnamed: 0,case_id,council_district,longitude,latitude,resolution_days_due,district_0,district_1,district_2,district_3,district_4,district_5,district_6,district_7,district_8,district_9,district_10,animal_care_services,code_enforcement_services,customer_services,development_services,metro_health,parks_and_rec,solid_waste_management,trans_and_cap_improvements,unknown_dept,web_portal,311_mobile_app,constituent_call,internal_services_requests,buildings,business,cleanup,code,customer_service,field,land,license,misc,storm,streets,trades,traffic,waste
0,0.850745,0.3,0.446899,0.334668,0.009381,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.814613,1.0,0.582167,0.723116,0.013133,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.818913,0.5,0.44409,0.460517,0.003752,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0.804713,0.6,0.2495,0.470732,0.120075,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.824441,0.7,0.265989,0.615842,0.007505,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [29]:
# make df of only chosen features for model

model_train = keep_info(X_train)
model_validate = keep_info(X_validate)
model_test = keep_info(X_test)

In [30]:
model_train.head()

Unnamed: 0,dept,call_reason,source_id,council_district,resolution_days_due
260102,Solid Waste Management,waste,Web Portal,3,5
202138,Solid Waste Management,customer_service,Web Portal,10,7
210324,Solid Waste Management,misc,Web Portal,5,2
182502,Development Services,code,Web Portal,6,64
219960,Unknown,traffic,Web Portal,7,4


In [31]:
model_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 240167 entries, 260102 to 33841
Data columns (total 5 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   dept                 240167 non-null  object
 1   call_reason          240167 non-null  object
 2   source_id            240167 non-null  object
 3   council_district     240167 non-null  int64 
 4   resolution_days_due  240167 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 11.0+ MB


In [32]:
model_train.shape

(240167, 5)

### Model 1: Logistic Regression - Default Parameters / All Features

In [33]:
# make logistic regression object
log1 = LogisticRegression(random_state=123)

In [34]:
# fit logistic regression object to data
log1.fit(train_scaled, y_train.actual)

LogisticRegression(random_state=123)

In [35]:
# evaluate feature importance
print('Coefficient: \n', log1.coef_)
print('Intercept: \n', log1.intercept_)

Coefficient: 
 [[ 8.60577651e+00 -2.86886139e-02  3.70772026e-03  3.12116202e-01
  -1.09590725e+01 -6.15362792e-01  1.91253061e-01 -1.46487504e-01
  -2.09293381e-01 -1.87130656e-01 -1.20848602e-01 -2.38245517e-01
   1.48819160e-01  7.20023016e-02  7.46992181e-02  9.34909194e-02
   1.90851772e+00 -1.77615602e+00 -5.82637679e+00 -8.99772797e-01
   1.39445364e+00 -1.84285552e-01  3.58796645e+00  4.04442875e-01
   4.54106678e-01  4.31643210e-01 -1.34572126e-01 -1.98536088e+00
   7.51186005e-01 -6.02128473e-01 -4.00623475e-02  6.98252024e-03
   1.25928657e+00 -3.03422988e+00 -2.33480327e-01 -7.85588442e-01
   8.42048807e-01  1.46166865e-01  1.88409633e+00  4.20555344e-01
   3.23450841e-01  7.99092636e-01 -1.92329424e+00]
 [-9.66959295e-01 -3.68964570e-01 -8.48760203e-01 -8.05535877e-01
   6.32264831e+00 -5.70927210e-02 -5.34147750e-01 -2.76087806e-01
  -7.02352961e-01 -1.85831569e-01  1.24839276e-01  2.25158803e-01
  -1.53339878e-01 -8.76896056e-02 -1.45378886e-01  1.35521958e-01
  -7.19240

In [36]:
# make predictions
y_train['log1_pred'] = log1.predict(train_scaled)
y_validate['log1_pred'] = log1.predict(validate_scaled)
y_test['log1_pred'] = log1.predict(test_scaled)

In [37]:
y_train.head()

Unnamed: 0,actual,baseline,log1_pred
260102,Early Response,Early Response,Early Response
202138,Early Response,Early Response,Early Response
210324,Early Response,Early Response,Early Response
182502,Early Response,Early Response,Early Response
219960,Early Response,Early Response,Early Response


- Evaluate Model

In [38]:
# calculate accuracy of log1 on train

print(' Log1 Accuracy: {:.5%}'.format(accuracy_score(y_train.actual, y_train.log1_pred)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(y_train.log1_pred, y_train.actual))
print('---')
print(classification_report(y_train.actual, y_train.log1_pred))

 Log1 Accuracy: 81.89468%
---
Confusion Matrix
actual                    Extremely Late Response  Very Late Response  \
log1_pred                                                               
Early Response                                  8                 876   
Extremely Early Response                        0                   0   
Extremely Late Response                       446                 325   
Late Response                                  83                 683   
On Time Response                                0                   4   
Very Early Response                             0                   9   
Very Late Response                             98                 124   

actual                    Late Response  On Time Response  Early Response  \
log1_pred                                                                   
Early Response                     4071             32294          178699   
Extremely Early Response              0                 2       

In [39]:
# calculate accuracy of log1 on validate

print(' Log1 Accuracy: {:.5%}'.format(accuracy_score(y_validate.actual, y_validate.log1_pred)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(y_validate.log1_pred, y_validate.actual))
print('---')
print(classification_report(y_validate.actual, y_validate.log1_pred))

 Log1 Accuracy: 81.89140%
---
Confusion Matrix
actual                    Extremely Late Response  Very Late Response  \
log1_pred                                                               
Early Response                                  5                 376   
Extremely Early Response                        0                   0   
Extremely Late Response                       187                 137   
Late Response                                  39                 285   
On Time Response                                0                   3   
Very Early Response                             0                   5   
Very Late Response                             41                  60   

actual                    Late Response  On Time Response  Early Response  \
log1_pred                                                                   
Early Response                     1763             13839           76591   
Extremely Early Response              0                 2       

#### Takeaways

- This model performs moderately better than baseline

- performs worse on validate than on train, but the difference in negligable 

### Model 2: Logistic Regresson - Default Parameters / Chosen Features 

In [40]:
model_train.head()

Unnamed: 0,dept,call_reason,source_id,council_district,resolution_days_due
260102,Solid Waste Management,waste,Web Portal,3,5
202138,Solid Waste Management,customer_service,Web Portal,10,7
210324,Solid Waste Management,misc,Web Portal,5,2
182502,Development Services,code,Web Portal,6,64
219960,Unknown,traffic,Web Portal,7,4


In [41]:
model_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 240167 entries, 260102 to 33841
Data columns (total 5 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   dept                 240167 non-null  object
 1   call_reason          240167 non-null  object
 2   source_id            240167 non-null  object
 3   council_district     240167 non-null  int64 
 4   resolution_days_due  240167 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 11.0+ MB


In [42]:
model_train = dummy_dept(model_train)

In [43]:
model_train = dummy_call_reason(model_train)

In [44]:
model_train = create_dummies(model_train)

In [45]:
model_train = make_source_id_dummies(model_train)

In [46]:
model_train = model_train.drop(columns = ['dept', 'call_reason', 'council_district', 'source_id'])

In [47]:
model_train.head()

Unnamed: 0,resolution_days_due,animal_care_services,code_enforcement_services,customer_services,development_services,metro_health,parks_and_rec,solid_waste_management,trans_and_cap_improvements,unknown_dept,buildings,business,cleanup,code,customer_service,field,land,license,misc,storm,streets,trades,traffic,waste,district_0,district_1,district_2,district_3,district_4,district_5,district_6,district_7,district_8,district_9,district_10,web_portal,311_mobile_app,constituent_call,internal_services_requests
260102,5,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1
202138,7,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1
210324,2,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1
182502,64,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1
219960,4,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1
