# Begin 2nd Iteration

In [1]:
import wrangle as w
import model as m
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score

## Model.py (customized)

In [2]:
import pandas as pd
import numpy as np
import wrangle
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import SGDClassifier
from sklearn import preprocessing

#-----------------------------------------------------------------------------

# Model Prep


def dummy_dept(df):
    '''
    One-hot encode the `dept` column and append the indicator columns.

    Parameters:
        df: DataFrame containing a `dept` column.

    Returns:
        A new DataFrame: the original columns followed by one indicator
        column per department. The input frame is not modified.

    Raises:
        ValueError: if `dept` does not produce exactly nine indicator columns,
        because the columns are renamed by position.
    '''
    dept_names = [
        'animal_care_services',
        'code_enforcement_services',
        'customer_services',
        'development_services',
        'metro_health',
        'parks_and_rec',
        'solid_waste_management',
        'trans_and_cap_improvements',
        'unknown_dept',
    ]
    encoded = pd.get_dummies(df['dept'])
    # Positional rename: the i-th indicator column receives the i-th name above
    encoded.columns = dept_names
    return pd.concat([df, encoded], axis=1)
#-----------------------------------------------------------------------------    
def dummy_call_reason(df):
    '''
    One-hot encode the `call_reason` column and append the indicator columns.

    Parameters:
        df: DataFrame containing a `call_reason` column.

    Returns:
        A new DataFrame: the original columns followed by one indicator
        column per call reason. The input frame is not modified.

    Raises:
        ValueError: if `call_reason` does not produce exactly fourteen
        indicator columns, because the columns are renamed by position.
    '''
    reason_names = [
        'buildings', 'business', 'cleanup', 'code',
        'customer_service', 'field', 'land',
        'license', 'misc', 'storm', 'streets', 'trades',
        'traffic', 'waste',
    ]
    encoded = pd.get_dummies(df['call_reason'])
    # Positional rename: the i-th indicator column receives the i-th name above
    encoded.columns = reason_names
    return pd.concat([df, encoded], axis=1)
#-----------------------------------------------------------------------------
def make_source_id_dummies(df):
    '''
    One-hot encode the `source_id` column and append the indicator columns.

    Parameters:
        df: DataFrame containing a `source_id` column.

    Returns:
        A new DataFrame: the original columns followed by four indicator
        columns. The input frame is not modified.

    Raises:
        ValueError: if `source_id` does not produce exactly four indicator
        columns, because the columns are renamed by position.
    '''
    # NOTE(review): names are assigned by position; confirm this order matches
    # the column order get_dummies produces for the actual source_id values.
    source_names = [
        'web_portal',
        '311_mobile_app',
        'constituent_call',
        'internal_services_requests',
    ]
    encoded = pd.get_dummies(df['source_id'])
    encoded.columns = source_names
    return pd.concat([df, encoded], axis=1)

#-------------------------------
#zipcode dummies
def dummy_zipcodes(df):
    '''
    One-hot encode the `zipcode` column and append the indicator columns.

    The indicator columns keep the zipcode values themselves as labels.
    Returns a new DataFrame; the input frame is not modified.
    '''
    zip_indicators = pd.get_dummies(df['zipcode'])
    return pd.concat([df, zip_indicators], axis=1)

#-------------------------------
def keep_info(df):
    '''
    Reduce the dataframe to the columns used for modeling.

    Every column not named in the keep-list below is dropped. Names in the
    keep-list that are absent from `df` are simply ignored.

    Parameters:
        df: DataFrame to trim. It is modified in place.

    Returns:
        The same DataFrame object, after trimming.
    '''
    keep_cols = [
        'dept', 'call_reason', 'source_id', 'level_of_delay',
        'council_district', 'resolution_days_due',
        'district_0', 'district_1', 'district_2', 'district_3',
        'district_4', 'district_5', 'district_6', 'district_7',
        'district_8', 'district_9', 'district_10',
        'per_capita_income', 'zipcode',
    ]
    # Use the `columns=` keyword: passing the axis positionally is rejected
    # by pandas 2.x (and deprecated since 1.3).
    df.drop(columns=df.columns.difference(keep_cols), inplace=True)
    return df

#--------------------------------
def model_df():
    '''
    Build the modeling dataframe from the 311 data.

    Loads the data with wrangle.get_311_data, cleans it with
    wrangle.clean_311, trims it with keep_info, then appends the dept,
    call_reason, source_id and zipcode indicator columns in that order.

    Returns:
        The resulting DataFrame.
    '''
    cleaned = wrangle.clean_311(wrangle.get_311_data())
    return (
        keep_info(cleaned)
        .pipe(dummy_dept)
        .pipe(dummy_call_reason)
        .pipe(make_source_id_dummies)
        .pipe(dummy_zipcodes)
    )
#--------------------------------------------------------------------------------------------------------------------------------------------------------------------------

def split(df, stratify_by= 'level_of_delay'):
    """
    Split a dataframe into train, validate and test sets.

    20% of the rows are held out as test; 30% of the remaining rows become
    validate (24% of the total), leaving 56% for train. Both splits use
    random_state=319.

    Parameters:
        df: DataFrame to split.
        stratify_by: name of the column to stratify on, or None for a
            purely random split.

    Returns:
        (train, validate, test) DataFrames.
    """
    # stratify=None is train_test_split's default, so one call covers both cases
    strata = None if stratify_by is None else df[stratify_by]
    train, test = train_test_split(df, test_size=.2, random_state=319, stratify=strata)

    # Re-derive the strata from the reduced train frame for the second split
    strata = None if stratify_by is None else train[stratify_by]
    train, validate = train_test_split(train, test_size=.3, random_state=319, stratify=strata)
    return train, validate, test


#------------------------------------
def separate_y(train, validate, test):
    '''
    Separate the `level_of_delay` target from the features of each split.

    Parameters:
        train, validate, test: DataFrames that each contain a
            `level_of_delay` column.

    Returns:
        (X_train, y_train, X_validate, y_validate, X_test, y_test), where each
        X is the frame without `level_of_delay` and each y is that column as
        a pandas Series.
    '''
    target = 'level_of_delay'
    pieces = []
    for frame in (train, validate, test):
        pieces.append(frame.drop(columns=[target]))
        pieces.append(frame[target])
    return tuple(pieces)
#------------------------------------
def scale_data(X_train, X_validate, X_test):
    '''
    Min-max scale the feature frames after they have been split.

    The scaler is fit on X_train only and then applied to all three frames.
    Every column is passed to the scaler.

    Parameters:
        X_train, X_validate, X_test: feature DataFrames sharing X_train's
            column layout.

    Returns:
        (train_scaled, validate_scaled, test_scaled) DataFrames labelled with
        X_train's column names and each input frame's own row index, so the
        scaled rows stay aligned with the matching y series.
    '''
    scaler = preprocessing.MinMaxScaler()

    # Fit on the training data only, so validate/test do not influence the scaling
    scaler.fit(X_train)

    def _scaled_frame(features):
        # transform() returns a bare numpy array; restore column and row labels
        return pd.DataFrame(
            scaler.transform(features),
            columns=X_train.columns,
            index=features.index,
        )

    return _scaled_frame(X_train), _scaled_frame(X_validate), _scaled_frame(X_test)

#------------------------------------

def split_separate_scale(df, stratify_by= 'level_of_delay'):
    '''
    Split a dataframe, separate the target, and scale the features.

    Parameters:
        df: DataFrame containing the `level_of_delay` target column.
        stratify_by: column name passed to split() for stratification, or
            None for an unstratified split.

    Returns:
        A 12-tuple:
        (train, validate, test,
         X_train, y_train, X_validate, y_validate, X_test, y_test,
         train_scaled, validate_scaled, test_scaled)
    '''
    # Forward the caller's stratify_by instead of hard-coding the column,
    # so stratify_by=None really produces an unstratified split.
    train, validate, test = split(df, stratify_by=stratify_by)

    # Separate the target variable from each split
    X_train, y_train, X_validate, y_validate, X_test, y_test = separate_y(train, validate, test)

    # Scale the feature frames (scaler is fit on X_train inside scale_data)
    train_scaled, validate_scaled, test_scaled = scale_data(X_train, X_validate, X_test)

    return (
        train, validate, test,
        X_train, y_train,
        X_validate, y_validate,
        X_test, y_test,
        train_scaled, validate_scaled, test_scaled,
    )

In [3]:
# Fetch the 311 data with wrangle and run it through wrangle's clean_311
df = w.clean_311(w.get_311_data())

  df['open_week'] = df.open_date.dt.week


In [4]:
#pd.set_option("display.max_rows", None, "display.max_columns", None)

In [5]:
# Count missing values in each column
df.isna().sum()

open_date                       0
due_date                        0
closed_date                 43182
is_late                         0
dept                            0
call_reason                     0
case_type                       0
case_status                     0
source_id                       0
address                         0
council_district                0
longitude                       0
latitude                        0
days_open                       0
resolution_days_due             0
days_before_or_after_due    43182
pct_time_of_used                0
level_of_delay                  0
district_1                      0
district_2                      0
district_3                      0
district_4                      0
district_5                      0
district_6                      0
district_7                      0
district_8                      0
district_9                      0
district_10                     0
voter_turnout_2019              0
num_of_registe

In [6]:
# (rows, columns) of the cleaned frame
df.shape

(399986, 36)

In [7]:
# Drop rows with no days_before_or_after_due value.
# This mutates df in place, so outputs of earlier cells no longer reflect the current frame.
df.dropna(subset=['days_before_or_after_due'], how='all', inplace=True)

In [8]:
# Drop rows with no closed_date value (in place).
# NOTE(review): the recorded null counts are identical for closed_date and
# days_before_or_after_due, so this may remove nothing after the previous drop — confirm.
df.dropna(subset=['closed_date'], how='all', inplace=True)

In [9]:
# Re-count missing values per column after the drops
df.isna().sum()

open_date                   0
due_date                    0
closed_date                 0
is_late                     0
dept                        0
call_reason                 0
case_type                   0
case_status                 0
source_id                   0
address                     0
council_district            0
longitude                   0
latitude                    0
days_open                   0
resolution_days_due         0
days_before_or_after_due    0
pct_time_of_used            0
level_of_delay              0
district_1                  0
district_2                  0
district_3                  0
district_4                  0
district_5                  0
district_6                  0
district_7                  0
district_8                  0
district_9                  0
district_10                 0
voter_turnout_2019          0
num_of_registered_voters    0
zipcode                     0
open_month                  0
open_year                   0
open_week 

In [10]:
# (rows, columns) after dropping rows with missing values
df.shape

(356804, 36)

In [11]:
# Preview the first five rows
df.head()

Unnamed: 0,open_date,due_date,closed_date,is_late,dept,call_reason,case_type,case_status,source_id,address,...,district_9,district_10,voter_turnout_2019,num_of_registered_voters,zipcode,open_month,open_year,open_week,per_capita_income,square_miles
551,2017-01-27,2017-02-03,2020-11-19,YES,Customer Service,customer_service,Complaint,Closed,Constituent Call,"2407 WYOMING ST, San Antonio, 78203",...,0,0,0.086,67656,78203,1,2017,4,19055,59.81
552,2017-01-27,2017-02-03,2020-11-19,YES,Customer Service,customer_service,Complaint,Closed,Constituent Call,"5102 OLD PEARSALL, San Antonio, 78242",...,0,0,0.078,66370,78242,1,2017,4,18500,65.21
553,2017-02-03,2017-02-10,2020-11-19,YES,Customer Service,customer_service,Complaint,Closed,Constituent Call,"2223 HOUSTON ST E, San Antonio, 78202",...,0,0,0.086,67656,78202,2,2017,5,19055,59.81
554,2017-02-03,2017-02-10,2020-11-19,YES,Customer Service,customer_service,Complaint,Closed,Constituent Call,"2531 PEREZ, San Antonio, 78207",...,0,0,0.148,68081,78207,2,2017,5,23967,26.0
555,2017-02-28,2017-03-07,2020-11-19,YES,Customer Service,customer_service,Complaint,Closed,Constituent Call,"8002 GRISSOM RD, San Antonio, 78251",...,0,0,0.124,80007,78251,2,2017,9,23437,38.44


In [12]:
# Column dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 356804 entries, 551 to 399979
Data columns (total 36 columns):
 #   Column                    Non-Null Count   Dtype         
---  ------                    --------------   -----         
 0   open_date                 356804 non-null  datetime64[ns]
 1   due_date                  356804 non-null  datetime64[ns]
 2   closed_date               356804 non-null  datetime64[ns]
 3   is_late                   356804 non-null  object        
 4   dept                      356804 non-null  object        
 5   call_reason               356804 non-null  object        
 6   case_type                 356804 non-null  object        
 7   case_status               356804 non-null  object        
 8   source_id                 356804 non-null  object        
 9   address                   356804 non-null  object        
 10  council_district          356804 non-null  int64         
 11  longitude                 356804 non-null  float64       
 12  

In [None]:
#df.to_csv('cleanest_311.csv')

In [None]:
#zipcode dummies
#def dummy_zipcodes(df):
    #dummy = pd.get_dummies(df['zipcode'])
    #df = pd.concat([df, dummy], axis=1)
    #return df

In [None]:
#dummy = pd.get_dummies(df['zipcode'])
#dummy.head()

In [None]:
#df = pd.concat([df, dummy], axis=1)
#df.head()

In [13]:
# Build the modeling frame with the model_df defined in this notebook.
# NOTE(review): model_df reloads the data through wrangle and rebinds df, so the
# rows dropped with dropna in earlier cells are present again here — confirm that is intended.
df = model_df()

  df['open_week'] = df.open_date.dt.week


In [14]:
# Summary of the modeling frame: row count, column count and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 399986 entries, 0 to 399985
Columns: 112 entries, dept to 78288
dtypes: category(1), int64(3), object(4), uint8(104)
memory usage: 64.5+ MB


In [15]:
# Remove the raw categorical columns; model_df added indicator columns for
# dept, call_reason and source_id, and keep_info retained the district_* columns.
df = df.drop(
    columns=['dept', 'call_reason', 'source_id', 'council_district']
)

In [17]:
# Remove the raw zipcode column; its indicator columns were added by model_df
df = df.drop(columns='zipcode')

In [18]:
# Confirm the remaining columns and dtypes after dropping the raw categoricals
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 399986 entries, 0 to 399985
Columns: 107 entries, resolution_days_due to 78288
dtypes: category(1), int64(2), uint8(104)
memory usage: 49.2 MB


In [19]:
# Split df, separate the target, and scale the features in one call.
# NOTE(review): this uses the imported model module's split_separate_scale rather
# than the function of the same name defined in this notebook — confirm they match.
(
    train, validate, test,
    X_train, y_train,
    X_validate, y_validate,
    X_test, y_test,
    train_scaled, validate_scaled, test_scaled,
) = m.split_separate_scale(df)

### Baseline

In [20]:
# Class counts of the target in the training split; the most frequent class sets the baseline
train.level_of_delay.value_counts()

Extremely Early Response    142657
Very Early Response          35776
Late Response                17032
Early Response               13609
On Time Response             11761
Very Late Response            2510
Extremely Late Response        646
Name: level_of_delay, dtype: int64

In [21]:
# Baseline: always predict 'Extremely Early Response', the most frequent training class.
# Convert to a percentage before rounding so float error from `* 100`
# (e.g. 0.57 * 100 -> 56.99999999999999) cannot reach the printed value.
baseline = round((train.level_of_delay == 'Extremely Early Response').mean() * 100, 0)

print(f'The baseline accuracy is: {baseline} %')

The baseline accuracy is: 64.0 %


### Decision Tree 1 (max_depth=3)

In [22]:
# Build a depth-3 decision tree and fit it on the training split
clf = DecisionTreeClassifier(max_depth=3, random_state=123).fit(X_train, y_train)
# Predicted class labels for the training rows
y_pred = clf.predict(X_train)
# Per-class probability estimates for the training rows
y_pred_proba = clf.predict_proba(X_train)

In [23]:
# Mean accuracy of the fitted tree on the training split
print(
    'Accuracy of Decision Tree classifier on training set: {:.2f}'.format(
        clf.score(X_train, y_train)
    )
)

Accuracy of Decision Tree classifier on training set: 0.64


In [24]:
# Mean accuracy of the fitted tree on the validate split
print(
    'Accuracy of Decision Tree classifier on validate set: {:.2f}'.format(
        clf.score(X_validate, y_validate)
    )
)

Accuracy of Decision Tree classifier on validate set: 0.64


In [25]:
# Predict class labels for the validate split
y_pred = clf.predict(X_validate)

# Per-class precision/recall/F1 on validate. zero_division=0 reports 0.0 for classes
# the tree never predicts (the same numbers the default prints) without emitting
# the undefined-metric warning.
print(classification_report(y_validate, y_pred, zero_division=0))

  _warn_prf(average, modifier, msg_start, len(result))


                          precision    recall  f1-score   support

          Early Response       0.00      0.00      0.00      5833
Extremely Early Response       0.66      0.97      0.79     61139
 Extremely Late Response       0.00      0.00      0.00       276
           Late Response       0.00      0.00      0.00      7300
        On Time Response       0.00      0.00      0.00      5040
     Very Early Response       0.31      0.11      0.17     15333
      Very Late Response       0.68      0.43      0.53      1076

                accuracy                           0.64     95997
               macro avg       0.24      0.22      0.21     95997
            weighted avg       0.48      0.64      0.54     95997



### Decision Tree 2 (max_depth=6)

In [26]:
# Build a depth-6 decision tree and fit it on the training split
clf = DecisionTreeClassifier(max_depth=6, random_state=123).fit(X_train, y_train)
# Predicted class labels for the training rows
y_pred = clf.predict(X_train)
# Per-class probability estimates for the training rows
y_pred_proba = clf.predict_proba(X_train)

In [27]:
# Mean accuracy of the depth-6 tree on the training split
print(
    'Accuracy of Decision Tree classifier on training set: {:.2f}'.format(
        clf.score(X_train, y_train)
    )
)

Accuracy of Decision Tree classifier on training set: 0.65


In [28]:
# Mean accuracy of the depth-6 tree on the validate split
print(
    'Accuracy of Decision Tree classifier on validate set: {:.2f}'.format(
        clf.score(X_validate, y_validate)
    )
)

Accuracy of Decision Tree classifier on validate set: 0.65


### Decision Tree 3 (max_depth=12)

In [29]:
# Build a depth-12 decision tree and fit it on the training split
clf = DecisionTreeClassifier(max_depth=12, random_state=123).fit(X_train, y_train)
# Predicted class labels for the training rows
y_pred = clf.predict(X_train)
# Per-class probability estimates for the training rows
y_pred_proba = clf.predict_proba(X_train)

In [30]:
# Mean accuracy of the depth-12 tree on the training split
print(
    'Accuracy of Decision Tree classifier on training set: {:.2f}'.format(
        clf.score(X_train, y_train)
    )
)

Accuracy of Decision Tree classifier on training set: 0.66


In [31]:
# Mean accuracy of the depth-12 tree on the validate split
print(
    'Accuracy of Decision Tree classifier on validate set: {:.2f}'.format(
        clf.score(X_validate, y_validate)
    )
)

Accuracy of Decision Tree classifier on validate set: 0.65


### Vanilla KNN

In [None]:
# Counts for the 30 most frequent case types.
# NOTE(review): case_type is not in keep_info's keep-list, so the modeling frame built
# by model_df has no such column — confirm which df this cell is meant to run against.
df.case_type.value_counts()[:30]

In [None]:
# Column dtypes and non-null counts of the current df
df.info()

In [None]:
# Rebuild the modeling frame using model_df from the imported model module
# NOTE(review): an earlier cell calls the notebook-local model_df instead — confirm which is intended.
df = m.model_df()

In [None]:
# Summary of the rebuilt modeling frame
df.info()