# Modeling

## Models to Run:
- KNN
- DT
- Find new algo

### Imports

In [1]:
import wrangle as w
import model as m
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score


### Acquire and Prepare

In [2]:
df = w.clean_311(w.get_311_data())

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 356715 entries, 1014281655 to 1016870499
Data columns (total 29 columns):
 #   Column                    Non-Null Count   Dtype         
---  ------                    --------------   -----         
 0   open_date                 356715 non-null  datetime64[ns]
 1   due_date                  356715 non-null  datetime64[ns]
 2   closed_date               356715 non-null  datetime64[ns]
 3   is_late                   356715 non-null  object        
 4   dept                      356715 non-null  object        
 5   call_reason               356715 non-null  object        
 6   case_type                 356715 non-null  object        
 7   case_status               356715 non-null  object        
 8   source_id                 356715 non-null  object        
 9   address                   356715 non-null  object        
 10  council_district          356715 non-null  int64         
 11  longitude                 356715 non-null  float64  

In [4]:
df = m.model_df()

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 356715 entries, 1014281655 to 1016870499
Data columns (total 44 columns):
 #   Column                      Non-Null Count   Dtype   
---  ------                      --------------   -----   
 0   dept                        356715 non-null  object  
 1   call_reason                 356715 non-null  object  
 2   source_id                   356715 non-null  object  
 3   council_district            356715 non-null  int64   
 4   resolution_days_due         356715 non-null  int64   
 5   level_of_delay              356715 non-null  category
 6   district_0                  356715 non-null  uint8   
 7   district_1                  356715 non-null  uint8   
 8   district_2                  356715 non-null  uint8   
 9   district_3                  356715 non-null  uint8   
 10  district_4                  356715 non-null  uint8   
 11  district_5                  356715 non-null  uint8   
 12  district_6                  356715 non-null  

#### Drop original dept, source_id, and call_reason

In [6]:
df.drop(columns=['dept', 
                 'call_reason', 
                 'source_id' ], inplace=True)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 356715 entries, 1014281655 to 1016870499
Data columns (total 41 columns):
 #   Column                      Non-Null Count   Dtype   
---  ------                      --------------   -----   
 0   council_district            356715 non-null  int64   
 1   resolution_days_due         356715 non-null  int64   
 2   level_of_delay              356715 non-null  category
 3   district_0                  356715 non-null  uint8   
 4   district_1                  356715 non-null  uint8   
 5   district_2                  356715 non-null  uint8   
 6   district_3                  356715 non-null  uint8   
 7   district_4                  356715 non-null  uint8   
 8   district_5                  356715 non-null  uint8   
 9   district_6                  356715 non-null  uint8   
 10  district_7                  356715 non-null  uint8   
 11  district_8                  356715 non-null  uint8   
 12  district_9                  356715 non-null  

#### Splitty Split

In [8]:
train, validate, test, X_train, y_train, X_validate, y_validate, X_test, y_test, train_scaled, validate_scaled, test_scaled = m.split_separate_scale(df, stratify_by= 'level_of_delay')

In [9]:
train.shape

(199760, 41)

In [10]:
validate.shape

(85612, 41)

In [11]:
test.shape

(71343, 41)

In [12]:
train.head()

Unnamed: 0_level_0,council_district,resolution_days_due,level_of_delay,district_0,district_1,district_2,district_3,district_4,district_5,district_6,...,misc,storm,streets,trades,traffic,waste,web_portal,311_mobile_app,constituent_call,internal_services_requests
CASEID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1016517770,5,2,Early Response,0,0,0,0,0,1,0,...,1,0,0,0,0,0,0,0,0,1
1016203792,2,9,Early Response,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
1016771848,5,3,Early Response,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
1016634860,10,6,Early Response,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
1016302671,4,1,On Time Response,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 356715 entries, 1014281655 to 1016870499
Data columns (total 41 columns):
 #   Column                      Non-Null Count   Dtype   
---  ------                      --------------   -----   
 0   council_district            356715 non-null  int64   
 1   resolution_days_due         356715 non-null  int64   
 2   level_of_delay              356715 non-null  category
 3   district_0                  356715 non-null  uint8   
 4   district_1                  356715 non-null  uint8   
 5   district_2                  356715 non-null  uint8   
 6   district_3                  356715 non-null  uint8   
 7   district_4                  356715 non-null  uint8   
 8   district_5                  356715 non-null  uint8   
 9   district_6                  356715 non-null  uint8   
 10  district_7                  356715 non-null  uint8   
 11  district_8                  356715 non-null  uint8   
 12  district_9                  356715 non-null  

#### Create the baseline

In [14]:
#baseline
train.level_of_delay.value_counts()

Early Response              148198
On Time Response             28857
Very Early Response          15299
Late Response                 5005
Very Late Response            1763
Extremely Late Response        555
Extremely Early Response        83
Name: level_of_delay, dtype: int64

In [15]:
#baseline accuracy will be early response
baseline = round((train.level_of_delay == 'Early Response').mean(), 2) *100

print(f'The baseline accuracy is: {baseline} %')

The baseline accuracy is: 74.0 %


#### Decision Tree

In [16]:
#make the thing
clf = DecisionTreeClassifier(max_depth=3, random_state=123)
#fit the thing
clf = clf.fit(X_train, y_train)
#predicitons
y_pred = clf.predict(X_train)
#probability
y_pred_proba = clf.predict_proba(X_train)

#### Evaluate Decision Tree

In [17]:
#compute the accuracy 
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf.score(X_train, y_train)))

Accuracy of Decision Tree classifier on training set: 0.84


In [18]:
#classification report
print(classification_report(y_train, y_pred))

  _warn_prf(average, modifier, msg_start, len(result))


                          precision    recall  f1-score   support

          Early Response       0.84      0.99      0.91    148198
Extremely Early Response       0.98      1.00      0.99        83
 Extremely Late Response       0.00      0.00      0.00       555
           Late Response       0.00      0.00      0.00      5005
        On Time Response       0.99      0.21      0.34     28857
     Very Early Response       0.77      1.00      0.87     15299
      Very Late Response       0.00      0.00      0.00      1763

                accuracy                           0.84    199760
               macro avg       0.51      0.46      0.44    199760
            weighted avg       0.83      0.84      0.79    199760



In [19]:
print('Accuracy of Decision Tree classifier on validate set: {:.2f}'
     .format(clf.score(X_validate, y_validate)))

Accuracy of Decision Tree classifier on validate set: 0.84


In [20]:
# Produce y_predictions that come from the X_validate
y_pred = clf.predict(X_validate)

# Compare actual y values (from validate) to predicted y_values from the model run on X_validate
print(classification_report(y_validate, y_pred))

  _warn_prf(average, modifier, msg_start, len(result))


                          precision    recall  f1-score   support

          Early Response       0.84      0.99      0.91     63514
Extremely Early Response       0.97      1.00      0.99        35
 Extremely Late Response       0.00      0.00      0.00       238
           Late Response       0.00      0.00      0.00      2145
        On Time Response       0.99      0.21      0.34     12368
     Very Early Response       0.78      1.00      0.88      6557
      Very Late Response       0.00      0.00      0.00       755

                accuracy                           0.84     85612
               macro avg       0.51      0.46      0.45     85612
            weighted avg       0.83      0.84      0.79     85612



In [21]:
train.columns.to_list()

['council_district',
 'resolution_days_due',
 'level_of_delay',
 'district_0',
 'district_1',
 'district_2',
 'district_3',
 'district_4',
 'district_5',
 'district_6',
 'district_7',
 'district_8',
 'district_9',
 'district_10',
 'animal_care_services',
 'code_enforcement_services',
 'customer_services',
 'development_services',
 'metro_health',
 'parks_and_rec',
 'solid_waste_management',
 'trans_and_cap_improvements',
 'unknown_dept',
 'buildings',
 'business',
 'cleanup',
 'code',
 'customer_service',
 'field',
 'land',
 'license',
 'misc',
 'storm',
 'streets',
 'trades',
 'traffic',
 'waste',
 'web_portal',
 '311_mobile_app',
 'constituent_call',
 'internal_services_requests']

#### Vanilla KNN

In [22]:
#Features I want to train on
features1 = ['council_district',
 'resolution_days_due',
 'district_0',
 'district_1',
 'district_2',
 'district_3',
 'district_4',
 'district_5',
 'district_6',
 'district_7',
 'district_8',
 'district_9',
 'district_10',
 'animal_care_services',
 'code_enforcement_services',
 'customer_services',
 'development_services',
 'metro_health',
 'parks_and_rec',
 'solid_waste_management',
 'trans_and_cap_improvements',
 'unknown_dept',
 'buildings',
 'business',
 'cleanup',
 'code',
 'customer_service',
 'field',
 'land',
 'license',
 'misc',
 'storm',
 'streets',
 'trades',
 'traffic',
 'waste',
 'web_portal',
 '311_mobile_app',
 'constituent_call',
 'internal_services_requests']
#make model
#weights = ['uniform', 'density']
knn = KNeighborsClassifier(n_neighbors=3, weights='uniform')
#fit model
knn.fit(X_train[features1], y_train)

KNeighborsClassifier(n_neighbors=3)

In [23]:
#mkae predictions
y_pred_knn = knn.predict(X_train[features1])
#estimate probability
y_pred_proba = knn.predict_proba(X_train[features1])

In [24]:
#accuracy
accuracy = knn.score(X_train[features1], y_train)
print(f"KNN Accuracy is {accuracy:.5}")

KNN Accuracy is 0.82906


In [25]:
#classification report
#confustion matrix
print(classification_report(y_train, y_pred_knn))

                          precision    recall  f1-score   support

          Early Response       0.85      0.95      0.90    148198
Extremely Early Response       0.97      1.00      0.98        83
 Extremely Late Response       0.68      0.65      0.67       555
           Late Response       0.31      0.14      0.19      5005
        On Time Response       0.64      0.32      0.43     28857
     Very Early Response       0.84      0.89      0.86     15299
      Very Late Response       0.69      0.24      0.35      1763

                accuracy                           0.83    199760
               macro avg       0.71      0.60      0.63    199760
            weighted avg       0.81      0.83      0.81    199760



#### Validate KNN

In [26]:
print('Accuracy of KNN classifier on test set: {:.2f}'
     .format(knn.score(X_validate[features1], y_validate)))


Accuracy of KNN classifier on test set: 0.82


#### Chocolate KNN

In [27]:
#Features I want to train on
features1 = ['council_district',
 'resolution_days_due',
 'district_0',
 'district_1',
 'district_2',
 'district_3',
 'district_4',
 'district_5',
 'district_6',
 'district_7',
 'district_8',
 'district_9',
 'district_10',
 'animal_care_services',
 'code_enforcement_services',
 'customer_services',
 'development_services',
 'metro_health',
 'parks_and_rec',
 'solid_waste_management',
 'trans_and_cap_improvements',
 'unknown_dept',
 'buildings',
 'business',
 'cleanup',
 'code',
 'customer_service',
 'field',
 'land',
 'license',
 'misc',
 'storm',
 'streets',
 'trades',
 'traffic',
 'waste',
 'web_portal',
 '311_mobile_app',
 'constituent_call',
 'internal_services_requests']
#make model
#weights = ['uniform', 'density']
knn = KNeighborsClassifier(n_neighbors=5, weights='uniform')
#fit model
knn.fit(X_train[features1], y_train)

KNeighborsClassifier()

In [28]:
#mkae predictions
y_pred_knn = knn.predict(X_train[features1])
#estimate probability
y_pred_proba = knn.predict_proba(X_train[features1])

In [29]:
#accuracy
accuracy = knn.score(X_train[features1], y_train)
print(f"KNN Accuracy is {accuracy:.5}")

KNN Accuracy is 0.83525


In [30]:
#classification report
#confustion matrix
print(classification_report(y_train, y_pred_knn))

                          precision    recall  f1-score   support

          Early Response       0.86      0.96      0.90    148198
Extremely Early Response       0.97      1.00      0.98        83
 Extremely Late Response       0.68      0.60      0.64       555
           Late Response       0.41      0.16      0.23      5005
        On Time Response       0.66      0.33      0.44     28857
     Very Early Response       0.83      0.92      0.87     15299
      Very Late Response       0.66      0.23      0.34      1763

                accuracy                           0.84    199760
               macro avg       0.72      0.60      0.63    199760
            weighted avg       0.81      0.84      0.81    199760



In [31]:
print('Accuracy of KNN classifier on test set: {:.2f}'
     .format(knn.score(X_validate[features1], y_validate)))

Accuracy of KNN classifier on test set: 0.83


### New Feature to Create
- sq ft by days late

- sq miles gathered from:
    https://sa2020.org/city-council-profiles

In [32]:
df = w.clean_311(w.get_311_data())

In [33]:
df.shape

(356715, 29)

In [34]:
df.days_before_or_after_due.value_counts()[:15]

 2.0      35112
 5.0      28584
 4.0      27093
 6.0      26974
 1.0      26369
 0.0      22672
 3.0      22112
 7.0      10257
 10.0      9454
 60.0      7618
-1.0       7070
 63.0      5446
 64.0      5247
 8.0       3759
 132.0     3489
-2.0       3092
 127.0     2923
 62.0      2848
 14.0      2137
-3.0       2048
 9.0       2034
 130.0     1921
 13.0      1902
 128.0     1651
 129.0     1632
 126.0     1571
-4.0       1550
 65.0      1520
 61.0      1421
 66.0      1389
 12.0      1250
 11.0      1222
 55.0      1121
 56.0      1114
 57.0      1106
 28.0      1101
 15.0      1079
Name: days_before_or_after_due, dtype: int64

In [35]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 356715 entries, 1014281655 to 1016870499
Data columns (total 29 columns):
 #   Column                    Non-Null Count   Dtype         
---  ------                    --------------   -----         
 0   open_date                 356715 non-null  datetime64[ns]
 1   due_date                  356715 non-null  datetime64[ns]
 2   closed_date               356715 non-null  datetime64[ns]
 3   is_late                   356715 non-null  object        
 4   dept                      356715 non-null  object        
 5   call_reason               356715 non-null  object        
 6   case_type                 356715 non-null  object        
 7   case_status               356715 non-null  object        
 8   source_id                 356715 non-null  object        
 9   address                   356715 non-null  object        
 10  council_district          356715 non-null  int64         
 11  longitude                 356715 non-null  float64  

In [None]:
df.council_district.value_counts()

In [None]:
# convert square miles to square feet
# 1 sq mi = 27,878,399.996383 sq ft
# maybe dont convert it to sqft bc its so big
d1 = 26.00
d2 = 59.81
d3 = 116.15
d4 = 65.21
d5 = 22.24
d6 = 38.44
d7 = 32.82
d8 = 71.64
d9 = 48.71
d10 = 55.62

In [36]:
df = df[df.council_district != 0]

In [37]:
df.council_district.value_counts()

5     49986
2     46987
1     45889
3     45709
4     34595
7     33447
6     30364
10    29372
9     19437
8     19435
Name: council_district, dtype: int64

In [38]:
df.shape

(355221, 29)

In [39]:
def get_sq_miles(council_district):
    """
    This function will apply the square miles per district
    to each district.
    """
    d1 = 26.00
    d2 = 59.81
    d3 = 116.15
    d4 = 65.21
    d5 = 22.24
    d6 = 38.44
    d7 = 32.82
    d8 = 71.64
    d9 = 48.71
    d10 = 55.62
    if council_district == 1:
        return d1
    elif council_district == 2:
        return d2
    elif council_district == 3:
        return d3
    elif council_district == 4:
        return d4
    elif council_district == 5:
        return d5
    elif council_district == 6:
        return d6
    elif council_district == 7:
        return d7
    elif council_district == 8:
        return d8
    elif council_district == 9:
        return d9
    else:
        return d10
    
def sq_miles_by_days(df):
    """
    This function takes in one positional argument:
    1.  311 df
    This function returns:
    1. square mile per district
    2. the amount of days before or after due per square mile
    """
    # get square miles for each district
    df['sq_miles'] = df['council_district'].apply(get_sq_miles)
    # get the amount of days per square mile
    df['sq_miles_by_days'] = df.sq_miles / df.days_before_or_after_due
    return df

In [None]:
df['sq_miles'] = df['council_district'].apply(get_sq_miles)

In [None]:
df.sq_miles.value_counts()

In [None]:
df['sq_miles_by_days'] = df.sq_miles / df.days_before_or_after_due

In [None]:
#pd.set_option("display.max_rows", None, "display.max_columns", None)

In [None]:
df.head()

In [None]:
d1_avg_days_per_sq_mile = df[df.]
waste = train[train.call_reason == 'waste']

In [None]:
df2 = w.clean_311(w.get_311_data())

In [40]:
df = sq_miles_by_days(df)

In [41]:
df.head()

Unnamed: 0_level_0,open_date,due_date,closed_date,is_late,dept,call_reason,case_type,case_status,source_id,address,...,district_4,district_5,district_6,district_7,district_8,district_9,district_10,zipcode,sq_miles,sq_miles_by_days
CASEID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1014281655,2018-02-28,2018-07-09,2020-06-06,YES,Code Enforcement Services,code,Certificates of Occupancy,Closed,Internal Services Requests,"5550 EASTERLING, San Antonio, 78251",...,0,0,1,0,0,0,0,78251,38.44,-0.055072
1014281665,2018-02-28,2018-07-09,2020-06-06,YES,Code Enforcement Services,code,Zoning: District Requirement,Closed,Internal Services Requests,"5550 EASTERLING, San Antonio, 78251",...,0,0,1,0,0,0,0,78251,38.44,-0.055072
1014281677,2018-02-28,2018-07-09,2020-06-06,YES,Code Enforcement Services,code,Certificates of Occupancy,Closed,Internal Services Requests,"5550 EASTERLING, San Antonio, 78251",...,0,0,1,0,0,0,0,78251,38.44,-0.055072
1014300321,2018-03-06,2020-09-24,2020-05-18,NO,Development Services,buildings,Street Light Existing Res Sub Div,Closed,Constituent Call,"1018 KING AVE, San Antonio, 78211",...,1,0,0,0,0,0,0,78211,65.21,0.505504
1014326068,2018-03-14,2018-07-23,2020-05-27,YES,Code Enforcement Services,code,Zoning: District Requirement,Closed,Internal Services Requests,"11800 FISCHER RD, San Antonio, 78073",...,1,0,0,0,0,0,0,78073,65.21,-0.096751


In [43]:
df.days_before_or_after_due.value_counts()[:10]

2.0     35087
5.0     28539
4.0     27069
6.0     26922
1.0     26317
0.0     22614
3.0     22075
7.0     10188
10.0     9384
Name: days_before_or_after_due, dtype: int64