# Modeling

## Models to Run:
- KNN
- DT
- Find new algo

### Imports

In [1]:
import wrangle as w
import model as m
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score


### Acquire and Prepare

In [2]:
df = w.clean_311(w.get_311_data())

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 356804 entries, 551 to 399979
Data columns (total 36 columns):
 #   Column                    Non-Null Count   Dtype         
---  ------                    --------------   -----         
 0   open_date                 356804 non-null  datetime64[ns]
 1   due_date                  356804 non-null  datetime64[ns]
 2   closed_date               356804 non-null  datetime64[ns]
 3   is_late                   356804 non-null  object        
 4   dept                      356804 non-null  object        
 5   call_reason               356804 non-null  object        
 6   case_type                 356804 non-null  object        
 7   case_status               356804 non-null  object        
 8   source_id                 356804 non-null  object        
 9   address                   356804 non-null  object        
 10  council_district          356804 non-null  int64         
 11  longitude                 356804 non-null  float64       
 12  

In [4]:
df = m.model_df()

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 356804 entries, 551 to 399979
Data columns (total 43 columns):
 #   Column                      Non-Null Count   Dtype   
---  ------                      --------------   -----   
 0   dept                        356804 non-null  object  
 1   call_reason                 356804 non-null  object  
 2   source_id                   356804 non-null  object  
 3   council_district            356804 non-null  int64   
 4   resolution_days_due         356804 non-null  int64   
 5   level_of_delay              356804 non-null  category
 6   district_1                  356804 non-null  uint8   
 7   district_2                  356804 non-null  uint8   
 8   district_3                  356804 non-null  uint8   
 9   district_4                  356804 non-null  uint8   
 10  district_5                  356804 non-null  uint8   
 11  district_6                  356804 non-null  uint8   
 12  district_7                  356804 non-null  uint8   
 1

#### Drop original dept, source_id, and call_reason

In [6]:
df.drop(columns=['dept', 
                 'call_reason', 
                 'source_id' ], inplace=True)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 356804 entries, 551 to 399979
Data columns (total 40 columns):
 #   Column                      Non-Null Count   Dtype   
---  ------                      --------------   -----   
 0   council_district            356804 non-null  int64   
 1   resolution_days_due         356804 non-null  int64   
 2   level_of_delay              356804 non-null  category
 3   district_1                  356804 non-null  uint8   
 4   district_2                  356804 non-null  uint8   
 5   district_3                  356804 non-null  uint8   
 6   district_4                  356804 non-null  uint8   
 7   district_5                  356804 non-null  uint8   
 8   district_6                  356804 non-null  uint8   
 9   district_7                  356804 non-null  uint8   
 10  district_8                  356804 non-null  uint8   
 11  district_9                  356804 non-null  uint8   
 12  district_10                 356804 non-null  uint8   
 1

#### Splitty Split

In [8]:
train, validate, test, X_train, y_train, X_validate, y_validate, X_test, y_test, train_scaled, validate_scaled, test_scaled = m.split_separate_scale(df, stratify_by= 'level_of_delay')

In [9]:
train.shape

(199810, 40)

In [10]:
validate.shape

(85633, 40)

In [11]:
test.shape

(71361, 40)

In [12]:
train.head()

Unnamed: 0,council_district,resolution_days_due,level_of_delay,district_1,district_2,district_3,district_4,district_5,district_6,district_7,...,misc,storm,streets,trades,traffic,waste,web_portal,311_mobile_app,constituent_call,internal_services_requests
196649,5,4,Early Response,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,1
192441,10,8,Very Early Response,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
119552,6,3,Very Early Response,0,0,0,0,0,1,0,...,1,0,0,0,0,0,0,0,0,1
336354,10,8,Early Response,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
59110,10,65,Late Response,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 356804 entries, 551 to 399979
Data columns (total 40 columns):
 #   Column                      Non-Null Count   Dtype   
---  ------                      --------------   -----   
 0   council_district            356804 non-null  int64   
 1   resolution_days_due         356804 non-null  int64   
 2   level_of_delay              356804 non-null  category
 3   district_1                  356804 non-null  uint8   
 4   district_2                  356804 non-null  uint8   
 5   district_3                  356804 non-null  uint8   
 6   district_4                  356804 non-null  uint8   
 7   district_5                  356804 non-null  uint8   
 8   district_6                  356804 non-null  uint8   
 9   district_7                  356804 non-null  uint8   
 10  district_8                  356804 non-null  uint8   
 11  district_9                  356804 non-null  uint8   
 12  district_10                 356804 non-null  uint8   
 1

#### Create the baseline

In [14]:
#baseline
train.level_of_delay.value_counts()

Very Early Response    114280
Early Response          36971
Late Response           33599
On Time Response        11753
Very Late Response       3207
Name: level_of_delay, dtype: int64

In [29]:
#baseline accuracy will be early response
baseline = round((train.level_of_delay == 'Very Early Response').mean(), 2) *100

print(f'The baseline accuracy is: {baseline} %')

The baseline accuracy is: 56.99999999999999 %


#### Decision Tree

In [51]:
#make the thing
clf = DecisionTreeClassifier(max_depth=8, random_state=123)
#fit the thing
clf = clf.fit(X_train, y_train)
#predicitons
y_pred = clf.predict(X_train)
#probability
y_pred_proba = clf.predict_proba(X_train)

#### Evaluate Decision Tree

In [52]:
#compute the accuracy 
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf.score(X_train, y_train)))

Accuracy of Decision Tree classifier on training set: 0.66


In [53]:
#classification report
print(classification_report(y_train, y_pred))

                     precision    recall  f1-score   support

     Early Response       0.59      0.28      0.38     36971
      Late Response       0.68      0.27      0.39     33599
   On Time Response       0.50      0.00      0.00     11753
Very Early Response       0.66      0.97      0.79    114280
 Very Late Response       1.00      0.49      0.66      3207

           accuracy                           0.66    199810
          macro avg       0.68      0.40      0.44    199810
       weighted avg       0.65      0.66      0.60    199810



In [54]:
print('Accuracy of Decision Tree classifier on validate set: {:.2f}'
     .format(clf.score(X_validate, y_validate)))

Accuracy of Decision Tree classifier on validate set: 0.66


In [55]:
# Produce y_predictions that come from the X_validate
y_pred = clf.predict(X_validate)

# Compare actual y values (from validate) to predicted y_values from the model run on X_validate
print(classification_report(y_validate, y_pred))

                     precision    recall  f1-score   support

     Early Response       0.58      0.28      0.38     15845
      Late Response       0.67      0.27      0.38     14399
   On Time Response       0.00      0.00      0.00      5037
Very Early Response       0.66      0.97      0.79     48978
 Very Late Response       0.99      0.50      0.66      1374

           accuracy                           0.66     85633
          macro avg       0.58      0.40      0.44     85633
       weighted avg       0.62      0.66      0.60     85633



In [35]:
train.columns.to_list()

['council_district',
 'resolution_days_due',
 'level_of_delay',
 'district_1',
 'district_2',
 'district_3',
 'district_4',
 'district_5',
 'district_6',
 'district_7',
 'district_8',
 'district_9',
 'district_10',
 'animal_care_services',
 'code_enforcement_services',
 'customer_services',
 'development_services',
 'metro_health',
 'parks_and_rec',
 'solid_waste_management',
 'trans_and_cap_improvements',
 'unknown_dept',
 'buildings',
 'business',
 'cleanup',
 'code',
 'customer_service',
 'field',
 'land',
 'license',
 'misc',
 'storm',
 'streets',
 'trades',
 'traffic',
 'waste',
 'web_portal',
 '311_mobile_app',
 'constituent_call',
 'internal_services_requests']

#### Vanilla KNN

In [65]:
#Features I want to train on
features1 = ['resolution_days_due', 'customer_services', 'development_services', 'solid_waste_management', 'customer_service', 'field', '311_mobile_app']
#weights = ['uniform', 'density']
knn = KNeighborsClassifier(n_neighbors=6, weights='uniform')
#fit model
knn.fit(X_train[features1], y_train)

KNeighborsClassifier(n_neighbors=6)

In [None]:
#mkae predictions
y_pred_knn = knn.predict(X_train[features1])
#estimate probability
y_pred_proba = knn.predict_proba(X_train[features1])

In [64]:
#accuracy
accuracy = knn.score(X_train[features1], y_train)
print(f"KNN Accuracy is {accuracy:.5}")

KNN Accuracy is 0.59081


In [61]:
#classification report
#confustion matrix
print(classification_report(y_train, y_pred_knn))

                     precision    recall  f1-score   support

     Early Response       0.32      0.44      0.37     36971
      Late Response       0.55      0.35      0.43     33599
   On Time Response       0.15      0.01      0.01     11753
Very Early Response       0.71      0.77      0.74    114280
 Very Late Response       1.00      0.48      0.65      3207

           accuracy                           0.59    199810
          macro avg       0.55      0.41      0.44    199810
       weighted avg       0.58      0.59      0.57    199810



#### Validate KNN

In [62]:
print('Accuracy of KNN classifier on test set: {:.2f}'
     .format(knn.score(X_validate[features1], y_validate)))


Accuracy of KNN classifier on test set: 0.59


#### Chocolate KNN

In [None]:
#Features I want to train on
features1 = ['council_district',
 'resolution_days_due',
 'district_1',
 'district_2',
 'district_3',
 'district_4',
 'district_5',
 'district_6',
 'district_7',
 'district_8',
 'district_9',
 'district_10',
 'animal_care_services',
 'code_enforcement_services',
 'customer_services',
 'development_services',
 'metro_health',
 'parks_and_rec',
 'solid_waste_management',
 'trans_and_cap_improvements',
 'unknown_dept',
 'buildings',
 'business',
 'cleanup',
 'code',
 'customer_service',
 'field',
 'land',
 'license',
 'misc',
 'storm',
 'streets',
 'trades',
 'traffic',
 'waste',
 'web_portal',
 '311_mobile_app',
 'constituent_call',
 'internal_services_requests']
#make model
#weights = ['uniform', 'density']
knn = KNeighborsClassifier(n_neighbors=5, weights='uniform')
#fit model
knn.fit(X_train[features1], y_train)

In [None]:
#make predictions
y_pred_knn = knn.predict(X_train[features1])
#estimate probability
y_pred_proba = knn.predict_proba(X_train[features1])

In [None]:
#accuracy
accuracy = knn.score(X_train[features1], y_train)
print(f"KNN Accuracy is {accuracy:.5}")

In [None]:
#classification report
#confustion matrix
print(classification_report(y_train, y_pred_knn))

In [None]:
print('Accuracy of KNN classifier on test set: {:.2f}'
     .format(knn.score(X_validate[features1], y_validate)))

### New Feature to Create
- sq ft by days late

- sq miles gathered from:
    https://sa2020.org/city-council-profiles

In [None]:
df = w.clean_311(w.get_311_data())

In [None]:
df.shape

In [None]:
df.days_before_or_after_due.value_counts()[:15]

In [None]:
df.info()

In [None]:
df.council_district.value_counts()

In [None]:
# convert square miles to square feet
# 1 sq mi = 27,878,399.996383 sq ft
# maybe dont convert it to sqft bc its so big
d1 = 26.00
d2 = 59.81
d3 = 116.15
d4 = 65.21
d5 = 22.24
d6 = 38.44
d7 = 32.82
d8 = 71.64
d9 = 48.71
d10 = 55.62

In [None]:
df = df[df.council_district != 0]

In [None]:
df.council_district.value_counts()

In [None]:
df.shape

In [None]:
def get_sq_miles(council_district):
    """
    This function will apply the square miles per district
    to each district.
    """
    d1 = 26.00
    d2 = 59.81
    d3 = 116.15
    d4 = 65.21
    d5 = 22.24
    d6 = 38.44
    d7 = 32.82
    d8 = 71.64
    d9 = 48.71
    d10 = 55.62
    if council_district == 1:
        return d1
    elif council_district == 2:
        return d2
    elif council_district == 3:
        return d3
    elif council_district == 4:
        return d4
    elif council_district == 5:
        return d5
    elif council_district == 6:
        return d6
    elif council_district == 7:
        return d7
    elif council_district == 8:
        return d8
    elif council_district == 9:
        return d9
    else:
        return d10
    
def sq_miles_by_days(df):
    """
    This function takes in one positional argument:
    1.  311 df
    This function returns:
    1. square mile per district
    2. the amount of days before or after due per square mile
    """
    # get square miles for each district
    df['sq_miles'] = df['council_district'].apply(get_sq_miles)
    # get the amount of days per square mile
    df['sq_miles_by_days'] = df.sq_miles / df.days_before_or_after_due
    return df

In [None]:
df['sq_miles'] = df['council_district'].apply(get_sq_miles)

In [None]:
df.sq_miles.value_counts()

In [None]:
df['sq_miles_by_days'] = df.sq_miles / df.days_before_or_after_due

In [None]:
#pd.set_option("display.max_rows", None, "display.max_columns", None)

In [None]:
df.head()

In [None]:
d1_avg_days_per_sq_mile = df[df.]
waste = train[train.call_reason == 'waste']

In [None]:
df2 = w.clean_311(w.get_311_data())

In [None]:
df = sq_miles_by_days(df)

In [None]:
df.head()

In [None]:
df.days_before_or_after_due.value_counts()[:10]