In [12]:
import pandas as pd
import geopandas as gpd
import seaborn as sns
import numpy as np
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.datasets import load_iris
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_score
import datetime

In [5]:
# Execute the helper notebook in this kernel's namespace.
# NOTE(review): convert_to_categorical, prep_data and average_grid_searches are
# not defined or imported anywhere in this notebook, so they presumably come
# from ml_helpers.ipynb — confirm.
%run ml_helpers.ipynb

In [9]:
# Read the labelled CSV and immediately discard the six columns listed below.
data = pd.read_csv("../intermediate_data/high_crime_labeled.csv").drop(
    columns=["beat", "beat_num", "Crimes", "Arrest", "Domestic", "Serious"]
)

# Hand the listed columns to the convert_to_categorical helper (defined outside
# this cell; presumably it casts them to a categorical dtype — confirm).
data = convert_to_categorical(
    data,
    ["district", "sector", "Month", "Watch", "Beat"],
)

# "Unnamed: 0" is dropped last, after the conversion, matching the original
# ordering. Presumably it is the index column written out with the CSV.
data = data.drop(columns=["Unnamed: 0"])

In [10]:
# Preview the first five rows of `data` (five is the pandas default for head).
data.head(n=5)

Unnamed: 0,district,sector,Year,Month,Watch,Beat,count_l_stops,count_bus_stops,count_metra_stops,count_restaurants,...,count_businesses,road_distance_ft,TOTAL POPULATION,dist_to_police,dist_to_hospital,SNOW,TMAX,TMIN,high_crime,high_crime_geog_lag
0,17,1,2015,1,First,1713,3.0,26.0,0.0,63.0,...,92.0,169276.405792,13283.675264,5454.06889,3258.006066,0.214286,33.785714,24.642857,0.0,0.139442
1,17,1,2015,1,Second,1713,3.0,26.0,0.0,63.0,...,92.0,169276.405792,13283.675264,5454.06889,3258.006066,0.344444,32.65,23.0,0.0,0.139442
2,17,1,2015,1,Third,1713,3.0,26.0,0.0,63.0,...,92.0,169276.405792,13283.675264,5454.06889,3258.006066,0.4,31.8125,21.3125,0.0,0.139442
3,17,1,2015,2,First,1713,3.0,26.0,0.0,63.0,...,92.0,169276.405792,13283.675264,5454.06889,3258.006066,0.32,24.3,8.4,0.0,0.139442
4,17,1,2015,2,Second,1713,3.0,26.0,0.0,63.0,...,92.0,169276.405792,13283.675264,5454.06889,3258.006066,0.2,22.666667,9.083333,0.0,0.139442


In [34]:
# Estimators under comparison, keyed by their class name
# ('GaussianNB', 'LinearSVC'); both are built with default arguments.
models = {
    type(estimator).__name__: estimator
    for estimator in (GaussianNB(), LinearSVC())
}

# Parameter grid for GaussianNB: a single candidate that leaves `priors` unset.
NBgrid = {'GaussianNB': [{'priors': None}]}

# Parameter grid for LinearSVC: one candidate per value of C, each with
# random_state fixed at 0.
SVCgrid = {
    'LinearSVC': [
        {'C': penalty, 'random_state': 0}
        for penalty in (0.01, 0.1, 1, 10, 100)
    ]
}

In [15]:
# Build the prepared datasets with the prep_data helper (defined outside this
# cell), using "high_crime" and "Year" as the named columns and the listed
# categorical columns.
# NOTE(review): the literal 2 looks like a training-window length in years
# (the logged output shows pairs such as [2016 2017]) — confirm against prep_data.
data_list = prep_data(
    data,
    "high_crime",
    2,
    "Year",
    ["district", "sector", "Month", "Beat", "Watch"],
)

      district sector  Year Month   Watch  Beat  count_l_stops  \
29557       17      1  2018     1   First  1713            3.0   
29558       17      1  2018     1  Second  1713            3.0   
29559       17      1  2018     1   Third  1713            3.0   
29560       17      1  2018     2   First  1713            3.0   
29561       17      1  2018     2  Second  1713            3.0   
...        ...    ...   ...   ...     ...   ...            ...   
49256        3      1  2019    11  Second   312            2.0   
49257        3      1  2019    11   Third   312            2.0   
49258        3      1  2019    12   First   312            2.0   
49259        3      1  2019    12  Second   312            2.0   
49260        3      1  2019    12   Third   312            2.0   

       count_bus_stops  count_metra_stops  count_restaurants  ...  \
29557             26.0                0.0               63.0  ...   
29558             26.0                0.0               63.0  ...   


Finished normalizing training data
Finished normalizing test data...
Finished one-hot encoding...
Finished one-hot encoding...
Finished standardizing...
Working on: [2016 2017]
Have accessed train and test df...
Finished filling NAs with mean...
On to normalizing continuous...
Finished normalizing training data
Finished normalizing test data...
Finished one-hot encoding...
Finished one-hot encoding...
Finished standardizing...
Working on: [2015 2016]
Have accessed train and test df...
Finished filling NAs with mean...
On to normalizing continuous...
Finished normalizing training data
Finished normalizing test data...
Finished one-hot encoding...
Finished one-hot encoding...
Finished standardizing...


In [31]:
# Run the GaussianNB grid (NBgrid) over the prepared datasets via the
# average_grid_searches helper; "high_crime" and 2020 are passed through as-is
# (the helper logs 2020 as the test year).
nb_results = average_grid_searches(
    GaussianNB(),
    'GaussianNB',
    NBgrid,
    data_list,
    "high_crime",
    2020,
)

test year is: 2020
[2016, 2017, 2018, 2019]


In [32]:
# Display the GaussianNB summary returned by average_grid_searches.
nb_results

Unnamed: 0,parameters_2020,precision_2020,parameters_2019,precision_2019,parameters_2018,precision_2018,parameters_2017,precision_2017,mean
0,{'priors': None},0.353127,{'priors': None},0.380673,{'priors': None},0.349891,{'priors': None},0.364487,0.362045


In [35]:
# Run the LinearSVC grid (SVCgrid) over the prepared datasets via the
# average_grid_searches helper; "high_crime" and 2020 are passed through as-is
# (the helper logs 2020 as the test year).
svc_results = average_grid_searches(
    LinearSVC(),
    'LinearSVC',
    SVCgrid,
    data_list,
    "high_crime",
    2020,
)

test year is: 2020
[2016, 2017, 2018, 2019]


In [36]:
# Display the LinearSVC summary returned by average_grid_searches.
# NOTE(review): in the recorded output, precision for a single C value swings
# between 1.0 and roughly 0.25 across years. This may point to very few positive
# predictions or to LinearSVC not converging — investigate before trusting the mean.
svc_results

Unnamed: 0,parameters_2020,precision_2020,parameters_2019,precision_2019,parameters_2018,precision_2018,parameters_2017,precision_2017,mean
0,"{'C': 1, 'random_state': 0}",0.664671,"{'C': 1, 'random_state': 0}",1.0,"{'C': 1, 'random_state': 0}",1.0,"{'C': 1, 'random_state': 0}",0.266772,0.732861
0,"{'C': 0.01, 'random_state': 0}",0.888889,"{'C': 0.01, 'random_state': 0}",0.68382,"{'C': 0.01, 'random_state': 0}",1.0,"{'C': 0.01, 'random_state': 0}",0.282323,0.713758
0,"{'C': 0.1, 'random_state': 0}",0.241702,"{'C': 0.1, 'random_state': 0}",0.855696,"{'C': 0.1, 'random_state': 0}",1.0,"{'C': 0.1, 'random_state': 0}",0.266828,0.591057
0,"{'C': 10, 'random_state': 0}",0.664671,"{'C': 10, 'random_state': 0}",1.0,"{'C': 10, 'random_state': 0}",0.245893,"{'C': 10, 'random_state': 0}",0.266744,0.544327
0,"{'C': 100, 'random_state': 0}",0.664671,"{'C': 100, 'random_state': 0}",1.0,"{'C': 100, 'random_state': 0}",0.245893,"{'C': 100, 'random_state': 0}",0.266744,0.544327
