### Logistic Regression

#### Import Packages

In [33]:
# Execute the companion notebooks with IPython's %run magic.
# NOTE(review): convert_to_categorical, prep_data and average_grid_searches are
# called further down but are not defined in this notebook — presumably they
# come from one of these two notebooks; confirm.
%run ml_helpers.ipynb
%run ml.ipynb

         Unnamed: 0  Beat  Year  Month   Watch  PRCP  SNOW  TMAX  TMIN  \
1327630     1327630   331  2019     12  Second  0.00   0.0    61    40   
1313404     1313404  2533  2019     12  Second  0.00   0.0    44    32   
1287559     1287559   124  2019     10  Second  3.45   0.0    51    39   
1152009     1152009  2222  2019      5   Third  0.33   0.0    49    43   
1173863     1173863   822  2019      5   Third  0.00   0.0    83    61   
...             ...   ...   ...    ...     ...   ...   ...   ...   ...   
1212594     1212594   723  2019      7  Second  0.00   0.0    95    81   
1321188     1321188  2532  2019     12   Third  0.01   0.3    27    20   
1259246     1259246  2222  2019      9  Second  0.00   0.0    79    66   
1242483     1242483  1731  2019      8   First  0.56   0.0    78    66   
1188505     1188505  1225  2019      6  Second  0.37   0.0    75    56   

         count_l_stops  ...  count_restaurants  count_bars  count_daycares  \
1327630            0.0  ...      

Finished normalizing training data
Finished normalizing test data...
Finished one-hot encoding...
Finished one-hot encoding...
Finished standardizing...
Working on: [2015]
Have accessed train and test df...
Finished filling NAs with mean...
On to normalizing continuous...
Finished normalizing training data
Finished normalizing test data...
Finished one-hot encoding...
Finished one-hot encoding...
Finished standardizing...


In [17]:
import pandas as pd
import geopandas as gpd
import seaborn as sns
import numpy as np
import sklearn as sk
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_score
from sklearn.decomposition import PCA
import datetime

#### Read and Process Data

In [37]:
# Load the labelled table, remove the columns that are not used as features,
# convert the identifier-like columns to categoricals, and finally discard the
# leftover "Unnamed: 0" column that came in with the CSV.
columns_to_discard = ["beat", "beat_num", "Crimes", "Arrest", "Domestic", "Serious"]
columns_to_categorize = ["district", "sector", "Month", "Watch", "Beat"]

data = pd.read_csv("../intermediate_data/high_crime_labeled.csv").drop(
    columns=columns_to_discard
)
data = convert_to_categorical(data, columns_to_categorize)
data = data.drop(columns=["Unnamed: 0"])


In [38]:
# Inspect the column dtypes after the categorical conversion above.
data.dtypes

district               category
sector                 category
Year                      int64
Month                  category
Watch                  category
Beat                   category
TMAX                    float64
TMIN                    float64
PRCP                    float64
SNOW                    float64
high_crime              float64
count_l_stops           float64
count_bus_stops         float64
count_metra_stops       float64
count_restaurants       float64
count_bars              float64
count_daycares          float64
count_entertainment     float64
count_businesses        float64
road_distance_ft        float64
TOTAL POPULATION        float64
dist_to_police          float64
dist_to_hospital        float64
high_crime_geog_lag     float64
dtype: object

#### Setup ML

In [9]:
# Estimator lookup keyed by model name; the key matches the one used in `grid`.
models = dict(LogisticRegression=LogisticRegression())
# Hyper-parameter grid: every penalty paired with every C value (penalty varies
# slowest), each with a fixed random_state, giving 2 x 7 = 14 parameter dicts.
# NOTE(review): scikit-learn documents that C is ignored when penalty is
# 'none', so the seven 'none' entries presumably fit the same model — confirm
# and consider collapsing them to a single entry.
LOGISTIC_PENALTIES = ('l2', 'none')
LOGISTIC_C_VALUES = (0.001, 0.01, 0.1, 1, 10, 100, 1000)

grid = {
    'LogisticRegression': [
        {'penalty': penalty_name, 'C': c_value, 'random_state': 0}
        for penalty_name in LOGISTIC_PENALTIES
        for c_value in LOGISTIC_C_VALUES
    ],
}

In [10]:
# Display the generated grid to check the penalty / C combinations.
grid

{'LogisticRegression': [{'penalty': 'l2', 'C': 0.001, 'random_state': 0},
  {'penalty': 'l2', 'C': 0.01, 'random_state': 0},
  {'penalty': 'l2', 'C': 0.1, 'random_state': 0},
  {'penalty': 'l2', 'C': 1, 'random_state': 0},
  {'penalty': 'l2', 'C': 10, 'random_state': 0},
  {'penalty': 'l2', 'C': 100, 'random_state': 0},
  {'penalty': 'l2', 'C': 1000, 'random_state': 0},
  {'penalty': 'none', 'C': 0.001, 'random_state': 0},
  {'penalty': 'none', 'C': 0.01, 'random_state': 0},
  {'penalty': 'none', 'C': 0.1, 'random_state': 0},
  {'penalty': 'none', 'C': 1, 'random_state': 0},
  {'penalty': 'none', 'C': 10, 'random_state': 0},
  {'penalty': 'none', 'C': 100, 'random_state': 0},
  {'penalty': 'none', 'C': 1000, 'random_state': 0}]}

In [40]:
# Prepare the modelling inputs twice from the same frame: once as-is and once
# with the listed population / distance / count columns handed to the PCA option.
# NOTE(review): the positional 2 looks like a training-window length in years
# (the log lines show windows such as [2017 2018]) — confirm against prep_data.
categorical_features = ["district", "sector", "Month", "Beat", "Watch"]
pca_features = [
    "TOTAL POPULATION", "dist_to_police", "dist_to_hospital",
    "count_l_stops", "count_bus_stops", "count_metra_stops",
    "count_restaurants", "count_bars", "count_daycares",
    "count_entertainment", "count_businesses", "road_distance_ft",
]

# Each call receives its own copy of the categorical list, as in the original
# where two separate list literals were passed.
data_list = prep_data(data, "high_crime", 2, "Year", list(categorical_features))

data_list_pca = prep_data(
    data, "high_crime", 2, "Year", list(categorical_features),
    pca="yes!",
    columns_to_pca=pca_features,
)

      district sector  Year Month   Watch  Beat       TMAX       TMIN  PRCP  \
29557       17      1  2018     1   First  1713  33.201426  18.884223  1.12   
29558       17      1  2018     1  Second  1713  33.201426  18.884223  1.12   
29559       17      1  2018     1   Third  1713  33.201426  18.884223  1.12   
29560       17      1  2018     2   Third  1713  39.791635  22.439919  5.47   
29561       17      1  2018     2   First  1713  39.791635  22.439919  5.47   
...        ...    ...   ...   ...     ...   ...        ...        ...   ...   
49256        3      1  2019    11  Second   312  43.761013  29.648049  1.70   
49257        3      1  2019    11   Third   312  43.761013  29.648049  1.70   
49258        3      1  2019    12  Second   312  42.647269  27.854899  1.29   
49259        3      1  2019    12   First   312  42.647269  27.854899  1.29   
49260        3      1  2019    12   Third   312  42.647269  27.854899  1.29   

       SNOW  ... count_restaurants  count_bars  cou

Finished normalizing test data...
Finished one-hot encoding...
Finished one-hot encoding...
Finished standardizing...
Working on: [2017 2018]
Have accessed train and test df...
Finished filling NAs with mean...
On to normalizing continuous...
Finished normalizing training data
Finished normalizing test data...
Finished one-hot encoding...
Finished one-hot encoding...
Finished standardizing...
Working on: [2016 2017]
Have accessed train and test df...
Finished filling NAs with mean...
On to normalizing continuous...
Finished normalizing training data
Finished normalizing test data...
Finished one-hot encoding...
Finished one-hot encoding...
Finished standardizing...
Working on: [2015 2016]
Have accessed train and test df...
Finished filling NAs with mean...
On to normalizing continuous...
Finished normalizing training data
Finished normalizing test data...
Finished one-hot encoding...
Finished one-hot encoding...
Finished standardizing...
      district sector  Year Month   Watch  Beat 

Finished normalizing training data
Finished normalizing test data...
Finished one-hot encoding...
Finished one-hot encoding...
Finished standardizing...
0    0.854452
1    0.854452
2    0.854452
3    0.854452
4    0.854452
Name: principal component 1, dtype: float64
(19704, 2) (19704, 322)
Working on: [2017 2018]
Have accessed train and test df...
Finished filling NAs with mean...
On to normalizing continuous...
Finished normalizing training data
Finished normalizing test data...
Finished one-hot encoding...
Finished one-hot encoding...
Finished standardizing...
0    0.852368
1    0.852368
2    0.852368
3    0.852368
4    0.852368
Name: principal component 1, dtype: float64
(19695, 2) (19695, 322)
Working on: [2016 2017]
Have accessed train and test df...
Finished filling NAs with mean...
On to normalizing continuous...
Finished normalizing training data
Finished normalizing test data...
Finished one-hot encoding...
Finished one-hot encoding...
Finished standardizing...
0    0.85269
1 

In [43]:
def _logistic_grid_search(prepared_splits):
    """Run the averaged grid search for logistic regression on one prepared data list.

    A fresh LogisticRegression instance is created per call; the target column
    is "high_crime" and 2020 is passed as the final (test-year) argument.
    """
    return average_grid_searches(
        LogisticRegression(), "LogisticRegression", grid,
        prepared_splits, "high_crime", 2020,
    )


# PCA variant first, then the plain feature set (same order as before).
logistic_results_pca = _logistic_grid_search(data_list_pca)
logistic_results = _logistic_grid_search(data_list)

test year is: 2020
[2016, 2017, 2018, 2019]
test year is: 2020
[2016, 2017, 2018, 2019]


In [12]:
# Write the non-PCA grid-search results to disk.
# NOTE(review): logistic_results_pca is not written out in the cells visible
# here — confirm that is intentional.
logistic_results.to_csv("../final_data/log_reg_results.csv")

In [46]:
# Show the grid-search results for the PCA feature set (per-year precision and mean).
logistic_results_pca

Unnamed: 0,parameters_2020,precision_2020,parameters_2019,precision_2019,parameters_2018,precision_2018,parameters_2017,precision_2017,mean
0,"{'penalty': 'l2', 'C': 0.01, 'random_state': 0}",0.665269,"{'penalty': 'l2', 'C': 0.01, 'random_state': 0}",0.743573,"{'penalty': 'l2', 'C': 0.01, 'random_state': 0}",0.72296,"{'penalty': 'l2', 'C': 0.01, 'random_state': 0}",0.744444,0.719062
0,"{'penalty': 'l2', 'C': 0.1, 'random_state': 0}",0.649752,"{'penalty': 'l2', 'C': 0.1, 'random_state': 0}",0.716374,"{'penalty': 'l2', 'C': 0.1, 'random_state': 0}",0.70702,"{'penalty': 'l2', 'C': 0.1, 'random_state': 0}",0.722826,0.698993
0,"{'penalty': 'none', 'C': 0.01, 'random_state': 0}",0.64107,"{'penalty': 'none', 'C': 0.01, 'random_state': 0}",0.704713,"{'penalty': 'none', 'C': 0.01, 'random_state': 0}",0.699248,"{'penalty': 'none', 'C': 0.01, 'random_state': 0}",0.710395,0.688857
0,"{'penalty': 'none', 'C': 0.1, 'random_state': 0}",0.64107,"{'penalty': 'none', 'C': 0.1, 'random_state': 0}",0.704713,"{'penalty': 'none', 'C': 0.1, 'random_state': 0}",0.699248,"{'penalty': 'none', 'C': 0.1, 'random_state': 0}",0.710395,0.688857
0,"{'penalty': 'none', 'C': 1, 'random_state': 0}",0.64107,"{'penalty': 'none', 'C': 1, 'random_state': 0}",0.704713,"{'penalty': 'none', 'C': 1, 'random_state': 0}",0.699248,"{'penalty': 'none', 'C': 1, 'random_state': 0}",0.710395,0.688857
0,"{'penalty': 'none', 'C': 10, 'random_state': 0}",0.64107,"{'penalty': 'none', 'C': 10, 'random_state': 0}",0.704713,"{'penalty': 'none', 'C': 10, 'random_state': 0}",0.699248,"{'penalty': 'none', 'C': 10, 'random_state': 0}",0.710395,0.688857
0,"{'penalty': 'none', 'C': 100, 'random_state': 0}",0.64107,"{'penalty': 'none', 'C': 100, 'random_state': 0}",0.704713,"{'penalty': 'none', 'C': 100, 'random_state': 0}",0.699248,"{'penalty': 'none', 'C': 100, 'random_state': 0}",0.710395,0.688857
0,"{'penalty': 'l2', 'C': 10, 'random_state': 0}",0.643672,"{'penalty': 'l2', 'C': 10, 'random_state': 0}",0.694959,"{'penalty': 'l2', 'C': 10, 'random_state': 0}",0.700112,"{'penalty': 'l2', 'C': 10, 'random_state': 0}",0.714117,0.688215
0,"{'penalty': 'l2', 'C': 100, 'random_state': 0}",0.641817,"{'penalty': 'l2', 'C': 100, 'random_state': 0}",0.69087,"{'penalty': 'l2', 'C': 100, 'random_state': 0}",0.706462,"{'penalty': 'l2', 'C': 100, 'random_state': 0}",0.702718,0.685467
0,"{'penalty': 'l2', 'C': 1, 'random_state': 0}",0.641565,"{'penalty': 'l2', 'C': 1, 'random_state': 0}",0.69494,"{'penalty': 'l2', 'C': 1, 'random_state': 0}",0.699281,"{'penalty': 'l2', 'C': 1, 'random_state': 0}",0.702749,0.684634


In [44]:
# Show the grid-search results for the plain feature set (per-year precision and mean).
logistic_results

Unnamed: 0,parameters_2020,precision_2020,parameters_2019,precision_2019,parameters_2018,precision_2018,parameters_2017,precision_2017,mean
0,"{'penalty': 'l2', 'C': 0.01, 'random_state': 0}",0.674018,"{'penalty': 'l2', 'C': 0.01, 'random_state': 0}",0.743653,"{'penalty': 'l2', 'C': 0.01, 'random_state': 0}",0.722716,"{'penalty': 'l2', 'C': 0.01, 'random_state': 0}",0.745098,0.721371
0,"{'penalty': 'l2', 'C': 0.1, 'random_state': 0}",0.654747,"{'penalty': 'l2', 'C': 0.1, 'random_state': 0}",0.723393,"{'penalty': 'l2', 'C': 0.1, 'random_state': 0}",0.705056,"{'penalty': 'l2', 'C': 0.1, 'random_state': 0}",0.73248,0.703919
0,"{'penalty': 'l2', 'C': 10, 'random_state': 0}",0.641465,"{'penalty': 'l2', 'C': 10, 'random_state': 0}",0.737089,"{'penalty': 'l2', 'C': 10, 'random_state': 0}",0.705755,"{'penalty': 'l2', 'C': 10, 'random_state': 0}",0.730567,0.703719
0,"{'penalty': 'l2', 'C': 100, 'random_state': 0}",0.651051,"{'penalty': 'l2', 'C': 100, 'random_state': 0}",0.714737,"{'penalty': 'l2', 'C': 100, 'random_state': 0}",0.705338,"{'penalty': 'l2', 'C': 100, 'random_state': 0}",0.725548,0.699168
0,"{'penalty': 'l2', 'C': 1, 'random_state': 0}",0.647702,"{'penalty': 'l2', 'C': 1, 'random_state': 0}",0.713605,"{'penalty': 'l2', 'C': 1, 'random_state': 0}",0.701998,"{'penalty': 'l2', 'C': 1, 'random_state': 0}",0.730083,0.698347
0,"{'penalty': 'none', 'C': 0.01, 'random_state': 0}",0.648004,"{'penalty': 'none', 'C': 0.01, 'random_state': 0}",0.71421,"{'penalty': 'none', 'C': 0.01, 'random_state': 0}",0.69799,"{'penalty': 'none', 'C': 0.01, 'random_state': 0}",0.726841,0.696761
0,"{'penalty': 'none', 'C': 0.1, 'random_state': 0}",0.648004,"{'penalty': 'none', 'C': 0.1, 'random_state': 0}",0.71421,"{'penalty': 'none', 'C': 0.1, 'random_state': 0}",0.69799,"{'penalty': 'none', 'C': 0.1, 'random_state': 0}",0.726841,0.696761
0,"{'penalty': 'none', 'C': 1, 'random_state': 0}",0.648004,"{'penalty': 'none', 'C': 1, 'random_state': 0}",0.71421,"{'penalty': 'none', 'C': 1, 'random_state': 0}",0.69799,"{'penalty': 'none', 'C': 1, 'random_state': 0}",0.726841,0.696761
0,"{'penalty': 'none', 'C': 10, 'random_state': 0}",0.648004,"{'penalty': 'none', 'C': 10, 'random_state': 0}",0.71421,"{'penalty': 'none', 'C': 10, 'random_state': 0}",0.69799,"{'penalty': 'none', 'C': 10, 'random_state': 0}",0.726841,0.696761
0,"{'penalty': 'none', 'C': 100, 'random_state': 0}",0.648004,"{'penalty': 'none', 'C': 100, 'random_state': 0}",0.71421,"{'penalty': 'none', 'C': 100, 'random_state': 0}",0.69799,"{'penalty': 'none', 'C': 100, 'random_state': 0}",0.726841,0.696761
