In [1]:
#Helpers to Run ML Algorithms

In [45]:
import pandas as pd
import geopandas as gpd
import seaborn as sns
import numpy as np
import sklearn as sk
from sklearn import preprocessing
import datetime

In [64]:
def fill_missing_mean(df_train, df_to_fill):
    '''
    A simple function that fills missing values of continuous columns 
        with the column median
    Inputs:
        df_train (df): the training df. Function computes means from this value AND fills this 
        df_to_fill (df): the df whose continuous NAs should be filled 
    Returns:
        df_train (df): the training dataset
        df_to_fill: the testing dataset with its data filled by the training data median

    '''
    df_train_num = df_train.select_dtypes(include=[np.number])
    means = df_train_num.mean().to_dict()
    
    df_train = df_train.fillna(value=means)
    df_to_fill = df_to_fill.fillna(value=means)
    print("Finished filling NAs with mean...")
    return df_train, df_to_fill

In [53]:
def normalize_continuous(df, scaler = None):
    '''
    A simple function that normalizes the values of of continuous columns 
        using data from the training set
    Inputs:
        df (df): either the training or the testing df
        scaler: the scaler object. It will be None for training and exist for testing 
    Returns:
        df (df): the standardized df
        scaler: the scaler object
    '''
    if scaler is None: #Training case
        scaler = sk.preprocessing.StandardScaler() #Set up scaler
        df_num = df.select_dtypes(include=[np.number]) #find numeric columns
        df_num_scaled = scaler.fit_transform(df_num) #Normalize them
        df_num_cols = list(df_num.columns) 
        df.loc[:, df_num_cols] = df_num_scaled #Insert columns back into the main df 
        print("Finished normalizing training data")
    else: #Testing case
        df_num = df.select_dtypes(include=[np.number]) #find numeric columns
        df_num_scaled = scaler.transform(df_num) #Normalize them
        df_num_cols = list(df_num.columns) 
        df.loc[:, df_num_cols] = df_num_scaled #Insert columns back into the main df 
        print("Finished normalizing test data...")
    return df, scaler

In [54]:
def one_hot_encode(df, cat_vars): 
    '''
    A function to one-hot encode given categorical variables
    Inputs:
        df (df): a pandas dataframe
        cat_vars (list of strings): a list of the categorical variables to one-hot encode
    '''

    df = pd.get_dummies(df, columns = cat_vars)
    print("finished one-hot encoding...")
    return df

In [55]:
def standardize_columns(train, test):
    '''
    A function to ensure that training and testing data have identical columns
    after one-hot encoding
    If a column is in training but not testing, adds a column of 0s to testing
    If column is in testing but not training, it is removed
    Inputs:
        train (df): the training df
        test (df): the testing df
    Outputs:
        train, test (df): the datasets with identical columns
    '''
    train_cols = list(train.columns)
    test_cols = list(test.columns)
    
    for tr_col in train_cols:
        if tr_col not in test_cols:
            test[tr_col] = 0
    
    for test_col in test_cols:
        if test_col not in train_cols:
            test = test.drop(test_col, axis=1)
    print("finished standardizing...")
    return (train, test)

In [56]:
def split_train_test_by_year(df, y, test_year, num_years):
    '''
    isin syntax from: https://www.kite.com/python/answers/how-to-filter-a-pandas-dataframe-with-a-list-by-%60in%60-or-%60not-in%60-in-python
    Inputs:
        df (df): the dataframe with both the X and y 
        y (string): the column name in the df that is the target
        test_year(int): the year we seek to predict in 2016-2020
        num_years(int): the number of years to be included in the training set

    Output:
    train_df (df): A training dataframe
    test_X (df): Testing df of predictors
    test_y (df): Testing target
    '''
    year_range = np.arange(test_year - num_years, test_year)
    train_filter = df.Year.isin(year_range)
    train_df = df[train_filter]
    train_X = train_df.drop(columns=[y])
    train_y = train_df[y]
    
    test_df = df[df.Year==test_year]
    test_X = test_df.drop(columns = [y])
    test_y = test_df[y]
    print("finished splitting by year...")
    return train_X, train_y, test_X, test_y

In [71]:
def prep_data(df, y, test_year, num_years, vars_to_onehot):
    '''
    Helper function that aggregates the above helpers to prepare for imputation in 
    an ML algorithm. Specifically this:
        Splits the training set and testing set based on year 
            using split_train_test_by_year
        One-hot encodes and standardizes the columns using 
            one_hot_encode and standardize_column
        Normalizes all continuous variables using normalize_continuous
    Inputs:
    df (pandas DataFrame): the dataframe with training and testing data, predictors and target
    y (string): the name of the target column
    num_years (int): the number of years to be included in the training set
    test_year (int): the year we seek to predict 2015_2020
    
    Outputs:
    train_df (df): a standardized training set with one-hot encorded categorical columns
    test_df (df): the test dataframe, again standardized as above
    test_y (Series): the test target
    '''
    train_df, train_y, test_df, test_y = split_train_test_by_year(df, y, test_year, num_years)
    train_df, test_df = standardize_columns(train_df, test_df)
    train_df, test_df = fill_missing_mean(train_df, test_df)
    train_df = one_hot_encode(train_df, vars_to_onehot)
    #test_df = one_hot_encode(test_df, vars_to_onehot)
    train_df, scaler = normalize_continuous(train_df)
    test_df, doesnt_matter = normalize_continuous(test_df, scaler)
    return train_df, train_y, test_df, test_y
    

In [7]:
data = pd.read_csv("../intermediate_data/df_2015_to_present.csv")

In [70]:
data.dtypes

ID                       int64
Arrest                    bool
Domestic                  bool
Beat                     int64
Year                     int64
Month                    int64
Week                     int64
Day                      int64
Hour                     int64
Watch                   object
PRCP                   float64
SNOW                   float64
TMAX                     int64
TMIN                     int64
category_1              object
category_2              object
count_l_stops          float64
count_bus_stops        float64
count_metra_stops      float64
count_restaurants      float64
count_bars             float64
count_daycares         float64
count_entertainment    float64
count_businesses       float64
road_distance_ft       float64
TOTAL POPULATION       float64
dist_to_police         float64
dist_to_hospital       float64
dtype: object

In [61]:
train_df, train_y, test_df, test_y = split_train_test_by_year(data, "Arrest", 2017, 2)



finished splitting by year...


In [72]:
train_X, train_y, test_X, test_y = prep_data(data, "Arrest", 2017, 2, 
                                       ["Year", "Month", "Week", "Beat"])

finished splitting by year...
finished standardizing...
Filling NAs with mean...
finished one-hot encoding
finished one-hot encoding
Finished normalizing training data


ValueError: X has 358 features, but this StandardScaler is expecting 360 features as input.

In [69]:
train_X.describe()

Unnamed: 0,ID,Day,Hour,PRCP,SNOW,TMAX,TMIN,count_l_stops,count_bus_stops,count_metra_stops,...,Beat_2521,Beat_2522,Beat_2523,Beat_2524,Beat_2525,Beat_2531,Beat_2532,Beat_2533,Beat_2534,Beat_2535
count,534090.0,534090.0,534090.0,534090.0,534090.0,534090.0,534090.0,534090.0,534090.0,534090.0,...,534090.0,534090.0,534090.0,534090.0,534090.0,534090.0,534090.0,534090.0,534090.0,534090.0
mean,-5.833968e-15,-2.426935e-14,1.807149e-14,-6.361288e-14,-1.527197e-14,-1.510259e-14,1.499653e-13,-8.963044e-14,-1.00528e-15,-7.794098e-15,...,2.239702e-15,3.640538e-15,-2.780333e-15,-5.654102e-15,2.783987e-15,-4.568257e-14,-7.289202e-15,4.937708e-15,2.271241e-14,-3.03576e-15
std,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,...,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001
min,-17.96652,-1.653279,-1.993694,-0.377997,-0.1358532,-2.747795,-2.873175,-0.5721857,-2.123496,-0.4499297,...,-0.0660085,-0.05551563,-0.05380998,-0.05063773,-0.04773053,-0.05908434,-0.06753623,-0.07763771,-0.07228266,-0.06146241
25%,-0.355726,-0.8633741,-0.6294804,-0.377997,-0.1358532,-0.7970072,-0.7117863,-0.5721857,-0.7038846,-0.4499297,...,-0.0660085,-0.05551563,-0.05380998,-0.05063773,-0.04773053,-0.05908434,-0.06753623,-0.07763771,-0.07228266,-0.06146241
50%,0.03738473,0.03937406,0.1284163,-0.377997,-0.1358532,0.1783866,0.02624904,-0.5721857,-0.3063934,-0.4499297,...,-0.0660085,-0.05551563,-0.05380998,-0.05063773,-0.04773053,-0.05908434,-0.06753623,-0.07763771,-0.07228266,-0.06146241
75%,0.4263588,0.8292787,0.886313,-0.2190015,-0.1358532,0.8611624,0.9224348,0.6704085,0.5453734,-0.4499297,...,-0.0660085,-0.05551563,-0.05380998,-0.05063773,-0.04773053,-0.05908434,-0.06753623,-0.07763771,-0.07228266,-0.06146241
max,3.472606,1.732027,1.49263,11.92825,24.25735,1.543938,1.555037,4.398191,3.441381,5.954221,...,15.14956,18.01294,18.58391,19.74812,20.95095,16.92496,14.80687,12.88034,13.83458,16.27011
