In [6]:
#Helpers to Run ML Algorithms

In [7]:
import pandas as pd
import geopandas as gpd
import seaborn as sns
import numpy as np
import sklearn as sk
from sklearn import preprocessing
import datetime

In [49]:
def fill_missing_mean(df_train, df_to_fill):
    '''
    df['value'] = df['value'].fillna(df.groupby('name')['value'].transform('mean'))

    A simple function that fills missing values of continuous columns 
        with the column median
    Inputs:
        df_train (df): the training df. Function computes means from this value AND fills this 
        df_to_fill (df): the df whose continuous NAs should be filled 
    Returns:
        df_train (df): the training dataset
        df_to_fill: the testing dataset with its data filled by the training data median

    '''
    df_train_num = df_train.select_dtypes(include=[np.number])
    #means = df_train_num.mean().to_dict()
    
    for col in df_train_num.columns:
        df_train[col] = df_train.groupby("Year").transform(lambda x: x.fillna(x.mean()))
        df_to_fill[col] = df_to_fill.groupby("Year").transform(lambda x: x.fillna(x.mean()))
    
    #df_train = df_train.fillna(value=means)
    #df_to_fill = df_to_fill.fillna(value=means)
    print("Finished filling NAs with mean...")
    return df_train, df_to_fill

In [53]:
#Testing above function
#df_train = pd.DataFrame({'value': [1, np.nan, np.nan, 2, 3, 1, 3, np.nan, 3], "value2":[1, 2, 3, 4, 5, np.nan, 7, 8, 9], 'Year': ['A','A', 'B','B','B','B', 'C','C','C']})
#df_to_fill = pd.DataFrame({'value': [1, np.nan, np.nan, 2, 3, 1, 3, np.nan, 3], 'Year': ['A','A', 'A','A','A','A', 'A','A','A']})
#df_train, df_to_fill = fill_missing_mean(df_train, df_to_fill)
#df_to_fill
#df_train

Finished filling NAs with mean...


Unnamed: 0,value,value2,Year
0,1.0,1.0,A
1,1.0,1.0,A
2,2.0,2.0,B
3,2.0,2.0,B
4,3.0,3.0,B
5,1.0,1.0,B
6,3.0,3.0,C
7,3.0,3.0,C
8,3.0,3.0,C


In [9]:
def normalize_continuous(df, scaler = None):
    '''
    A simple function that normalizes the values of of continuous columns 
        using data from the training set
    Inputs:
        df (df): either the training or the testing df
        scaler: the scaler object. It will be None for training and exist for testing 
    Returns:
        df (df): the standardized df
        scaler: the scaler object
    '''
    if scaler is None: #Training case
        scaler = sk.preprocessing.StandardScaler() #Set up scaler
        df_num = df.select_dtypes(include=[np.number]) #find numeric columns
        df_num_scaled = scaler.fit_transform(df_num) #Normalize them
        df_num_cols = list(df_num.columns) 
        df.loc[:, df_num_cols] = df_num_scaled #Insert columns back into the main df 
        print("Finished normalizing training data")
    else: #Testing case
        df_num = df.select_dtypes(include=[np.number]) #find numeric columns
        df_num_scaled = scaler.transform(df_num) #Normalize them
        df_num_cols = list(df_num.columns) 
        df.loc[:, df_num_cols] = df_num_scaled #Insert columns back into the main df 
        print("Finished normalizing test data...")
    return df, scaler

In [10]:
def one_hot_encode(df, cat_vars): 
    '''
    A function to one-hot encode given categorical variables
    Inputs:
        df (df): a pandas dataframe
        cat_vars (list of strings): a list of the categorical variables to one-hot encode
    '''

    df = pd.get_dummies(df, columns = cat_vars)
    print("finished one-hot encoding...")
    return df

In [11]:
def standardize_columns(train, test):
    '''
    A function to ensure that training and testing data have identical columns
    after one-hot encoding
    If a column is in training but not testing, adds a column of 0s to testing
    If column is in testing but not training, it is removed
    Inputs:
        train (df): the training df
        test (df): the testing df
    Outputs:
        train, test (df): the datasets with identical columns
    '''
    train_cols = list(train.columns)
    test_cols = list(test.columns)
    
    for tr_col in train_cols:
        if tr_col not in test_cols:
            test[tr_col] = 0
    
    for test_col in test_cols:
        if test_col not in train_cols:
            test = test.drop(test_col, axis=1)
    print("finished standardizing...")
    return (train, test)

In [25]:
def split_train_test_by_year(df, y, test_year, num_years):
    '''
    isin syntax from: https://www.kite.com/python/answers/how-to-filter-a-pandas-dataframe-with-a-list-by-%60in%60-or-%60not-in%60-in-python
    Inputs:
        df (df): the dataframe with both the X and y 
        y (string): the column name in the df that is the target
        test_year(int): the year we seek to predict in 2016-2020
        num_years(int): the number of years to be included in the training set

    Output:
    train_df (df): A training dataframe
    test_X (df): Testing df of predictors
    test_y (df): Testing target
    '''
    year_range = np.arange(test_year - num_years, test_year)
    train_filter = df.Year.isin(year_range)
    train_df = df[train_filter]
    train_X = train_df.drop(columns=[y])
    print(train_df.columns)
    train_y = train_df[y]
    
    test_df = df[df.Year==test_year]
    test_X = test_df.drop(columns = [y])
    test_y = test_df[y]
    print("finished splitting by year...")
    return train_X, train_y, test_X, test_y

#split_train_test_by_year(data, "Arrest", 2017, 2)
data["Arrest"]

0          False
1           True
2          False
3          False
4          False
           ...  
1542559    False
1542560    False
1542561    False
1542562    False
1542563    False
Name: Arrest, Length: 1542564, dtype: bool

In [27]:
def prep_data(df, y, test_year, num_years, vars_to_onehot):
    '''
    Helper function that aggregates the above helpers to prepare for imputation in 
    an ML algorithm. Specifically this:
        Splits the training set and testing set based on year 
            using split_train_test_by_year
        One-hot encodes and standardizes the columns using 
            one_hot_encode and standardize_column
        Normalizes all continuous variables using normalize_continuous
    Inputs:
    df (pandas DataFrame): the dataframe with training and testing data, predictors and target
    y (string): the name of the target column
    num_years (int): the number of years to be included in the training set
    test_year (int): the year we seek to predict 2015_2020
    
    Outputs:
    train_df (df): a standardized training set with one-hot encorded categorical columns
    test_df (df): the test dataframe, again standardized as above
    test_y (Series): the test target
    '''
    train_df, train_y, test_df, test_y = split_train_test_by_year(df, y, test_year, num_years)
    train_df, test_df = fill_missing_mean(train_df, test_df)
    train_df, scaler = normalize_continuous(train_df)
    test_df, doesnt_matter = normalize_continuous(test_df, scaler)
    train_df = one_hot_encode(train_df, vars_to_onehot)
    test_df = one_hot_encode(test_df, vars_to_onehot)
    train_df, test_df = standardize_columns(train_df, test_df)
    return train_df, train_y, test_df, test_y
    

In [28]:
data = pd.read_csv("../intermediate_data/df_2015_to_present.csv")
data.head(5)

Unnamed: 0,ID,Arrest,Domestic,Beat,Year,Month,Week,Day,Hour,Watch,...,count_metra_stops,count_restaurants,count_bars,count_daycares,count_entertainment,count_businesses,road_distance_ft,TOTAL POPULATION,dist_to_police,dist_to_hospital
0,10225520,False,False,411,2015,1,1,1,0,First,...,2.0,16.0,0.0,1.0,0.0,58.0,231176.656022,5470.665022,7180.695576,2783.222325
1,11028448,True,True,1532,2015,1,1,1,0,First,...,0.0,14.0,2.0,1.0,0.0,24.0,162950.259395,6459.881637,5701.676947,6619.369443
2,10225760,False,False,2024,2015,1,1,1,0,First,...,0.0,36.0,7.0,2.0,0.0,50.0,84982.042393,11195.685856,9224.617641,989.984955
3,11242929,False,False,223,2015,1,1,1,0,First,...,0.0,21.0,3.0,0.0,1.0,30.0,132102.433573,7269.595612,5686.009943,1509.833687
4,10229179,False,False,214,2015,1,1,1,0,First,...,0.0,11.0,2.0,2.0,2.0,48.0,179945.426889,6796.78719,6120.28893,6801.558784


In [19]:
def convert_to_categorical(df, cols_to_convert):
    '''
    Convert columns to categorical
    Inputs:
        df (pd.DataFrame): The Pandas df
        cols_to_convert (list of strings): The columns to convert    
    Output:
        df - the updated dataframe
    '''
    for col in cols_to_convert:
        df[col]=df[col].astype("category")

    return df

data = convert_to_categorical(data, ["Beat", "Year", "Month", "Watch"])
data["Arrest_target"]=np.where(data["Arrest"] is True, 1, 0)

In [30]:
#Test the code! 
train_X, train_y, test_X, test_y = prep_data(data, "Arrest", 2017, 2, 
                                       ["Year", "Month", "Week", "Beat"])

Index(['ID', 'Arrest', 'Domestic', 'Beat', 'Year', 'Month', 'Week', 'Day',
       'Hour', 'Watch', 'PRCP', 'SNOW', 'TMAX', 'TMIN', 'category_1',
       'category_2', 'count_l_stops', 'count_bus_stops', 'count_metra_stops',
       'count_restaurants', 'count_bars', 'count_daycares',
       'count_entertainment', 'count_businesses', 'road_distance_ft',
       'TOTAL POPULATION', 'dist_to_police', 'dist_to_hospital'],
      dtype='object')
finished splitting by year...
Finished filling NAs with mean...
Finished normalizing training data
Finished normalizing test data...
finished one-hot encoding...
finished one-hot encoding...
finished standardizing...


In [None]:
train_X.describe() #Looking fresh :) 