In [6]:
#Helpers to Run ML Algorithms

In [7]:
import pandas as pd
import geopandas as gpd
import seaborn as sns
import numpy as np
import sklearn as sk
from sklearn import preprocessing
import datetime

In [193]:
def fill_missing_mean(df_train, df_to_fill):
    '''
    A simple function that fills missing values of continuous columns 
        with the column median
    Inputs:
        df_train (df): the training df. Function computes means from this value AND fills this 
        df_to_fill (df): the df whose continuous NAs should be filled 
    Returns:
        df_train (df): the training dataset
        df_to_fill: the testing dataset with its data filled by the training data median

    '''
    df_train_num = df_train.select_dtypes(include=[np.number])
    #means = df_train_num.mean().to_dict()
    mean_dict = {}
    
    
    
    for col in df_train_num.columns:
        mean_dict[col] = df_train[col].mean()
    
    df_train.fillna(value=mean_dict, inplace=True)
    df_to_fill.fillna(value=mean_dict, inplace=True)
    
    #df_train = df_train.fillna(value=means)
    #df_to_fill = df_to_fill.fillna(value=means)
    print("Finished filling NAs with mean...")
    return df_train, df_to_fill

In [188]:
#Testing above function

#df_train = pd.DataFrame({'value': [1, np.nan, np.nan, 2, 3, 1, 3, np.nan, 3], "value2":[1, 2, 3, 4, 5, np.nan, 7, 8, 9], 'Year': ['A','A', 'B','B','B','B', 'C','C','C']})
#df_to_fill = pd.DataFrame({'value': [1, np.nan, np.nan, 2, 3, 1, 3, np.nan, 3], "value2":[10, 20, 33, 43, 53, np.nan, 7, 8, 9], 'Year': ['A','A', 'B','B','B','B', 'C','C','C']})
#df_train, df_to_fill = fill_missing_mean(df_train, df_to_fill)
#df_to_fill
#df_train

In [198]:
def normalize_continuous(df, scaler = None):
    '''
    A simple function that normalizes the values of of continuous columns 
        using data from the training set
    Inputs:
        df (df): either the training or the testing df
        scaler: the scaler object. It will be None for training and exist for testing 
    Returns:
        df (df): the standardized df
        scaler: the scaler object
    '''
    df["Year"]=df["Year"].astype("category")
    if scaler is None: #Training case
        scaler = sk.preprocessing.StandardScaler() #Set up scaler
        df_num = df.select_dtypes(include=[np.number]) #find numeric columns
        df_num_scaled = scaler.fit_transform(df_num) #Normalize them
        df_num_cols = list(df_num.columns) 
        df.loc[:, df_num_cols] = df_num_scaled #Insert columns back into the main df 
        print("Finished normalizing training data")
    else: #Testing case
        df_num = df.select_dtypes(include=[np.number]) #find numeric columns
        df_num_scaled = scaler.transform(df_num) #Normalize them
        df_num_cols = list(df_num.columns) 
        df.loc[:, df_num_cols] = df_num_scaled #Insert columns back into the main df 
        print("Finished normalizing test data...")
    return df, scaler

In [133]:
def one_hot_encode(df, cat_vars): 
    '''
    A function to one-hot encode given categorical variables
    Inputs:
        df (df): a pandas dataframe
        cat_vars (list of strings): a list of the categorical variables to one-hot encode
    '''

    df = pd.get_dummies(df, columns = cat_vars)
    print("Finished one-hot encoding...")
    return df

In [134]:
def standardize_columns(train, test):
    '''
    A function to ensure that training and testing data have identical columns
    after one-hot encoding
    If a column is in training but not testing, adds a column of 0s to testing
    If column is in testing but not training, it is removed
    Inputs:
        train (df): the training df
        test (df): the testing df
    Outputs:
        train, test (df): the datasets with identical columns
    '''
    train_cols = list(train.columns)
    test_cols = list(test.columns)
    
    for tr_col in train_cols:
        if tr_col not in test_cols:
            test[tr_col] = 0
    
    for test_col in test_cols:
        if test_col not in train_cols:
            test = test.drop(test_col, axis=1)
    print("Finished standardizing...")
    return (train, test)

In [135]:
def split_train_test_by_year(df, y, num_years, year_col):
    '''
    
    Ultimately, this function divides the dataset up into smaller chunks with a one year test set and a num_years 
        years worth of training data in the num_years years just previous to the test year
        
    Ex: we are always predicting 2020 but remove that to see results later.
        If num_years = 2 and the data runs from 2015 to 2020, this function creates data with:
            
            set 1:
                Train: 2015 and 2016
                Test: 2017
            set 2:
                Train: 2016 2017
                Test: 2018
            set 3:
                Train: 2017 and 2018
                Test: 2019
    Inputs:
        df (df): the dataframe with both the X and y 
        y (string): the column name in the df that is the target
        num_years(int): the number of years to be included in the training set
        year_col (str): the name of the column representing the years in the df
    Output:
   train_test_data_list (list of tuples):
       each tuple contains:
           df_train (DataFrame): includes num_years worth of data before the test year
           df_test (DataFrame): includes 1 year, the test year for this set of data
           


    '''
    year_list = df[year_col].unique()
    year_list = sorted(list(year_list), reverse=True)
    year_list

    train_test_data_list = []
    for year in year_list:
        if year - num_years in year_list:
            df_test = df.loc[df[year_col]==year]
            df_train = df.loc[(df[year_col] < year) & (df[year_col] >= year-num_years)]
            train_test_data_list.append((df_train, df_test, 2, "Years"))
    train_test_data_list.pop(0)
    print("Finished splitting...")
    return train_test_data_list


In [115]:
data_list = split_train_test_by_year(data, "was_arrested", 2, "Year")   
for group in data_list:
    print(group[0]["Year"].unique(), group[1]["Year"].unique())



[2017 2018] [2019]
[2016 2017] [2018]
[2015 2016] [2017]


In [219]:
def prep_data(df, y, num_years, year_col, vars_to_onehot):
    '''
    Helper function that aggregates the above helpers to prepare for imputation in 
    an ML algorithm. Specifically this:
        Splits the training set and testing set based on year 
            using split_train_test_by_year
        One-hot encodes and standardizes the columns using 
            one_hot_encode and standardize_column
        Normalizes all continuous variables using normalize_continuous
    Inputs:
    df (pandas DataFrame): the dataframe with training and testing data, predictors and target
    y (string): the name of the target column
    num_years (int): the number of years to be included in the training set
    test_year (int): the year we seek to predict 2015_2020
    
    Outputs:
    train_df (df): a standardized training set with one-hot encorded categorical columns
    test_df (df): the test dataframe, again standardized as above
    test_y (Series): the test target
    '''
    df = convert_to_categorical(df, [y])
    cleaned_trained_test = []
    train_test_list = split_train_test_by_year(df, y, num_years, year_col)
    for year_set in train_test_list:
        print("Working on:", year_set[0]["Year"].unique())
        train_df = year_set[0]
        test_df = year_set[1]
        print("Have accessed train and test df...")
        train_df, test_df = fill_missing_mean(train_df, test_df)
        print("On to normalizing continuous...")
        train_df, scaler = normalize_continuous(train_df)
        test_df, doesnt_matter = normalize_continuous(test_df, scaler)
        train_df = one_hot_encode(train_df, vars_to_onehot)
        test_df = one_hot_encode(test_df, vars_to_onehot)
        train_df, test_df = standardize_columns(train_df, test_df)
        
        cleaned_trained_test.append((train_df, test_df))
        
    return cleaned_trained_test
    

In [None]:
def convert_to_categorical(df, cols_to_convert):
    '''
    Convert columns to categorical
    Inputs:
        df (pd.DataFrame): The Pandas df
        cols_to_convert (list of strings): The columns to convert    
    Output:
        df - the updated dataframe
    '''
    for col in cols_to_convert:
        df[col]=df[col].astype("category")

    return df


In [220]:
#Set up test data:
data = pd.read_csv("../intermediate_data/df_2015_to_present.csv")
data.head(5)
data["was_arrested"]=data["Arrest"].astype("float")
data = data.drop("Arrest", axis = 1)
data = convert_to_categorical(data, ["Beat", "Month", "Watch"])
data.head(5)

data_small = data.sample(frac=0.0001)


In [221]:
data_small.dtypes

ID                        int64
Domestic                   bool
Beat                   category
Year                      int64
Month                  category
Week                      int64
Day                       int64
Hour                      int64
Watch                  category
PRCP                    float64
SNOW                    float64
TMAX                      int64
TMIN                      int64
category_1               object
category_2               object
count_l_stops           float64
count_bus_stops         float64
count_metra_stops       float64
count_restaurants       float64
count_bars              float64
count_daycares          float64
count_entertainment     float64
count_businesses        float64
road_distance_ft        float64
TOTAL POPULATION        float64
dist_to_police          float64
dist_to_hospital        float64
was_arrested            float64
dtype: object

In [222]:
#Test the code! 
data_list  = prep_data(data_small, "was_arrested", 2, "Year", 
                                       ["Year"])

#df, y, num_years, year_col, vars_to_onehot

Finished splitting...
Working on: [2017 2018]
Have accessed train and test df...
Finished filling NAs with mean...
On to normalizing continuous...
Finished normalizing training data
Finished normalizing test data...
Finished one-hot encoding...
Finished one-hot encoding...
Finished standardizing...
Working on: [2017 2016]
Have accessed train and test df...
Finished filling NAs with mean...
On to normalizing continuous...
Finished normalizing training data
Finished normalizing test data...
Finished one-hot encoding...
Finished one-hot encoding...
Finished standardizing...
Working on: [2016 2015]
Have accessed train and test df...
Finished filling NAs with mean...
On to normalizing continuous...
Finished normalizing training data
Finished normalizing test data...
Finished one-hot encoding...
Finished one-hot encoding...
Finished standardizing...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Year"]=df["Year"].astype("category")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value[:, i].tolist())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: h

In [223]:
data_list[0][1]

Unnamed: 0,ID,Domestic,Beat,Month,Week,Day,Hour,Watch,PRCP,SNOW,...,count_daycares,count_entertainment,count_businesses,road_distance_ft,TOTAL POPULATION,dist_to_police,dist_to_hospital,was_arrested,Year_2017,Year_2018
1266959,2.986795,False,931,9,0.882523,1.308384,-0.504832,Second,6.39457,-0.243651,...,-0.976901,-0.351662,-0.324278,-0.581913,0.521906,0.545324,2.096726,0.0,0,0
1175189,2.35278,False,1933,6,-0.273223,-1.934337,1.268163,Third,0.078265,-0.243651,...,0.737528,1.169816,0.09527,-0.632122,0.138228,-0.65332,-1.175088,0.0,0,0
1220758,2.668971,False,1824,7,0.338642,1.557824,-0.343651,Second,3.095008,-0.243651,...,-0.29113,0.789446,0.419131,-0.839999,2.654006,-0.647994,-0.449448,0.0,0,0
1291022,3.155096,False,2422,10,1.222448,1.807264,0.623437,Third,0.486782,3.312234,...,-0.29113,-0.161478,-0.456767,-1.438801,-0.195353,-0.026213,1.12272,0.0,0,0
1157323,2.230835,False,2422,5,-0.477178,-0.936577,0.9458,Third,3.220705,-0.243651,...,-0.29113,-0.161478,-0.456767,-1.438801,-0.195353,-0.026213,1.12272,0.0,0,0
1220307,3.111489,False,235,7,0.270657,1.433104,1.106981,Third,-0.581648,-0.243651,...,0.051756,0.1238,-0.419964,0.042455,0.267236,0.331347,-1.044268,0.0,0,0
1186603,2.432158,False,123,6,-0.137253,-0.063537,0.462256,Third,-0.204555,-0.243651,...,0.394642,1.930555,0.684109,-0.817077,0.913477,-0.543149,0.725584,0.0,0,0
1082248,1.69994,False,412,1,-1.564939,-0.063537,-0.021288,Second,-0.581648,-0.243651,...,2.794843,-0.25657,0.176235,1.092722,0.072326,0.969483,-0.125924,0.0,0,0
1229351,2.733723,False,632,8,0.406628,-1.061297,1.268163,Third,-0.581648,-0.243651,...,0.051756,-0.351662,-0.272755,0.361174,-0.144176,0.565266,1.106458,0.0,0,0
1303471,3.259685,False,1134,11,1.426403,0.310623,0.139893,Second,-0.45595,-0.243651,...,-0.976901,-0.351662,-0.699663,-0.879495,-1.569586,-1.406763,-0.977314,0.0,0,0
