In [1]:
#Helpers to Run ML Algorithms

In [2]:
import pandas as pd
import geopandas as gpd
import seaborn as sns
import numpy as np
import sklearn as sk
from sklearn import preprocessing
import datetime



In [3]:
def fill_missing_mean(df_train, df_to_fill):
    '''
    A simple function that fills missing values of continuous columns 
        with the column median
    Inputs:
        df_train (df): the training df. Function computes means from this value AND fills this 
        df_to_fill (df): the df whose continuous NAs should be filled 
    Returns:
        df_train (df): the training dataset
        df_to_fill: the testing dataset with its data filled by the training data median

    '''
    df_train_num = df_train.select_dtypes(include=[np.number])
    #means = df_train_num.mean().to_dict()
    mean_dict = {}
    
    
    
    for col in df_train_num.columns:
        mean_dict[col] = df_train[col].mean()
    
    df_train.fillna(value=mean_dict, inplace=True)
    df_to_fill.fillna(value=mean_dict, inplace=True)
    
    #df_train = df_train.fillna(value=means)
    #df_to_fill = df_to_fill.fillna(value=means)
    print("Finished filling NAs with mean...")
    return df_train, df_to_fill

In [4]:
#Testing above function

#df_train = pd.DataFrame({'value': [1, np.nan, np.nan, 2, 3, 1, 3, np.nan, 3], "value2":[1, 2, 3, 4, 5, np.nan, 7, 8, 9], 'Year': ['A','A', 'B','B','B','B', 'C','C','C']})
#df_to_fill = pd.DataFrame({'value': [1, np.nan, np.nan, 2, 3, 1, 3, np.nan, 3], "value2":[10, 20, 33, 43, 53, np.nan, 7, 8, 9], 'Year': ['A','A', 'B','B','B','B', 'C','C','C']})
#df_train, df_to_fill = fill_missing_mean(df_train, df_to_fill)
#df_to_fill
#df_train

In [5]:
def normalize_continuous(df, scaler = None):
    '''
    A simple function that normalizes the values of of continuous columns 
        using data from the training set
    Inputs:
        df (df): either the training or the testing df
        scaler: the scaler object. It will be None for training and exist for testing 
    Returns:
        df (df): the standardized df
        scaler: the scaler object
    '''
    df["Year"]=df["Year"].astype("category")
    if scaler is None: #Training case
        scaler = sk.preprocessing.StandardScaler() #Set up scaler
        df_num = df.select_dtypes(include=[np.number]) #find numeric columns
        df_num_scaled = scaler.fit_transform(df_num) #Normalize them
        df_num_cols = list(df_num.columns) 
        df.loc[:, df_num_cols] = df_num_scaled #Insert columns back into the main df 
        print("Finished normalizing training data")
    else: #Testing case
        df_num = df.select_dtypes(include=[np.number]) #find numeric columns
        df_num_scaled = scaler.transform(df_num) #Normalize them
        df_num_cols = list(df_num.columns) 
        df.loc[:, df_num_cols] = df_num_scaled #Insert columns back into the main df 
        print("Finished normalizing test data...")
    return df, scaler

In [6]:
def one_hot_encode(df, cat_vars): 
    '''
    A function to one-hot encode given categorical variables
    Inputs:
        df (df): a pandas dataframe
        cat_vars (list of strings): a list of the categorical variables to one-hot encode
    '''

    df = pd.get_dummies(df, columns = cat_vars)
    print("Finished one-hot encoding...")
    return df

In [7]:
def standardize_columns(train, test):
    '''
    A function to ensure that training and testing data have identical columns
    after one-hot encoding
    If a column is in training but not testing, adds a column of 0s to testing
    If column is in testing but not training, it is removed
    Inputs:
        train (df): the training df
        test (df): the testing df
    Outputs:
        train, test (df): the datasets with identical columns
    '''
    train_cols = list(train.columns)
    test_cols = list(test.columns)
    
    for tr_col in train_cols:
        if tr_col not in test_cols:
            test[tr_col] = 0
    
    for test_col in test_cols:
        if test_col not in train_cols:
            test = test.drop(test_col, axis=1)
    print("Finished standardizing...")
    return (train, test)

In [8]:
def split_train_test_by_year(df, y, num_years, year_col):
    '''
    
    Ultimately, this function divides the dataset up into smaller chunks with a one year test set and a num_years 
        years worth of training data in the num_years years just previous to the test year
        
    Ex: we are always predicting 2020 but remove that to see results later.
        If num_years = 2 and the data runs from 2015 to 2020, this function creates data with:
            
            set 1:
                Train: 2015 and 2016
                Test: 2017
            set 2:
                Train: 2016 2017
                Test: 2018
            set 3:
                Train: 2017 and 2018
                Test: 2019
    Inputs:
        df (df): the dataframe with both the X and y 
        y (string): the column name in the df that is the target
        num_years(int): the number of years to be included in the training set
        year_col (str): the name of the column representing the years in the df
    Output:
   train_test_data_list (list of tuples):
       each tuple contains:
           df_train (DataFrame): includes num_years worth of data before the test year
           df_test (DataFrame): includes 1 year, the test year for this set of data
    '''
    year_list = df[year_col].unique()
    year_list = sorted(list(year_list), reverse=True)
    year_list

    train_test_data_list = []
    for year in year_list:
        if year - num_years in year_list:
            df_test = df.loc[df[year_col]==year]
            df_train = df.loc[(df[year_col] < year) & (df[year_col] >= year-num_years)]
            train_test_data_list.append((df_train, df_test, year))
            print(train_test_data_list[-1][0])
    train_test_data_list.pop(0)
    print("Finished splitting...")
    return train_test_data_list


In [25]:
def split_train_test_by_year_test(df, y, num_years, year_col):
    '''
    
    Ultimately, this function divides the dataset up into smaller chunks with a one year test set and a num_years 
        years worth of training data in the num_years years just previous to the test year
        
    Ex: we are always predicting 2020 but remove that to see results later.
        If num_years = 2 and the data runs from 2015 to 2020, this function creates data with:
            
            set 1:
                Train: 2015 and 2016
                Test: 2017
            set 2:
                Train: 2016 2017
                Test: 2018
            set 3:
                Train: 2017 and 2018
                Test: 2019
    Inputs:
        df (df): the dataframe with both the X and y 
        y (string): the column name in the df that is the target
        num_years(int): the number of years to be included in the training set
        year_col (str): the name of the column representing the years in the df
    Output:
   train_test_data_list (list of tuples):
       each tuple contains:
           df_train (DataFrame): includes num_years worth of data before the test year
           df_test (DataFrame): includes 1 year, the test year for this set of data
    '''
    year_list = df[year_col].unique()
    year_list = sorted(list(year_list), reverse=True)
    year_list

    train_test_data_list = []
    for year in year_list:
        if year - num_years in year_list:
            df_test = df.loc[df[year_col]==year]
            df_train = df.loc[(df[year_col] < year) & (df[year_col] >= year-num_years)]
            train_test_data_list.append((df_train, df_test, year))
            print(train_test_data_list[-1][0])
    print("Finished splitting...")
    return train_test_data_list

In [9]:
#Testing above;
#data_list = split_train_test_by_year(data, "was_arrested", 2, "Year")   
#for group in data_list:
#    print(group[0]["Year"].unique(), group[1]["Year"].unique())



In [18]:
def prep_data(df, y, num_years, year_col, vars_to_onehot):
    '''
    Helper function that aggregates the above helpers to prepare for imputation in 
    an ML algorithm. Specifically this:
        Splits the training set and testing set based on year 
            using split_train_test_by_year
        One-hot encodes and standardizes the columns using 
            one_hot_encode and standardize_column
        Normalizes all continuous variables using normalize_continuous
    Inputs:
    df (pandas DataFrame): the dataframe with training and testing data, predictors and target
    y (string): the name of the target column
    num_years (int): the number of years to be included in the training set
    test_year (int): the year we seek to predict 2015_2020
    
    Outputs:
    cleaned_train_test - a list of tuples. 
    The first tuple is the training dataframe and the second is the test for a given set of years
    '''
    df = convert_to_categorical(df, [y])
    cleaned_trained_test = []
    train_test_list = split_train_test_by_year_test(df, y, num_years, year_col)
    for year_set in train_test_list:
        print("Working on:", year_set[0]["Year"].unique())
        train_df = year_set[0]
        test_df = year_set[1]
        year = year_set[2]
        print("Have accessed train and test df...")
        train_df, test_df = fill_missing_mean(train_df, test_df)
        print("On to normalizing continuous...")
        train_df, scaler = normalize_continuous(train_df)
        test_df, doesnt_matter = normalize_continuous(test_df, scaler)
        train_df = one_hot_encode(train_df, vars_to_onehot)
        test_df = one_hot_encode(test_df, vars_to_onehot)
        train_df, test_df = standardize_columns(train_df, test_df)
        
        cleaned_trained_test.append((train_df, test_df, year))
        
    return cleaned_trained_test
    

In [38]:
def prep_data_test(df, y, num_years, year_col, vars_to_onehot):
    '''
    This function is identical to the above except that it does not remove the last set of data
    Thus, it should be used when computing final results of the best models
    '''
    df = convert_to_categorical(df, [y])
    cleaned_trained_test = []
    train_test_list = split_train_test_by_year_test(df, y, num_years, year_col)
    for year_set in train_test_list:
        print("Working on:", year_set[0]["Year"].unique())
        train_df = year_set[0]
        test_df = year_set[1]
        year = year_set[2]
        print("Have accessed train and test df...")
        train_df, test_df = fill_missing_mean(train_df, test_df)
        print("On to normalizing continuous...")
        train_df, scaler = normalize_continuous(train_df)
        test_df, doesnt_matter = normalize_continuous(test_df, scaler)
        train_df = one_hot_encode(train_df, vars_to_onehot)
        test_df = one_hot_encode(test_df, vars_to_onehot)
        train_df, test_df = standardize_columns(train_df, test_df)
        
        cleaned_trained_test.append((train_df, test_df, year))
        
    return cleaned_trained_test

In [19]:
def convert_to_categorical(df, cols_to_convert):
    '''
    Convert columns to categorical
    Inputs:
        df (pd.DataFrame): The Pandas df
        cols_to_convert (list of strings): The columns to convert    
    Output:
        df - the updated dataframe
    '''
    for col in cols_to_convert:
        df[col]=df[col].astype("category")

    return df


In [39]:
#Set up test data:
data = pd.read_csv("../intermediate_data/df_2015_to_present.csv")
#data.head(5)
data["was_arrested"]=data["Arrest"].astype("float")
data = data.drop("Arrest", axis = 1)
data = convert_to_categorical(data, ["Beat", "Month", "Watch"])
#data.head(5)

data_small = data.sample(frac=0.0001)
data_small

Unnamed: 0,ID,Domestic,Beat,Year,Month,Week,Day,Hour,Watch,PRCP,...,count_restaurants,count_bars,count_daycares,count_entertainment,count_businesses,road_distance_ft,TOTAL POPULATION,dist_to_police,dist_to_hospital,was_arrested
1437587,12097472,False,311,2020,7,27,5,16,Third,0.00,...,5.0,0.0,4.0,0.0,13.0,118279.695362,2869.786020,6964.436278,3119.927501,0.0
705858,11062326,False,2411,2017,8,33,20,14,Second,0.01,...,60.0,11.0,5.0,1.0,135.0,248031.980040,23472.617621,7919.169371,12814.863870,0.0
909526,11335425,True,324,2018,6,22,2,21,Third,0.32,...,23.0,1.0,4.0,1.0,49.0,238375.407163,7654.066578,4694.938466,2332.659722,0.0
39899,9983325,False,1711,2015,3,10,4,21,Third,0.00,...,67.0,13.0,9.0,2.0,212.0,373848.845834,17206.377123,8669.142958,9978.262290,0.0
539984,10811203,False,414,2017,1,2,9,13,Second,0.02,...,24.0,1.0,6.0,1.0,81.0,306431.456840,11696.405100,10134.881888,1840.066597,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135500,10147060,True,1011,2015,7,28,11,6,First,0.07,...,8.0,0.0,1.0,0.0,28.0,184361.538371,6950.898643,5806.339526,6067.398557,0.0
595061,10895628,False,832,2017,3,13,30,17,Third,2.19,...,28.0,0.0,3.0,0.0,53.0,164679.323684,8132.815409,7685.313880,2445.693889,0.0
1211868,11762459,False,113,2019,7,29,18,8,Second,1.25,...,91.0,23.0,1.0,8.0,212.0,46732.833914,1746.267203,6933.482574,6425.575978,0.0
67300,10029491,False,221,2015,4,15,12,16,Third,0.00,...,14.0,3.0,3.0,2.0,41.0,130491.672100,4211.570921,6909.922931,3920.519213,0.0


In [13]:
#data_small.dtypes

In [40]:
#Test the code! 
data_list  = prep_data_test(data_small, "was_arrested", 2, "Year", 
                                       ["Year"])

               ID  Domestic  Beat  Year Month  Week  Day  Hour   Watch  PRCP  \
909526   11335425      True   324  2018     6    22    2    21   Third  0.32   
1027435  11492599     False   411  2018    10    44   30    16   Third  0.56   
1287160  11873125     False   922  2019    10    43   25    15  Second  0.00   
1102172  11649184     False   624  2019     2     7   17    16   Third  0.13   
1182074  11718274     False   921  2019     6    24   10    17   Third  0.00   
992394   11454777     False  1633  2018     9    37   13    19   Third  0.00   
1308503  11904478     False   532  2019    11    48   26    18   Third  0.40   
883113   11300786     False  1134  2018     5    18    1     0   First  0.00   
1060238  11536643      True   421  2018    12    50   15    22   Third  0.00   
1097191  11591246      True  2522  2019     2     6   10     3   First  0.06   
1227632  11785007     False   823  2019     8    32    6    18   Third  0.27   
1155231  11678650     False   622  2019 

Finished filling NAs with mean...
On to normalizing continuous...
Finished normalizing training data
Finished normalizing test data...
Finished one-hot encoding...
Finished one-hot encoding...
Finished standardizing...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Year"]=df["Year"].astype("category")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value[:, i].tolist())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: h

In [41]:
data_list[0][1]

Unnamed: 0,ID,Domestic,Beat,Month,Week,Day,Hour,Watch,PRCP,SNOW,...,count_daycares,count_entertainment,count_businesses,road_distance_ft,TOTAL POPULATION,dist_to_police,dist_to_hospital,was_arrested,Year_2018,Year_2019
1437587,2.643287,False,311,7,-0.110808,-1.383191,0.262657,Third,-0.54549,-0.243592,...,0.520088,-0.40883,-1.017182,-1.068178,-1.07706,0.332178,-0.804307,0.0,0,0
1488744,3.039346,True,1011,9,0.692855,1.097532,-1.007334,Second,-0.54549,-0.243592,...,-0.780132,-0.40883,-0.809156,-0.038543,-0.341828,-0.009526,0.019926,0.0,0,0
1399519,2.369801,False,2514,5,-0.713556,-1.85571,-0.689836,Second,-0.54549,-0.243592,...,3.12053,0.14982,0.965995,0.789572,1.551995,-0.077936,-0.410961,0.0,0,0
1339188,1.913814,False,1931,1,-1.785107,-0.556283,-2.118575,First,-0.54549,-0.243592,...,1.386902,0.010157,1.132415,1.512339,0.508583,0.432421,0.284837,0.0,0,0
1357085,2.046678,True,621,2,-1.517219,-0.910673,-2.118575,First,-0.251353,0.209028,...,-0.346726,-0.40883,-0.531789,0.281366,-0.206444,-1.4912,1.053709,1.0,0,0
1468694,2.967707,False,1235,8,0.424967,0.861273,-0.848585,Second,-0.54549,-0.243592,...,-0.346726,0.568806,2.075464,2.956882,0.17632,-0.625342,0.499087,0.0,0,0
1407649,2.427568,False,2033,5,-0.51264,0.152495,-0.213589,Second,-0.54549,-0.243592,...,0.953495,0.568806,-0.115738,-0.955401,-0.525405,0.833017,-1.130851,0.0,0,0
1332204,1.862712,False,214,1,-1.852079,-1.85571,-1.642329,First,-0.54549,-0.243592,...,-0.346726,-0.129505,-0.531789,-0.107352,-0.369592,0.083106,0.225227,0.0,0,0
1448799,2.729171,False,334,7,0.090108,0.743143,1.056401,Third,-0.54549,-0.243592,...,0.953495,-0.40883,-0.43471,-0.842563,-0.302895,1.728723,-0.21918,0.0,0,0
1414862,2.539861,False,123,5,-0.445668,1.570051,1.21515,Third,-0.54549,-0.243592,...,0.953495,2.943066,1.742623,-0.6291,0.84192,-0.312993,0.753378,0.0,0,0


In [16]:
def split_X_y(data, y):
    '''
    Function that separates predictors from target
    Inputs:
        data (DF): dataframe
        y (string): the name of the data column
    Outputs:
        X_ - df with predictor data
        y_ - df with target data
    '''
    y_ = data[y]
    X_ = data.drop(y, axis=1)
    
    return X_, y_ 

In [17]:
#X_train, y_train = split_x_y(data, "Arrest")
#y_train