In [1]:
#Helpers to Run ML Algorithms

In [2]:
import pandas as pd
import geopandas as gpd
import seaborn as sns
import numpy as np
import sklearn as sk
from sklearn import preprocessing
import datetime



In [3]:
def fill_missing_mean(df_train, df_to_fill):
    '''
    A simple function that fills missing values of continuous columns 
        with the column median
    Inputs:
        df_train (df): the training df. Function computes means from this value AND fills this 
        df_to_fill (df): the df whose continuous NAs should be filled 
    Returns:
        df_train (df): the training dataset
        df_to_fill: the testing dataset with its data filled by the training data median

    '''
    df_train_num = df_train.select_dtypes(include=[np.number])
    #means = df_train_num.mean().to_dict()
    mean_dict = {}
    
    
    
    for col in df_train_num.columns:
        mean_dict[col] = df_train[col].mean()
    
    df_train.fillna(value=mean_dict, inplace=True)
    df_to_fill.fillna(value=mean_dict, inplace=True)
    
    #df_train = df_train.fillna(value=means)
    #df_to_fill = df_to_fill.fillna(value=means)
    print("Finished filling NAs with mean...")
    return df_train, df_to_fill

In [4]:
#Testing above function

#df_train = pd.DataFrame({'value': [1, np.nan, np.nan, 2, 3, 1, 3, np.nan, 3], "value2":[1, 2, 3, 4, 5, np.nan, 7, 8, 9], 'Year': ['A','A', 'B','B','B','B', 'C','C','C']})
#df_to_fill = pd.DataFrame({'value': [1, np.nan, np.nan, 2, 3, 1, 3, np.nan, 3], "value2":[10, 20, 33, 43, 53, np.nan, 7, 8, 9], 'Year': ['A','A', 'B','B','B','B', 'C','C','C']})
#df_train, df_to_fill = fill_missing_mean(df_train, df_to_fill)
#df_to_fill
#df_train

In [5]:
def normalize_continuous(df, scaler = None):
    '''
    A simple function that normalizes the values of of continuous columns 
        using data from the training set
    Inputs:
        df (df): either the training or the testing df
        scaler: the scaler object. It will be None for training and exist for testing 
    Returns:
        df (df): the standardized df
        scaler: the scaler object
    '''
    df["Year"]=df["Year"].astype("category")
    if scaler is None: #Training case
        scaler = sk.preprocessing.StandardScaler() #Set up scaler
        df_num = df.select_dtypes(include=[np.number]) #find numeric columns
        df_num_scaled = scaler.fit_transform(df_num) #Normalize them
        df_num_cols = list(df_num.columns) 
        df.loc[:, df_num_cols] = df_num_scaled #Insert columns back into the main df 
        print("Finished normalizing training data")
    else: #Testing case
        df_num = df.select_dtypes(include=[np.number]) #find numeric columns
        df_num_scaled = scaler.transform(df_num) #Normalize them
        df_num_cols = list(df_num.columns) 
        df.loc[:, df_num_cols] = df_num_scaled #Insert columns back into the main df 
        print("Finished normalizing test data...")
    return df, scaler

In [6]:
def one_hot_encode(df, cat_vars): 
    '''
    A function to one-hot encode given categorical variables
    Inputs:
        df (df): a pandas dataframe
        cat_vars (list of strings): a list of the categorical variables to one-hot encode
    '''

    df = pd.get_dummies(df, columns = cat_vars)
    print("Finished one-hot encoding...")
    return df

In [7]:
def standardize_columns(train, test):
    '''
    A function to ensure that training and testing data have identical columns
    after one-hot encoding
    If a column is in training but not testing, adds a column of 0s to testing
    If column is in testing but not training, it is removed
    Inputs:
        train (df): the training df
        test (df): the testing df
    Outputs:
        train, test (df): the datasets with identical columns
    '''
    train_cols = list(train.columns)
    test_cols = list(test.columns)
    
    for tr_col in train_cols:
        if tr_col not in test_cols:
            test[tr_col] = 0
    
    for test_col in test_cols:
        if test_col not in train_cols:
            test = test.drop(test_col, axis=1)
    print("Finished standardizing...")
    return (train, test)

In [8]:
def split_train_test_by_year(df, y, num_years, year_col):
    '''
    
    Ultimately, this function divides the dataset up into smaller chunks with a one year test set and a num_years 
        years worth of training data in the num_years years just previous to the test year
        
    Ex: we are always predicting 2020 but remove that to see results later.
        If num_years = 2 and the data runs from 2015 to 2020, this function creates data with:
            
            set 1:
                Train: 2015 and 2016
                Test: 2017
            set 2:
                Train: 2016 2017
                Test: 2018
            set 3:
                Train: 2017 and 2018
                Test: 2019
    Inputs:
        df (df): the dataframe with both the X and y 
        y (string): the column name in the df that is the target
        num_years(int): the number of years to be included in the training set
        year_col (str): the name of the column representing the years in the df
    Output:
   train_test_data_list (list of tuples):
       each tuple contains:
           df_train (DataFrame): includes num_years worth of data before the test year
           df_test (DataFrame): includes 1 year, the test year for this set of data
           


    '''
    year_list = df[year_col].unique()
    year_list = sorted(list(year_list), reverse=True)
    year_list

    train_test_data_list = []
    for year in year_list:
        if year - num_years in year_list:
            df_test = df.loc[df[year_col]==year]
            df_train = df.loc[(df[year_col] < year) & (df[year_col] >= year-num_years)]
            train_test_data_list.append((df_train, df_test, year))
            print(train_test_data_list[-1][0])
    train_test_data_list.pop(0)
    print("Finished splitting...")
    return train_test_data_list


In [9]:
#Testing above;
#data_list = split_train_test_by_year(data, "was_arrested", 2, "Year")   
#for group in data_list:
#    print(group[0]["Year"].unique(), group[1]["Year"].unique())



In [18]:
def prep_data(df, y, num_years, year_col, vars_to_onehot):
    '''
    Helper function that aggregates the above helpers to prepare for imputation in 
    an ML algorithm. Specifically this:
        Splits the training set and testing set based on year 
            using split_train_test_by_year
        One-hot encodes and standardizes the columns using 
            one_hot_encode and standardize_column
        Normalizes all continuous variables using normalize_continuous
    Inputs:
    df (pandas DataFrame): the dataframe with training and testing data, predictors and target
    y (string): the name of the target column
    num_years (int): the number of years to be included in the training set
    test_year (int): the year we seek to predict 2015_2020
    
    Outputs:
    cleaned_train_test - a list of tuples. 
    The first tuple is the training dataframe and the second is the test for a given set of years
    '''
    df = convert_to_categorical(df, [y])
    cleaned_trained_test = []
    train_test_list = split_train_test_by_year(df, y, num_years, year_col)
    for year_set in train_test_list:
        print("Working on:", year_set[0]["Year"].unique())
        train_df = year_set[0]
        test_df = year_set[1]
        year = year_set[2]
        print("Have accessed train and test df...")
        train_df, test_df = fill_missing_mean(train_df, test_df)
        print("On to normalizing continuous...")
        train_df, scaler = normalize_continuous(train_df)
        test_df, doesnt_matter = normalize_continuous(test_df, scaler)
        train_df = one_hot_encode(train_df, vars_to_onehot)
        test_df = one_hot_encode(test_df, vars_to_onehot)
        train_df, test_df = standardize_columns(train_df, test_df)
        
        cleaned_trained_test.append((train_df, test_df, year))
        
    return cleaned_trained_test
    

In [19]:
def convert_to_categorical(df, cols_to_convert):
    '''
    Convert columns to categorical
    Inputs:
        df (pd.DataFrame): The Pandas df
        cols_to_convert (list of strings): The columns to convert    
    Output:
        df - the updated dataframe
    '''
    for col in cols_to_convert:
        df[col]=df[col].astype("category")

    return df


In [20]:
#Set up test data:
data = pd.read_csv("../intermediate_data/df_2015_to_present.csv")
#data.head(5)
data["was_arrested"]=data["Arrest"].astype("float")
data = data.drop("Arrest", axis = 1)
data = convert_to_categorical(data, ["Beat", "Month", "Watch"])
#data.head(5)

data_small = data.sample(frac=0.0001)


In [13]:
#data_small.dtypes

In [24]:
#Test the code! 
data_list  = prep_data(data_small, "was_arrested", 2, "Year", 
                                       ["Year"])
data_list[0][2]


               ID  Domestic  Beat  Year Month  Week  Day  Hour   Watch  PRCP  \
853003   11260235     False   833  2018     3    11   18    23   Third  0.00   
1292781  11880291     False   421  2019    11    44    3     1   First  0.00   
1191542  11732161     False  1523  2019     6    25   23     1   First  0.24   
1174780  11707347     False   231  2019     6    22    1    11  Second  0.21   
1287166  11872286     False   833  2019    10    43   25    15  Second  0.00   
1096500  11590672     False  1722  2019     2     6    9     9  Second  0.00   
833448   11233986     False  1812  2018     2     7   17    20   Third  0.08   
1037501  11505974      True  1624  2018    11    46   13    11  Second  0.00   
1197856  11741570     False  1523  2019     6    26   30    21   Third  0.31   
841974   11249028     False   331  2018     3     9    2    17   Third  0.00   
990326   11449552     False  1832  2018     9    37   10    19   Third  0.00   
1016326  11478135      True  1135  2018 

Finished normalizing test data...
Finished one-hot encoding...
Finished one-hot encoding...
Finished standardizing...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Year"]=df["Year"].astype("category")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value[:, i].tolist())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: h

2019

In [15]:
#data_list[0][1]

In [16]:
def split_X_y(data, y):
    '''
    Function that separates predictors from target
    Inputs:
        data (DF): dataframe
        y (string): the name of the data column
    Outputs:
        X_ - df with predictor data
        y_ - df with target data
    '''
    y_ = data[y]
    X_ = data.drop(y, axis=1)
    
    return X_, y_ 

In [17]:
#X_train, y_train = split_x_y(data, "Arrest")
#y_train