In [1]:
#Helpers to Run ML Algorithms

In [2]:
import pandas as pd
import geopandas as gpd
import seaborn as sns
import numpy as np
import sklearn as sk
from sklearn import preprocessing
import datetime
from sklearn.decomposition import PCA



In [35]:
def fill_missing_mean(df_train, df_to_fill):
    '''
    A simple function that fills missing values of continuous columns 
        with the column median
    Inputs:
        df_train (df): the training df. Function computes means from this value AND fills this 
        df_to_fill (df): the df whose continuous NAs should be filled 
    Returns:
        df_train (df): the training dataset
        df_to_fill: the testing dataset with its data filled by the training data median

    '''
    df_train_num = df_train.select_dtypes(include=[np.number])
    #means = df_train_num.mean().to_dict()
    mean_dict = {}
    
    
    
    for col in df_train_num.columns:
        mean_dict[col] = df_train[col].mean()
    
    df_train.fillna(value=mean_dict, inplace=True)
    df_to_fill.fillna(value=mean_dict, inplace=True)
    
    #df_train = df_train.fillna(value=means)
    #df_to_fill = df_to_fill.fillna(value=means)
    print("Finished filling NAs with mean...")
    return df_train, df_to_fill

In [4]:
#Testing above function

#df_train = pd.DataFrame({'value': [1, np.nan, np.nan, 2, 3, 1, 3, np.nan, 3], "value2":[1, 2, 3, 4, 5, np.nan, 7, 8, 9], 'Year': ['A','A', 'B','B','B','B', 'C','C','C']})
#df_to_fill = pd.DataFrame({'value': [1, np.nan, np.nan, 2, 3, 1, 3, np.nan, 3], "value2":[10, 20, 33, 43, 53, np.nan, 7, 8, 9], 'Year': ['A','A', 'B','B','B','B', 'C','C','C']})
#df_train, df_to_fill = fill_missing_mean(df_train, df_to_fill)
#df_to_fill
#df_train

In [36]:
def normalize_continuous(df, scaler = None):
    '''
    A simple function that normalizes the values of of continuous columns 
        using data from the training set
    Inputs:
        df (df): either the training or the testing df
        scaler: the scaler object. It will be None for training and exist for testing 
    Returns:
        df (df): the standardized df
        scaler: the scaler object
    '''
    df["Year"]=df["Year"].astype("category")
    if scaler is None: #Training case
        scaler = sk.preprocessing.StandardScaler() #Set up scaler
        df_num = df.select_dtypes(include=[np.number]) #find numeric columns
        df_num_scaled = scaler.fit_transform(df_num) #Normalize them
        df_num_cols = list(df_num.columns) 
        df.loc[:, df_num_cols] = df_num_scaled #Insert columns back into the main df 
        print("Finished normalizing training data")
    else: #Testing case
        df_num = df.select_dtypes(include=[np.number]) #find numeric columns
        df_num_scaled = scaler.transform(df_num) #Normalize them
        df_num_cols = list(df_num.columns) 
        df.loc[:, df_num_cols] = df_num_scaled #Insert columns back into the main df 
        print("Finished normalizing test data...")
    return df, scaler

In [37]:
def one_hot_encode(df, cat_vars): 
    '''
    A function to one-hot encode given categorical variables
    Inputs:
        df (df): a pandas dataframe
        cat_vars (list of strings): a list of the categorical variables to one-hot encode
    '''

    df = pd.get_dummies(df, columns = cat_vars)
    print("Finished one-hot encoding...")
    return df

In [38]:
def standardize_columns(train, test):
    '''
    A function to ensure that training and testing data have identical columns
    after one-hot encoding
    If a column is in training but not testing, adds a column of 0s to testing
    If column is in testing but not training, it is removed
    Inputs:
        train (df): the training df
        test (df): the testing df
    Outputs:
        train, test (df): the datasets with identical columns
    '''
    train_cols = list(train.columns)
    test_cols = list(test.columns)
    
    for tr_col in train_cols:
        if tr_col not in test_cols:
            test[tr_col] = 0
    
    for test_col in test_cols:
        if test_col not in train_cols:
            test = test.drop(test_col, axis=1)
    print("Finished standardizing...")
    return (train, test)

In [39]:
def split_train_test_by_year(df, y, num_years, year_col):
    '''
    
    Ultimately, this function divides the dataset up into smaller chunks with a one year test set and a num_years 
        years worth of training data in the num_years years just previous to the test year
        
    Ex: we are always predicting 2020 but remove that to see results later.
        If num_years = 2 and the data runs from 2015 to 2020, this function creates data with:
            
            set 1:
                Train: 2015 and 2016
                Test: 2017
            set 2:
                Train: 2016 2017
                Test: 2018
            set 3:
                Train: 2017 and 2018
                Test: 2019
    Inputs:
        df (df): the dataframe with both the X and y 
        y (string): the column name in the df that is the target
        num_years(int): the number of years to be included in the training set
        year_col (str): the name of the column representing the years in the df
    Output:
   train_test_data_list (list of tuples):
       each tuple contains:
           df_train (DataFrame): includes num_years worth of data before the test year
           df_test (DataFrame): includes 1 year, the test year for this set of data
    '''
    year_list = df[year_col].unique()
    year_list = sorted(list(year_list), reverse=True)
    year_list

    train_test_data_list = []
    for year in year_list:
        if year - num_years in year_list:
            df_test = df.loc[df[year_col]==year]
            df_train = df.loc[(df[year_col] < year) & (df[year_col] >= year-num_years)]
            train_test_data_list.append((df_train, df_test, year))
            print(train_test_data_list[-1][0])
    train_test_data_list.pop(0)
    print("Finished splitting...")
    return train_test_data_list


In [40]:
def split_train_test_by_year_test(df, y, num_years, year_col):
    '''
    
    Ultimately, this function divides the dataset up into smaller chunks with a one year test set and a num_years 
        years worth of training data in the num_years years just previous to the test year
        
    Ex: we are always predicting 2020 but remove that to see results later.
        If num_years = 2 and the data runs from 2015 to 2020, this function creates data with:
            
            set 1:
                Train: 2015 and 2016
                Test: 2017
            set 2:
                Train: 2016 2017
                Test: 2018
            set 3:
                Train: 2017 and 2018
                Test: 2019
    Inputs:
        df (df): the dataframe with both the X and y 
        y (string): the column name in the df that is the target
        num_years(int): the number of years to be included in the training set
        year_col (str): the name of the column representing the years in the df
    Output:
   train_test_data_list (list of tuples):
       each tuple contains:
           df_train (DataFrame): includes num_years worth of data before the test year
           df_test (DataFrame): includes 1 year, the test year for this set of data
    '''
    year_list = df[year_col].unique()
    year_list = sorted(list(year_list), reverse=True)
    year_list

    train_test_data_list = []
    for year in year_list:
        if year - num_years in year_list:
            df_test = df.loc[df[year_col]==year]
            df_train = df.loc[(df[year_col] < year) & (df[year_col] >= year-num_years)]
            train_test_data_list.append((df_train, df_test, year))
            print(train_test_data_list[-1][0])
    print("Finished splitting...")
    return train_test_data_list

In [10]:
#Testing above;
#data_list = split_train_test_by_year(data, "was_arrested", 2, "Year")   
#for group in data_list:
#    print(group[0]["Year"].unique(), group[1]["Year"].unique())



In [41]:
def prep_data(df, y, num_years, year_col, vars_to_onehot, pca=None, columns_to_pca=None):
    '''
    Helper function that aggregates the above helpers to prepare for imputation in 
    an ML algorithm. Specifically this:
        Splits the training set and testing set based on year 
            using split_train_test_by_year
        One-hot encodes and standardizes the columns using 
            one_hot_encode and standardize_column
        Normalizes all continuous variables using normalize_continuous
    Inputs:
    df (pandas DataFrame): the dataframe with training and testing data, predictors and target
    y (string): the name of the target column
    num_years (int): the number of years to be included in the training set
    test_year (int): the year we seek to predict 2015_2020
    
    Outputs:
    cleaned_train_test - a list of tuples. 
    The first tuple is the training dataframe and the second is the test for a given set of years
    '''
    df = convert_to_categorical(df, [y])
    cleaned_trained_test = []
    train_test_list = split_train_test_by_year_test(df, y, num_years, year_col)
    for year_set in train_test_list:
        print("Working on:", year_set[0]["Year"].unique())
        train_df = year_set[0]
        test_df = year_set[1]
        year = year_set[2]
        print("Have accessed train and test df...")
        train_df, test_df = fill_missing_mean(train_df, test_df)
        print("On to normalizing continuous...")
        train_df, scaler = normalize_continuous(train_df)
        test_df, doesnt_matter = normalize_continuous(test_df, scaler)
        train_df = one_hot_encode(train_df, vars_to_onehot)
        test_df = one_hot_encode(test_df, vars_to_onehot)
        train_df, test_df = standardize_columns(train_df, test_df)
        
        if pca is not None:
            train_df, test_df = conduct_pca(train_df, test_df, columns_to_pca)
        
        cleaned_trained_test.append((train_df, test_df, year))
        
    return cleaned_trained_test
    

In [42]:
def prep_data_test(df, y, num_years, year_col, vars_to_onehot):
    '''
    This function is identical to the above except that it does not remove the last set of data
    Thus, it should be used when computing final results of the best models
    '''
    df = convert_to_categorical(df, [y])
    cleaned_trained_test = []
    train_test_list = split_train_test_by_year_test(df, y, num_years, year_col)
    for year_set in train_test_list:
        print("Working on:", year_set[0]["Year"].unique())
        train_df = year_set[0]
        test_df = year_set[1]
        year = year_set[2]
        print("Have accessed train and test df...")
        train_df, test_df = fill_missing_mean(train_df, test_df)
        print("On to normalizing continuous...")
        train_df, scaler = normalize_continuous(train_df)
        test_df, doesnt_matter = normalize_continuous(test_df, scaler)
        train_df = one_hot_encode(train_df, vars_to_onehot)
        test_df = one_hot_encode(test_df, vars_to_onehot)
        train_df, test_df = standardize_columns(train_df, test_df)
        
        cleaned_trained_test.append((train_df, test_df, year))
        
    return cleaned_trained_test

In [73]:
def conduct_pca(train_df, test_df, columns_to_pca):
    '''
    Function that performs pca on a set of specified columns
    Inputs:
        train_df (df): training dataframe
        test_df (df): testing dataframe
        columns_to_pca (list of strings): list of column names to perform pca
                                          Note that these should not also be one-hotencoded
        num_feature (int): Number of components to return
    Outputs:
        train_df(df): df with all of the columns in columns_to_pca combined to 2 cols
        test_df(df):  df with all of the columns in columns_to_pca combined to 2 cols
    '''
    pca = PCA(n_components=2)
    pca.fit(train_df[columns_to_pca])
    
    train_cols = pca.transform(train_df[columns_to_pca])
    train_pca = pd.DataFrame(data=train_cols,
                  columns = ["principal component 1",
                             "principal component 2"])
    train_df.drop(columns = columns_to_pca, inplace=True)
    
    print(train_pca["principal component 1"].head(5))
    print(train_pca.shape, train_df.shape)
    
    
    train_df["PC 1"] = train_pca["principal component 1"].values
    train_df["PC 2"] = train_pca["principal component 2"].values


    
    test_cols = pca.transform(test_df[columns_to_pca])
    test_pca = pd.DataFrame(data=test_cols,
                  columns = ["principal component 1",
                             "principal component 2"])
    test_df.drop(columns = columns_to_pca, inplace=True)
    test_df["PC 1"] = test_pca["principal component 1"].values
    test_df["PC 2"] = test_pca["principal component 2"].values
    
    
    return train_df, test_df

In [44]:
def convert_to_categorical(df, cols_to_convert):
    '''
    Convert columns to categorical
    Inputs:
        df (pd.DataFrame): The Pandas df
        cols_to_convert (list of strings): The columns to convert    
    Output:
        df - the updated dataframe
    '''
    for col in cols_to_convert:
        df[col]=df[col].astype("category")

    return df


In [45]:
#Set up test data:
data = pd.read_csv("../intermediate_data/high_crime_labeled.csv")
#data.head(5)
#data["was_arrested"]=data["Arrest"].astype("float")
#data = data.drop("Arrest", axis = 1)
data = convert_to_categorical(data, ["Beat", "Month", "Watch"])
#data.head(5)

#data_small = data.sample(frac=0.0001)
data

Unnamed: 0.1,Unnamed: 0,beat,beat_num,district,sector,Year,Month,Watch,Beat,Crimes,...,count_restaurants,count_bars,count_daycares,count_entertainment,count_businesses,road_distance_ft,TOTAL POPULATION,dist_to_police,dist_to_hospital,high_crime_geog_lag
0,0,1,1713,17,1,2015,1,Third,1713,16,...,63.0,9.0,6.0,1.0,92.0,169276.405792,13283.675264,5454.068890,3258.006066,0.155378
1,1,1,1713,17,1,2015,1,Second,1713,20,...,63.0,9.0,6.0,1.0,92.0,169276.405792,13283.675264,5454.068890,3258.006066,0.155378
2,2,1,1713,17,1,2015,1,First,1713,14,...,63.0,9.0,6.0,1.0,92.0,169276.405792,13283.675264,5454.068890,3258.006066,0.155378
3,3,1,1713,17,1,2015,2,Second,1713,12,...,63.0,9.0,6.0,1.0,92.0,169276.405792,13283.675264,5454.068890,3258.006066,0.155378
4,4,1,1713,17,1,2015,2,Third,1713,12,...,63.0,9.0,6.0,1.0,92.0,169276.405792,13283.675264,5454.068890,3258.006066,0.155378
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59088,59088,1,312,3,1,2020,11,Third,312,35,...,14.0,1.0,3.0,1.0,38.0,200633.844449,4144.689546,4843.037238,4374.482421,0.181185
59089,59089,1,312,3,1,2020,11,First,312,16,...,14.0,1.0,3.0,1.0,38.0,200633.844449,4144.689546,4843.037238,4374.482421,0.181185
59090,59090,1,312,3,1,2020,12,Third,312,29,...,14.0,1.0,3.0,1.0,38.0,200633.844449,4144.689546,4843.037238,4374.482421,0.181185
59091,59091,1,312,3,1,2020,12,First,312,19,...,14.0,1.0,3.0,1.0,38.0,200633.844449,4144.689546,4843.037238,4374.482421,0.181185


In [32]:
data.dtypes

Unnamed: 0                int64
ID                        int64
Arrest                     bool
Domestic                   bool
Beat                   category
Year                      int64
Month                  category
Week                      int64
Day                       int64
Hour                      int64
Watch                  category
PRCP                    float64
SNOW                    float64
TMAX                      int64
TMIN                      int64
category_1               object
category_2               object
count_l_stops           float64
count_bus_stops         float64
count_metra_stops       float64
count_restaurants       float64
count_bars              float64
count_daycares          float64
count_entertainment     float64
count_businesses        float64
road_distance_ft        float64
TOTAL POPULATION        float64
dist_to_police          float64
dist_to_hospital        float64
dtype: object

In [82]:
#Test the code! 
#data_list  = prep_data(data, "high_crime", 2, "Year",  ["Year", "beat_num", "district", "sector", "Month"], pca="conduct", 
#                      columns_to_pca = ["TOTAL POPULATION", 
#                       "dist_to_police", "dist_to_hospital", "count_l_stops", "count_bus_stops", 
#                                        "count_metra_stops", "count_restaurants", "count_bars", 
#                                        "count_daycares", "count_entertainment", "count_businesses", 
#                                        "road_distance_ft"])


       Unnamed: 0  beat  beat_num  district  sector  Year Month   Watch  Beat  \
29557         108     1      1713        17       1  2018     1   First  1713   
29558         109     1      1713        17       1  2018     1  Second  1713   
29559         110     1      1713        17       1  2018     1   Third  1713   
29560         111     1      1713        17       1  2018     2   Third  1713   
29561         112     1      1713        17       1  2018     2   First  1713   
...           ...   ...       ...       ...     ...   ...   ...     ...   ...   
49256       59052     1       312         3       1  2019    11  Second   312   
49257       59053     1       312         3       1  2019    11   Third   312   
49258       59054     1       312         3       1  2019    12  Second   312   
49259       59055     1       312         3       1  2019    12   First   312   
49260       59056     1       312         3       1  2019    12   Third   312   

       Crimes  ...  count_r

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Year"]=df["Year"].astype("category")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value[:, i].tolist())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: h

Finished normalizing training data
Finished normalizing test data...
Finished one-hot encoding...
Finished one-hot encoding...
Finished standardizing...
0    0.854452
1    0.854452
2    0.854452
3    0.854452
4    0.854452
Name: principal component 1, dtype: float64
(19704, 2) (19704, 328)
Working on: [2017 2018]
Have accessed train and test df...
Finished filling NAs with mean...
On to normalizing continuous...
Finished normalizing training data
Finished normalizing test data...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Year"]=df["Year"].astype("category")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value[:, i].tolist())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: h

Finished one-hot encoding...
Finished one-hot encoding...
Finished standardizing...
0    0.852368
1    0.852368
2    0.852368
3    0.852368
4    0.852368
Name: principal component 1, dtype: float64
(19695, 2) (19695, 328)
Working on: [2016 2017]
Have accessed train and test df...
Finished filling NAs with mean...
On to normalizing continuous...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Year"]=df["Year"].astype("category")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value[:, i].tolist())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: h

Finished normalizing training data
Finished normalizing test data...
Finished one-hot encoding...
Finished one-hot encoding...
Finished standardizing...
0    0.85269
1    0.85269
2    0.85269
3    0.85269
4    0.85269
Name: principal component 1, dtype: float64
(19696, 2) (19696, 328)
Working on: [2015 2016]
Have accessed train and test df...
Finished filling NAs with mean...
On to normalizing continuous...
Finished normalizing training data


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Year"]=df["Year"].astype("category")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value[:, i].tolist())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: h

Finished normalizing test data...
Finished one-hot encoding...
Finished one-hot encoding...
Finished standardizing...
0    0.856576
1    0.856576
2    0.856576
3    0.856576
4    0.856576
Name: principal component 1, dtype: float64
(19712, 2) (19712, 328)


In [84]:
#data_list[0][0]

Unnamed: 0.1,Unnamed: 0,beat,Watch,Beat,Crimes,Serious,Arrest,Domestic,TMAX,TMIN,...,Month_5,Month_6,Month_7,Month_8,Month_9,Month_10,Month_11,Month_12,PC 1,PC 2
29557,-1.727009,-1.167287,First,1713,-1.145619,-1.254018,-0.732808,-0.644941,-1.376112,-1.389057,...,0,0,0,0,0,0,0,0,0.854452,-1.187052
29558,-1.726950,-1.167287,Second,1713,-0.659307,-0.620498,-0.412349,-0.112728,-1.376112,-1.389057,...,0,0,0,0,0,0,0,0,0.854452,-1.187052
29559,-1.726892,-1.167287,Third,1713,-0.172995,-0.348990,-0.252119,-0.112728,-1.376112,-1.389057,...,0,0,0,0,0,0,0,0,0.854452,-1.187052
29560,-1.726833,-1.167287,Third,1713,-0.659307,-0.892007,-0.252119,-0.378835,-1.034854,-1.191190,...,0,0,0,0,0,0,0,0,0.854452,-1.187052
29561,-1.726774,-1.167287,First,1713,-1.145619,-0.801504,-0.893037,-0.644941,-1.034854,-1.191190,...,0,0,0,0,0,0,0,0,0.854452,-1.187052
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49256,1.727721,-1.167287,Second,312,0.981997,-0.167984,0.388799,1.750018,-0.829309,-0.790072,...,0,0,0,0,0,0,1,0,-0.800704,-1.360331
49257,1.727780,-1.167287,Third,312,0.070162,-0.167984,0.068340,0.951698,-0.829309,-0.790072,...,0,0,0,0,0,0,1,0,-0.800704,-1.360331
49258,1.727838,-1.167287,Second,312,0.374107,0.465536,-0.893037,0.951698,-0.886981,-0.889857,...,0,0,0,0,0,0,0,1,-0.800704,-1.360331
49259,1.727897,-1.167287,First,312,-0.476940,-0.711001,-0.252119,0.951698,-0.886981,-0.889857,...,0,0,0,0,0,0,0,1,-0.800704,-1.360331


In [18]:
def split_X_y(data, y):
    '''
    Function that separates predictors from target
    Inputs:
        data (DF): dataframe
        y (string): the name of the data column
    Outputs:
        X_ - df with predictor data
        y_ - df with target data
    '''
    y_ = data[y]
    X_ = data.drop(y, axis=1)
    
    return X_, y_ 

In [19]:
#X_train, y_train = split_x_y(data, "Arrest")
#y_train