## Run Grid Search on Machine Learning Algorithms

In [1]:
import pandas as pd
import geopandas as gpd
import seaborn as sns
import numpy as np
import sklearn as sk
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_score
import datetime



In [2]:
%run ml_helpers.ipynb

               ID  Domestic  Beat  Year Month  Week  Day  Hour   Watch  PRCP  \
1043572  11516480      True   214  2018    11    47   22    10  Second  0.00   
876098   11293228     False  2023  2018     4    16   21    12  Second  0.00   
863567   11274692     False  1233  2018     4    14    3    12  Second  0.13   
1197848  11741531     False  2234  2019     6    26   30    21   Third  0.31   
885298   11306716     False  1512  2018     5    18    3    15  Second  0.20   
865080   11277050     False  1934  2018     4    14    5    17   Third  0.10   
886380   11307304     False   511  2018     5    18    4    23   Third  0.00   
906196   11331099     False  1024  2018     5    22   30     5   First  0.65   
1135195  11648738     False  1122  2019     4    15    8    18   Third  0.00   
1148539  11668949     False  1613  2019     4    17   27    18   Third  1.09   
1243215  11827082     False   824  2019     8    35   26    19   Third  0.56   
1073010  11554822     False   712  2019 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Year"]=df["Year"].astype("category")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value[:, i].tolist())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: h

In [3]:
#Import bucketized data

#Run convert_to_categorical
#data = ml_helpers.convert_to_categorical(data, [columns_to_convert])

#Run prep_data(df, y, num_years, year_col, vars_to_onehot):
#data_list = ....

#If bucketized data doesn't have spatial lag, make it have spatial lag

In [4]:
def search_best_results(X_train, y_train, X_test, y_test, models, grid):
    '''
    Returns accuracy measures for model with different hyperparameters 
    '''
    results = pd.DataFrame(columns = ["parameters", "precision"])
    
    for model_key in models.keys():         
        # Loop over parameters 
        for params in grid[model_key]: 
            
            # Create model 
            model = models[model_key]
            model.set_params(**params)
            
            # Fit model on training set 
            model.fit(X_train, y_train)
            
            # Predict on testing set 
            y_pred = model.predict(X_test)
            
            # Evaluate predictions 
            precision = sk.metrics.precision_score(y_test, y_pred),
            precision=precision[0]
            # Store results in your results data frame 
            results2 = pd.DataFrame([[params, precision]], 
                                    columns = ["parameters", 
                                                "precision"])
            results = results.append(results2)
    return results

In [1]:
def evaluate_results(X_train, y_train, X_test, y_test, models, grid):
    '''
    Returns precision and recall for model 
    This function should be used only for final data frame 
    '''
    results = pd.DataFrame(columns = ["parameters", "precision", "recall"])
    
    for model_key in models.keys():         
        # Loop over parameters 
        for params in grid[model_key]: 
            
            # Create model 
            model = models[model_key]
            model.set_params(**params)
            
            # Fit model on training set 
            model.fit(X_train, y_train)
            
            # Predict on testing set 
            y_pred = model.predict(X_test)
            
            # Evaluate predictions 
            precision = sk.metrics.precision_score(y_test, y_pred)
            recall = sk.metrics.recall_score(y_test, y_pred)
            precision=precision[0]
            recall = recall[0]
            # Store results in your results data frame 
            results2 = pd.DataFrame([[params, precision]], 
                                    columns = ["parameters", 
                                                "precision", "recall"])
            results = results.append(results2)
    return results

In [5]:
def average_grid_searches(model, model_name, 
                          grid, 
                          data_list, y, test_year):
    '''
    Model: sklearn model object
    model_name (string): a name for the model
    grid (dict): maps hyperparameters to potential options:
    scoring (list of strings): a list of the ways to score
    data (pd.DataFrame): a pandas dataframe with all the data
    test_year (int): The year we seek to predict
    
    Idea to concatenate rows of dfs came via the first answer on from this stack exchange: 
    https://stackoverflow.com/questions/44515888/compute-average-mean-across-dataframes-in-python-pandas
    
    Get mean of col rows help from here: 
    https://stackoverflow.com/questions/33750326/compute-row-average-in-pandas
    
    grid example: {model_name: [{'penalty': x, 'C': y, 'random_state': 0} 
                           for x in ('l2', 'none') \
                           for y in (0.01, 0.1, 1, 10, 100)]}
    
    '''
    print("test year is:", test_year)
    target_years_in_train = list(np.arange(2015,test_year))
    target_years_in_train.pop(0)
    print(target_years_in_train)
    results_dfs = []
    
    for data_set in data_list:
        train_df = data_set[0]
        test_df = data_set[1]
        year = data_set[2]
        X_train, y_train = split_X_y(train_df, y)
        X_test, y_test = split_X_y(test_df, y)
        
        
        models = {model_name: model}
        
        results_df = search_best_results(X_train, y_train, X_test, y_test, models, grid)
        results_df = results_df.add_suffix("_" + str(year))
        results_dfs.append(results_df)
    
    all_results = pd.concat(results_dfs, axis = 1)
    all_results["mean"] = all_results.mean(axis=1, numeric_only=True)
    #all_results = all_results[["params_" + str(test_year-1), "mean"]]
    all_results.sort_values("mean", inplace=True, ascending=False)
    #results_to_return = all_results["params", 'mean'].sort_values("mean")
    #return results_to_return

    #log_reg_results["mean"]=log_reg_results.mean(axis=1, numeric_only=True)


    
    return all_results

In [2]:
def results_by_year(model, model_name, 
                          grid, 
                          data_list, y, test_year):
    '''
    Model: sklearn model object
    model_name (string): a name for the model
    grid (dict): maps hyperparameters to potential options:
    scoring (list of strings): a list of the ways to score
    data (pd.DataFrame): a pandas dataframe with all the data
    test_year (int): The year we seek to predict
    
    Idea to concatenate rows of dfs came via the first answer on from this stack exchange: 
    https://stackoverflow.com/questions/44515888/compute-average-mean-across-dataframes-in-python-pandas
    
    Get mean of col rows help from here: 
    https://stackoverflow.com/questions/33750326/compute-row-average-in-pandas
    
    grid example: {model_name: [{'penalty': x, 'C': y, 'random_state': 0} 
                           for x in ('l2', 'none') \
                           for y in (0.01, 0.1, 1, 10, 100)]}
    
    '''
    print("test year is:", test_year)
    target_years_in_train = list(np.arange(2015,test_year))
    target_years_in_train.pop(0)
    print(target_years_in_train)
    results_dfs = []
    
    for data_set in data_list:
        train_df = data_set[0]
        test_df = data_set[1]
        year = data_set[2]
        X_train, y_train = split_X_y(train_df, y)
        X_test, y_test = split_X_y(test_df, y)
        
        
        models = {model_name: model}
        
        results_df = evaluate_results(X_train, y_train, X_test, y_test, models, grid)
        results_df = results_df.add_suffix("_" + str(year))
        results_dfs.append(results_df)
    
    all_results = pd.concat(results_dfs, axis = 1)

    return all_results

### Test the code:

In [6]:
#Get data set up
import warnings
warnings.filterwarnings('ignore')
data = pd.read_csv("../intermediate_data/df_2015_to_present.csv")
data["was_arrested"]=data["Arrest"].astype("float")
data = data.drop(["Arrest", "category_1", "category_2", "Domestic", "ID", "Week", "Day", "Hour"], axis = 1)
data = convert_to_categorical(data, ["Beat"])
data_small = data.sample(frac=0.001)

data_small_ready = prep_data(data_small, "was_arrested",
                                        1, "Year", ["Year", "Month", "Beat", "Watch"])

         Beat  Year  Month   Watch  PRCP  SNOW  TMAX  TMIN  count_l_stops  \
1186645   313  2019      6   Third  0.12   0.0    67    56            0.0   
1275009  1524  2019     10  Second  0.00   0.0    71    48            0.0   
1118146   232  2019      3   Third  0.49   0.0    64    40            0.0   
1224565   934  2019      8   First  0.00   0.0    89    64            0.0   
1128828  1132  2019      3   Third  0.40   0.0    44    31            1.0   
...       ...   ...    ...     ...   ...   ...   ...   ...            ...   
1279177   312  2019     10  Second  0.00   0.0    55    39            2.0   
1287425   834  2019     10   Third  0.00   0.0    50    41            0.0   
1077642   831  2019      1  Second  0.00   0.0    26    17            0.0   
1111012   611  2019      3  Second  0.00   0.0    28     6            0.0   
1298153   921  2019     11   First  0.27   2.9    37    14            0.0   

         count_bus_stops  ...  count_restaurants  count_bars  count_daycare

Finished one-hot encoding...
Finished standardizing...
Working on: [2015]
Have accessed train and test df...
Finished filling NAs with mean...
On to normalizing continuous...
Finished normalizing training data
Finished normalizing test data...
Finished one-hot encoding...
Finished one-hot encoding...
Finished standardizing...


In [7]:
train = data_small_ready[0][0]
test = data_small_ready[0][1]
train_X, train_y = split_X_y(train, "was_arrested")
test_X, test_y = split_X_y(test, "was_arrested")
models = {
    'LogisticRegression': LogisticRegression(), 
}

grid = {
    'LogisticRegression': [{'penalty': x, 'C': y, 'random_state': 0} 
                           for x in ('l2', 'none') \
                           for y in (0.01, 0.1, 1, 10, 100)]
}
res = search_best_results(train_X, train_y, test_X, test_y, models, grid)
res

Unnamed: 0,parameters,precision
0,"{'penalty': 'l2', 'C': 0.01, 'random_state': 0}",0.0
0,"{'penalty': 'l2', 'C': 0.1, 'random_state': 0}",0.0
0,"{'penalty': 'l2', 'C': 1, 'random_state': 0}",0.5
0,"{'penalty': 'l2', 'C': 10, 'random_state': 0}",0.333333
0,"{'penalty': 'l2', 'C': 100, 'random_state': 0}",0.298507
0,"{'penalty': 'none', 'C': 0.01, 'random_state': 0}",0.275
0,"{'penalty': 'none', 'C': 0.1, 'random_state': 0}",0.275
0,"{'penalty': 'none', 'C': 1, 'random_state': 0}",0.275
0,"{'penalty': 'none', 'C': 10, 'random_state': 0}",0.275
0,"{'penalty': 'none', 'C': 100, 'random_state': 0}",0.275
