## Run Grid Search on Machine Learning Algorithms

In [80]:
import pandas as pd
import geopandas as gpd
import seaborn as sns
import numpy as np
import sklearn as sk
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_score
import datetime

In [37]:
# Execute the helper notebook inside this kernel.
# NOTE(review): split_X_y, convert_to_categorical and prep_data are called
# further down but are not defined in this notebook -- presumably they are
# provided by ml_helpers.ipynb; confirm.
%run ml_helpers.ipynb

               ID  Domestic  Beat  Year Month  Week  Day  Hour   Watch  PRCP  \
1080332  11567335     False   434  2019     1     2   13    15  Second  0.02   
908920   11371006     False   414  2018     6    22    2     6   First  0.32   
1031434  11497552     False   421  2018    11    44    4    12  Second  0.59   
981640   11431660     False   424  2018     8    35   30    19   Third  0.00   
1140544  11658653      True   124  2019     4    16   16    12  Second  0.00   
893810   11316372     False   532  2018     5    20   14    17   Third  1.48   
807857   11198797      True  1123  2018     1     2    8    20   Third  0.00   
884319   11304770     False   512  2018     5    18    2    11  Second  0.22   
859973   11271200     False   834  2018     3    13   29     4   First  0.07   
997510   11505905     False  1612  2018     9    38   20     9  Second  0.00   
994139   11448043     False  2424  2018     9    37   15    22   Third  0.00   
1005064  11462213     False  1013  2018 

Finished normalizing test data...
Finished one-hot encoding...
Finished one-hot encoding...
Finished standardizing...


In [3]:
#Import bucketized data

#Run convert_to_categorical
#data = ml_helpers.convert_to_categorical(data, [columns_to_convert])

#Run prep_data(df, y, num_years, year_col, vars_to_onehot):
#data_list = ....

#If bucketized data doesn't have spatial lag, make it have spatial lag

In [85]:
def grid_search_results(model_pipe, model_name, grid_params, scoring, X_train, y_train):
    '''
    Runs a 10-fold cross-validated grid search to find the best hyperparameters.

    Inputs:
        model_pipe: sklearn estimator or Pipeline object to tune
        model_name (string): a name for the model (not used by the search;
            kept so existing callers keep working)
        grid_params (dict): maps hyperparameters to potential options
        scoring (string or list of strings): the way(s) to score. When a list
            is given, its first entry is the metric used to refit the best
            model.
        X_train: training features passed to GridSearchCV.fit
        y_train: training labels passed to GridSearchCV.fit
    Outputs:
        results (pd.DataFrame): GridSearchCV.cv_results_ as a dataframe, one
            row per hyperparameter combination. When precision is one of the
            requested metrics, a "precision" column holding the mean
            cross-validated precision is added for convenience.
    '''
    # With a single metric (string or None) GridSearchCV expects refit=True;
    # indexing scoring[0] would otherwise pick the first *character*.
    single_metric = scoring is None or isinstance(scoring, str)
    refit = True if single_metric else scoring[0]

    gridsearch = GridSearchCV(model_pipe,
                              grid_params,
                              scoring=scoring,
                              refit=refit,
                              cv=10,
                              n_jobs=-1)
    gridsearch.fit(X_train, y_train)
    results = pd.DataFrame(gridsearch.cv_results_)

    # cv_results_ names the column "mean_test_score" for single-metric runs
    # and "mean_test_<metric>" for multi-metric runs.
    if single_metric:
        precision_col = "mean_test_score" if scoring == "precision" else None
    else:
        precision_col = "mean_test_precision"
    if precision_col is not None and precision_col in results.columns:
        results["precision"] = results[precision_col]

    return results

In [198]:
def search_best_results(X_train, y_train, X_test, y_test, models, grid):
    '''
    Returns the test-set precision of each model under each hyperparameter
    setting.

    Inputs:
        X_train, y_train: features and labels used to fit every candidate
        X_test, y_test: features and labels used to score every candidate
        models (dict): maps a model name to an sklearn estimator
        grid (dict): maps the same model names to a list of parameter dicts,
            each of which is applied with set_params
    Outputs:
        results (pd.DataFrame): one row per (model, parameter dict) with the
            columns "parameters" and "precision", indexed 0..n-1. Empty (but
            with both columns) when there is nothing to evaluate.
    '''
    # Local import: the notebook's import cell does not bring in clone.
    from sklearn.base import clone

    records = []
    for model_key, base_model in models.items():
        for params in grid[model_key]:
            # Work on a fresh copy so the caller's estimator is left untouched
            # and no parameter leaks from one candidate to the next.
            model = clone(base_model).set_params(**params)

            # Fit on the training set, then score predictions on the test set
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            precision = sk.metrics.precision_score(y_test, y_pred)

            records.append({"parameters": params, "precision": precision})

    # Build the frame once; DataFrame.append no longer exists in pandas >= 2.0
    # and appending row by row is quadratic.
    return pd.DataFrame(records, columns=["parameters", "precision"])

In [195]:
# Take the first entry of the prepared data and pull out its train/test frames.
train, test = data_small_ready[0][0], data_small_ready[0][1]

# Separate the features from the "was_arrested" target for each frame.
train_X, train_y = split_X_y(train, "was_arrested")
test_X, test_y = split_X_y(test, "was_arrested")

# Estimators to evaluate, keyed by name.
models = {'LogisticRegression': LogisticRegression()}

# Every penalty / C combination to try for each estimator name.
grid = {
    'LogisticRegression': [
        {'penalty': penalty, 'C': c_value, 'random_state': 0}
        for penalty in ('l2', 'none')
        for c_value in (0.01, 0.1, 1, 10, 100)
    ]
}

In [199]:
# Use the `models` and `grid` dicts defined in the previous cell; the
# upper-case MODELS / GRID names are not defined anywhere in this notebook.
res = search_best_results(train_X, train_y, test_X, test_y, models, grid)

In [200]:
# Display the precision obtained for each hyperparameter combination.
res

Unnamed: 0,parameters,precision
0,"{'penalty': 'l2', 'C': 0.01, 'random_state': 0}",0.0
0,"{'penalty': 'l2', 'C': 0.1, 'random_state': 0}",0.0
0,"{'penalty': 'l2', 'C': 1, 'random_state': 0}",0.6
0,"{'penalty': 'l2', 'C': 10, 'random_state': 0}",0.3
0,"{'penalty': 'l2', 'C': 100, 'random_state': 0}",0.25
0,"{'penalty': 'none', 'C': 0.01, 'random_state': 0}",0.241379
0,"{'penalty': 'none', 'C': 0.1, 'random_state': 0}",0.241379
0,"{'penalty': 'none', 'C': 1, 'random_state': 0}",0.241379
0,"{'penalty': 'none', 'C': 10, 'random_state': 0}",0.241379
0,"{'penalty': 'none', 'C': 100, 'random_state': 0}",0.241379


In [208]:
def average_grid_searches(model, model_name,
                          grid,
                          data_list, y, test_year):
    '''
    Evaluates every hyperparameter setting on each train/test split in
    data_list and averages the resulting precision across splits.

    model: sklearn model object
    model_name (string): a name for the model; must be a key of grid
    grid (dict): maps model_name to a list of hyperparameter dicts
    data_list (list): one entry per split, indexable as
        (train dataframe, test dataframe, year)
    y (string): name of the target column, passed to split_X_y
    test_year (int): The year we seek to predict (only used for the
        progress printout)

    Returns a pd.DataFrame with "parameters_<year>" and "precision_<year>"
    columns for every split plus a "mean" column (row-wise mean of the
    precision columns), sorted by "mean" in descending order.

    Raises ValueError when data_list is empty.

    Idea to concatenate rows of dfs came via the first answer on this stack exchange:
    https://stackoverflow.com/questions/44515888/compute-average-mean-across-dataframes-in-python-pandas

    Get mean of col rows help from here:
    https://stackoverflow.com/questions/33750326/compute-row-average-in-pandas

    grid example: {model_name: [{'penalty': x, 'C': y, 'random_state': 0}
                           for x in ('l2', 'none') \
                           for y in (0.01, 0.1, 1, 10, 100)]}

    '''
    print("test year is:", test_year)
    # Years 2016 .. test_year - 1; an empty list (rather than an IndexError)
    # when test_year is 2016 or earlier.
    target_years_in_train = list(range(2016, test_year))
    print(target_years_in_train)

    if len(data_list) == 0:
        raise ValueError("data_list is empty: no train/test splits to evaluate")

    models = {model_name: model}
    results_dfs = []

    for data_set in data_list:
        train_df = data_set[0]
        test_df = data_set[1]
        year = data_set[2]
        X_train, y_train = split_X_y(train_df, y)
        X_test, y_test = split_X_y(test_df, y)

        results_df = search_best_results(X_train, y_train, X_test, y_test, models, grid)
        # Suffix the columns with the split's year so they stay distinct
        # once the per-year frames are placed side by side.
        results_dfs.append(results_df.add_suffix("_" + str(year)))

    all_results = pd.concat(results_dfs, axis=1)

    # Average only the per-year precision columns; the parameter columns hold
    # dicts and must not take part in the mean.
    precision_cols = [col for col in all_results.columns
                      if str(col).startswith("precision_")]
    all_results["mean"] = all_results[precision_cols].astype(float).mean(axis=1)

    return all_results.sort_values("mean", ascending=False)
    

### Test the code:

In [89]:
#Get data set up
import warnings
# NOTE: this silences every warning for the rest of the session.
warnings.filterwarnings('ignore')

data = pd.read_csv("../intermediate_data/df_2015_to_present.csv")
# Target column: the "Arrest" flag as a float.
data["was_arrested"]=data["Arrest"].astype("float")
data = data.drop(["Arrest", "category_1", "category_2", "Domestic", "ID", "Week", "Day", "Hour"], axis = 1)
data = convert_to_categorical(data, ["Beat"])

# Work on a 0.1% sample; the fixed random_state makes the sample (and every
# result derived from it) reproducible across kernel restarts.
data_small = data.sample(frac=0.001, random_state=0)

# Argument order as noted earlier in this notebook:
# prep_data(df, y, num_years, year_col, vars_to_onehot)
data_small_ready = prep_data(data_small, "was_arrested",
                                        1, "Year", ["Year", "Month", "Beat", "Watch"])

         Beat  Year  Month   Watch  PRCP  SNOW  TMAX  TMIN  count_l_stops  \
1101838  2534  2019      2   First  0.13   2.1    29    25            0.0   
1235670  1221  2019      8   Third  0.00   0.0    85    65            0.0   
1129168  1511  2019      3   First  0.00   0.0    40    24            0.0   
1081262  1024  2019      1   Third  0.00   0.0    28    21            1.0   
1330131  2422  2019     12   First  0.85   0.0    58    48            1.0   
...       ...   ...    ...     ...   ...   ...   ...   ...            ...   
1082724  1821  2019      1   First  0.00   0.0    36    29            1.0   
1161321  1132  2019      5  Second  0.00   0.0    77    52            1.0   
1176965  1432  2019      6  Second  0.12   0.0    83    60            0.0   
1200226   412  2019      7   Third  0.38   0.0    86    71            0.0   
1331711  2521  2019     12   Third  0.06   0.9    30    26            0.0   

         count_bus_stops  ...  count_restaurants  count_bars  count_daycare

        Beat  Year  Month   Watch  PRCP  SNOW  TMAX  TMIN  count_l_stops  \
457706  2433  2016      9  Second  0.02   0.0    83    67            2.0   
512138  1824  2016     11   Third  1.08   0.0    53    42            1.0   
344040  1115  2016      4  Second  0.00   0.0    72    44            0.0   
505805  1032  2016     11  Second  0.00   0.0    40    29            0.0   
347632   915  2016      5  Second  0.47   0.0    51    44            1.0   
...      ...   ...    ...     ...   ...   ...   ...   ...            ...   
397219   712  2016      7   Third  0.00   0.0    78    62            0.0   
432923  2023  2016      8  Second  0.00   0.0    88    71            2.0   
351900  1732  2016      5  Second  0.00   0.0    71    49            2.0   
417877  1032  2016      7   Third  1.08   0.0    82    70            0.0   
311166   931  2016      3   First  0.00   0.0    53    37            0.0   

        count_bus_stops  ...  count_restaurants  count_bars  count_daycares  \
457706  

In [210]:
#Looks like it is working
# Candidate logistic-regression settings: every penalty paired with every C.
grid = {
    'LogisticRegression': [
        {'penalty': penalty, 'C': c_value, 'random_state': 0}
        for penalty in ('l2', 'none')
        for c_value in (0.01, 0.1, 1, 10, 100)
    ]
}

# Score every setting on each prepared split and average the precision.
results = average_grid_searches(
    LogisticRegression(),
    "LogisticRegression",
    grid,
    data_small_ready,
    "was_arrested",
    2020,
)

results

test year is: 2020
[2016, 2017, 2018, 2019]


Unnamed: 0,parameters_2019,precision_2019,parameters_2018,precision_2018,parameters_2017,precision_2017,parameters_2016,precision_2016,mean
0,"{'penalty': 'l2', 'C': 1, 'random_state': 0}",0.6,"{'penalty': 'l2', 'C': 1, 'random_state': 0}",0.0,"{'penalty': 'l2', 'C': 1, 'random_state': 0}",0.0,"{'penalty': 'l2', 'C': 1, 'random_state': 0}",0.333333,0.233333
0,"{'penalty': 'l2', 'C': 10, 'random_state': 0}",0.3,"{'penalty': 'l2', 'C': 10, 'random_state': 0}",0.074074,"{'penalty': 'l2', 'C': 10, 'random_state': 0}",0.25641,"{'penalty': 'l2', 'C': 10, 'random_state': 0}",0.21875,0.212309
0,"{'penalty': 'l2', 'C': 100, 'random_state': 0}",0.25,"{'penalty': 'l2', 'C': 100, 'random_state': 0}",0.203704,"{'penalty': 'l2', 'C': 100, 'random_state': 0}",0.217391,"{'penalty': 'l2', 'C': 100, 'random_state': 0}",0.16,0.207774
0,"{'penalty': 'none', 'C': 0.01, 'random_state': 0}",0.241379,"{'penalty': 'none', 'C': 0.01, 'random_state': 0}",0.206897,"{'penalty': 'none', 'C': 0.01, 'random_state': 0}",0.216495,"{'penalty': 'none', 'C': 0.01, 'random_state': 0}",0.152542,0.204328
0,"{'penalty': 'none', 'C': 0.1, 'random_state': 0}",0.241379,"{'penalty': 'none', 'C': 0.1, 'random_state': 0}",0.206897,"{'penalty': 'none', 'C': 0.1, 'random_state': 0}",0.216495,"{'penalty': 'none', 'C': 0.1, 'random_state': 0}",0.152542,0.204328
0,"{'penalty': 'none', 'C': 1, 'random_state': 0}",0.241379,"{'penalty': 'none', 'C': 1, 'random_state': 0}",0.206897,"{'penalty': 'none', 'C': 1, 'random_state': 0}",0.216495,"{'penalty': 'none', 'C': 1, 'random_state': 0}",0.152542,0.204328
0,"{'penalty': 'none', 'C': 10, 'random_state': 0}",0.241379,"{'penalty': 'none', 'C': 10, 'random_state': 0}",0.206897,"{'penalty': 'none', 'C': 10, 'random_state': 0}",0.216495,"{'penalty': 'none', 'C': 10, 'random_state': 0}",0.152542,0.204328
0,"{'penalty': 'none', 'C': 100, 'random_state': 0}",0.241379,"{'penalty': 'none', 'C': 100, 'random_state': 0}",0.206897,"{'penalty': 'none', 'C': 100, 'random_state': 0}",0.216495,"{'penalty': 'none', 'C': 100, 'random_state': 0}",0.152542,0.204328
0,"{'penalty': 'l2', 'C': 0.01, 'random_state': 0}",0.0,"{'penalty': 'l2', 'C': 0.01, 'random_state': 0}",0.0,"{'penalty': 'l2', 'C': 0.01, 'random_state': 0}",0.0,"{'penalty': 'l2', 'C': 0.01, 'random_state': 0}",0.0,0.0
0,"{'penalty': 'l2', 'C': 0.1, 'random_state': 0}",0.0,"{'penalty': 'l2', 'C': 0.1, 'random_state': 0}",0.0,"{'penalty': 'l2', 'C': 0.1, 'random_state': 0}",0.0,"{'penalty': 'l2', 'C': 0.1, 'random_state': 0}",0.0,0.0
