## Run Grid Search on Machine Learning Algorithms

In [None]:
import pandas as pd
import geopandas as gpd
import seaborn as sns
import numpy as np
import sklearn as sk
from sklearn import preprocessing
import datetime

In [7]:
%run ml_helpers.ipynb

In [None]:
#Import bucketized data:
#data = ml_helpers.convert_to_categorical(data, [columns_to_convert])
#If the bucketized data does not already include spatial lag, add it

In [None]:
def grid_search_results(model, model_name, grid_params, scoring, X_train, y_train):
    '''
    Runs a 10-fold cross-validated grid search and reports the mean F1 score
    of every hyperparameter combination.

    Inputs:
        model: sklearn model object
        model_name (string): a name for the model (not used here; kept so
            existing callers keep working)
        grid_params (dict): maps hyperparameters to potential options
        scoring (list of strings): a list of the ways to score; it must
            include "f1" because the "mean_test_f1" column is what is
            returned. A single string is accepted and treated as a
            one-element list.
        X_train: training features handed to GridSearchCV.fit
        y_train: training labels handed to GridSearchCV.fit
    Outputs:
        results (pd.DataFrame): one row per hyperparameter combination with
            the columns "params" and "mean_test_f1"
    '''
    # Imported locally so the function does not depend on another cell
    # having brought GridSearchCV into the namespace.
    from sklearn.model_selection import GridSearchCV

    # A bare string scorer would make sklearn name the result column
    # "mean_test_score"; wrapping it keeps the "mean_test_<name>" naming.
    if isinstance(scoring, str):
        scoring = [scoring]

    # refit=False: with several scorers GridSearchCV cannot pick a single
    # "best" model on its own and refuses the default refit=True. Only
    # cv_results_ is read below, so no refit is needed.
    gridsearch = GridSearchCV(model,
                              grid_params,
                              scoring=scoring,
                              cv=10,
                              n_jobs=-1,
                              refit=False)
    gridsearch.fit(X_train, y_train)

    results = pd.DataFrame(gridsearch.cv_results_)
    return results[["params", "mean_test_f1"]]

In [58]:
def average_grid_searches(model, model_name, 
                          grid_params, scoring, 
                          data, test_year, 
                          num_years_in_train, vars_to_onehot):
    '''
    Runs one grid search per target year before test_year and averages the
    cross-validated F1 score of each hyperparameter combination over those
    years.

    Inputs:
        model: sklearn model object
        model_name (string): a name for the model
        grid_params (dict): maps hyperparameters to potential options
        scoring (list of strings): a list of the ways to score
        data (pd.DataFrame): a pandas dataframe with all the data
        test_year (int): The year we seek to predict; must be at least 2017
            so that there is at least one target year to search on
        num_years_in_train (int): The number of years before the test year to 
            use to predict the test year
        vars_to_onehot (list of strings): A list of strings of variables to onehot encode
    Outputs:
        (pd.DataFrame): columns "params" and "mean", one row per
            hyperparameter combination, sorted so the highest mean score
            comes first
    Raises:
        ValueError: if test_year leaves no target year to run a search on

    Idea to concatenate the dataframes column-wise came from the first answer
    on this stack exchange question: 
    https://stackoverflow.com/questions/44515888/compute-average-mean-across-dataframes-in-python-pandas
    
    Get mean of col rows help from here: 
    https://stackoverflow.com/questions/33750326/compute-row-average-in-pandas
    '''
    # Target years run from the year after 2015 up to (not including) the
    # test year; 2015 itself is skipped, as in the original pop(0).
    first_year = 2015
    target_years_in_train = list(range(first_year + 1, test_year))
    if not target_years_in_train:
        raise ValueError(
            "test_year must be greater than {} to have at least one "
            "target year; got {}".format(first_year + 1, test_year))

    params = None
    yearly_scores = []

    for target_year in target_years_in_train:
        # NOTE(review): `y` is not a parameter of this function; it is looked
        # up in the surrounding notebook namespace. Confirm what prep_data
        # expects for its second argument.
        train_df, train_y, _, _ = prep_data(data, y,
                                            target_year, num_years_in_train,
                                            vars_to_onehot)
        results_df = grid_search_results(model, model_name,
                                         grid_params, scoring,
                                         train_df, train_y)
        # Every search uses the same grid, so the params column is taken once
        # and only the score column is kept per year.
        if params is None:
            params = results_df["params"]
        yearly_scores.append(
            results_df["mean_test_f1"].rename(
                "mean_test_f1_{}".format(target_year)))

    # One score column per year; the row-wise mean is taken over those
    # numeric columns only, before the dict-valued params column is attached.
    all_results = pd.concat(yearly_scores, axis=1)
    all_results["mean"] = all_results.mean(axis=1)
    all_results.insert(0, "params", params)

    return all_results[["params", "mean"]].sort_values("mean", ascending=False)

    

In [56]:
#Test that most grid_search code works:
# Three small frames stand in for per-year grid search results.
d = pd.DataFrame({'col1': [1, 2]})
e = pd.DataFrame({"col1": [3, 4]})
f = pd.DataFrame({"col1":[5, 6]})

# Named `combined` rather than `all` so the builtin all() is not replaced by
# a DataFrame for the rest of the notebook session.
combined = pd.concat([d, e, f], axis=1)
# Row-wise average across the three side-by-side columns.
combined['mean'] = combined.mean(axis=1)
combined.sort_values("mean", inplace=True, ascending=False)
results_to_return = combined['mean']


results_to_return

1    4.0
0    3.0
Name: mean, dtype: float64