## Optimizing Crime forecasting with HyperOpt - using weather data and additional regressors.

1. Importing libraries and fixing the random seeds for reproducibility.

In [None]:
import random
from pathlib import Path

import numpy as np
import optuna
import pandas as pd
from joblib import parallel_backend
from neuralprophet import NeuralProphet
from neuralprophet import set_random_seed
from neuralprophet import utils

# Single seed shared by every random number generator seeded in this notebook,
# so changing it in one place changes it everywhere.
SEED = 42

random.seed(SEED)        # Python's built-in `random` module
np.random.seed(SEED)     # NumPy's legacy global generator
set_random_seed(SEED)    # NeuralProphet's seeding helper
# NOTE(review): presumably the same helper as the call above, only reached via
# the `utils` module - kept so behaviour is unchanged; confirm and drop one.
utils.set_random_seed(seed=SEED)

2. Loading the daily crime counts with weather data and splitting them into train and test sets.

In [39]:
import reduce_mem_usage  # project-local helper module

# Fraction of rows (taken from the top of the frame) used for training; the
# remainder is held out as the test set.
TRAIN_FRACTION = 0.8

# Extra columns passed to the model as future regressors: calendar features
# followed by the weather measurements.
regressors = ['is_holiday', 'dayofweek', 'quarter', 'month', 'year',
       'dayofyear', 'dayofmonth', 'weekofyear', 'is_weekend', 'is_weekday',
       'season', 'AWND', 'PRCP', 'SNWD', 'SNOW', 'TMAX', 'TMIN']

df = pd.read_csv("../../../data/final-lrpd-data.csv")
df = reduce_mem_usage.reduce_mem_usage(df)

# Keep only the timestamp, the target and the regressors, and parse the
# timestamp. `ds` deliberately stays a regular column: the objective later
# merges on it and selects it by name. The previous bare `df.set_index('ds')`
# discarded its result, so it never had any effect and has been dropped.
df = df[["ds", "y"] + regressors].assign(ds=lambda frame: pd.to_datetime(frame["ds"]))

# Positional split - assumes the rows are already in chronological order
# (TODO confirm against the CSV). Copies keep the two sets independent of `df`.
split_at = int(TRAIN_FRACTION * len(df))
train = df.iloc[:split_at].copy()
test = df.iloc[split_at:].copy()
assert len(train) + len(test) == len(df), "train/test split must cover every row"

train.shape, test.shape

Memory usage of dataframe is 0.33 MB
Memory usage after optimizations: 0.07 MB
Decreased by 77.6%


((1793, 19), (449, 19))

3. Defining the search space and an objective that minimizes the RMSE of the forecast on the held-out test split.

In [None]:
import logging
import sys

import hyperopt
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Number of hyperparameter configurations to evaluate. 1 is only a smoke test;
# raise it for a real search.
MAX_EVALS = 1

# hp.quniform samples *floats* rounded to a multiple of q (e.g. 22.0), but
# these hyperparameters are counts, so they are cast to int before being
# handed to the model.
INTEGER_PARAMS = ('num_hidden_layers', 'd_hidden')

search_space = {
    'num_hidden_layers': hp.quniform('num_hidden_layers', 1, 30, 1),
    'learning_rate': hp.loguniform('learning_rate', np.log(1e-4), np.log(1)),
    'trend_reg': hp.loguniform('trend_reg', np.log(1e-4), np.log(10)),
    'seasonality_reg': hp.loguniform('seasonality_reg', np.log(1e-4), np.log(10)),
    'd_hidden': hp.quniform('d_hidden', 1, 50, 1),
    'ar_reg': hp.loguniform('ar_reg', np.log(1e-4), np.log(10))}

# Disable verbose output from neuralprophet. logging.disable() is process-wide:
# it silences every logger, not just NeuralProphet's.
log = logging.getLogger("")
logging.disable(sys.maxsize)


# Minimal RMSE
def objective(params):
    """Train a NeuralProphet model with `params` and score it on the test split.

    Parameters
    ----------
    params : dict
        One sample drawn from `search_space`. Entries listed in
        `INTEGER_PARAMS` arrive as floats and are cast to int here.

    Returns
    -------
    dict
        ``{'loss': <RMSE on the test split>, 'status': STATUS_OK}`` in the
        format hyperopt's `fmin` expects.

    Notes
    -----
    Reads the notebook-level `train`, `test` and `regressors` variables.
    NOTE(review): the loss is computed on `test`, so the selected
    hyperparameters are tuned against the test set; use a separate validation
    split if `test` must remain an unbiased estimate.
    """
    model_params = {
        name: int(value) if name in INTEGER_PARAMS else value
        for name, value in params.items()
    }
    print(model_params)

    m = NeuralProphet(**model_params, loss_func="MSE", yearly_seasonality=True)
    m = m.add_country_holidays(country_name='US')
    for regressor in regressors:
        m = m.add_future_regressor(regressor)

    m.fit(train)

    # Forecast exactly as many periods as there are test rows, supplying the
    # known regressor values for that horizon.
    future = m.make_future_dataframe(train, periods=test.shape[0], regressors_df=test[regressors])
    forecast = m.predict(future)

    # Align predictions with the actual values on the timestamp column.
    toTest = test.merge(forecast[['ds', 'yhat1']], how='left', on='ds')

    # Take the square root explicitly: the `squared=False` shortcut of
    # mean_squared_error is deprecated in recent scikit-learn releases.
    rmse = float(np.sqrt(mean_squared_error(toTest['y'], toTest['yhat1'])))
    print("RMSE: ", rmse)
    return {'loss': rmse, 'status': STATUS_OK}


# NOTE(review): fmin draws candidates from its own random state (`rstate`),
# which the seeds set at the top of the notebook may not control - pass an
# explicit `rstate` if the search itself must be reproducible.
trials = Trials()
best = fmin(fn=objective,
            space=search_space,
            algo=tpe.suggest,
            max_evals=MAX_EVALS,
            trials=trials)


4. Printing and saving optimized parameters.

In [None]:
print(best)

# Loss (RMSE) of the best trial; also used as the prefix of the output file
# names so results from different runs do not overwrite each other.
rmse = trials.best_trial['result']['loss']

# Create the output directory on first use - open() does not create missing
# parent directories and would raise FileNotFoundError on a fresh checkout.
hyperparameter_dir = Path('./hyperparameters')
hyperparameter_dir.mkdir(parents=True, exist_ok=True)

# One `name=value` line per optimized hyperparameter.
with open(hyperparameter_dir / f'{rmse}_adreg_hyperopt.txt', 'w') as f:
    f.writelines(f'{key}={value}\n' for key, value in best.items())

In [61]:
# Read the best trial straight from `trials` so this cell does not depend on a
# variable left behind by another cell.
best_trial = trials.best_trial
best_loss = best_trial['result']['loss']
print(best_loss)
print(best_trial['misc']['vals'])

# One row per evaluated trial, with the loss and the sampled parameter values
# pulled out of the nested result/misc dictionaries, best (lowest loss) first.
trials_df = pd.DataFrame(trials.trials)
trials_df['loss'] = trials_df['result'].apply(lambda result: result['loss'])
trials_df['params'] = trials_df['misc'].apply(lambda misc: misc['vals'])
trials_df = trials_df.sort_values(by='loss', ascending=True)

# to_csv does not create missing directories, so make sure the target exists.
hyperparameter_dir = Path('./hyperparameters')
hyperparameter_dir.mkdir(parents=True, exist_ok=True)
trials_df.to_csv(hyperparameter_dir / f'{best_loss}_adreg_trials_hyperopt.csv', index=False)

7.952726462267565
{'ar_reg': [0.00325133799075842], 'd_hidden': [21.0], 'learning_rate': [0.0010380981438824243], 'num_hidden_layers': [22.0], 'seasonality_reg': [2.087540916891761], 'trend_reg': [0.012092073216673523]}
Second best
800
