In [1]:
from typing import Tuple

import itertools
import logging

import numpy
import pandas

from matplotlib import pyplot as plt

from prophet import Prophet
from prophet.diagnostics import cross_validation, performance_metrics
from prophet.serialize import model_to_json, model_from_json

from sklearn.metrics import mean_squared_error

In [2]:
# Mute the 'cmdstanpy', 'prophet' and 'fbprophet' loggers: each gets a no-op
# handler, stops propagating to the root logger and only lets CRITICAL through.
# `logger` is left bound to the last configured logger ('fbprophet').
for _logger_name in ('cmdstanpy', 'prophet', 'fbprophet'):
    logger = logging.getLogger(_logger_name)
    logger.addHandler(logging.NullHandler())
    logger.propagate = False
    logger.setLevel(logging.CRITICAL)

In [3]:
# Country key used to build both input paths below.
country = 'france'

# Processed series (Excel) and the intermediate government COVID-measures table (CSV).
df = pandas.read_excel(f'../../../data/processed/{country}.xlsx')
df_covid_measures = pandas.read_csv(f'../../../data/intermediate/{country}/government_covid_measures.csv')

In [4]:
# Keep only the date and workplace-closing columns and export them to CSV
# (without the index) in the working directory.
df_covid_workplace_lockdowns = df_covid_measures.loc[:, ['date', 'workplace_closing']]
df_covid_workplace_lockdowns.to_csv(path_or_buf = 'covid_workplace_lockdowns.csv', index = False)

In [5]:
def make_dataset(df_processed: pandas.DataFrame, df_covid_measures: pandas.DataFrame = pandas.DataFrame()) -> Tuple[pandas.DataFrame, pandas.DataFrame]:
    """Build the Prophet training frame and the lockdown "holidays" frame.

    Parameters
    ----------
    df_processed
        Frame containing at least the columns ``Time`` and
        ``Unemployment_Rate_TOT``.
    df_covid_measures
        Currently unused; the lockdown periods are hard-coded below. Kept so
        existing callers keep working.

    Returns
    -------
    (df_mrd, df_workplace_lockdowns)
        ``df_mrd`` has the columns ``ds`` and ``y`` with rows whose ``y`` is
        null removed (original index labels are kept).
        ``df_workplace_lockdowns`` has one row per lockdown period with the
        columns ``holiday``, ``ds``, ``lower_window``, ``ds_upper`` and
        ``upper_window`` (length of the period in days), sorted by ``ds``.
    """
    df_mrd = df_processed[['Time', 'Unemployment_Rate_TOT']].rename(
        columns = {'Time': 'ds', 'Unemployment_Rate_TOT': 'y'}
    )
    # Drop null targets row-wise. Dropping by index label would also remove
    # valid rows that merely share a label with a null row when the index is
    # not unique.
    df_mrd = df_mrd.dropna(subset = ['y'])

    # manually created from df_covid_workplace_lockdowns
    df_workplace_lockdowns = pandas.DataFrame([
        {'holiday': 'lockdown_1_1', 'ds': '2020-06-22', 'lower_window': 0, 'ds_upper': '2020-08-13'},
        {'holiday': 'lockdown_2_1', 'ds': '2022-03-14', 'lower_window': 0, 'ds_upper': '2022-06-19'},
        {'holiday': 'lockdown_2_2', 'ds': '2020-05-11', 'lower_window': 0, 'ds_upper': '2020-06-21'},
        {'holiday': 'lockdown_2_3', 'ds': '2020-08-14', 'lower_window': 0, 'ds_upper': '2020-10-29'},
        {'holiday': 'lockdown_2_4', 'ds': '2020-11-28', 'lower_window': 0, 'ds_upper': '2021-02-25'},
        {'holiday': 'lockdown_2_5', 'ds': '2021-03-02', 'lower_window': 0, 'ds_upper': '2021-03-04'},
        {'holiday': 'lockdown_2_6', 'ds': '2021-04-03', 'lower_window': 0, 'ds_upper': '2021-05-02'},
        {'holiday': 'lockdown_2_7', 'ds': '2021-05-19', 'lower_window': 0, 'ds_upper': '2022-03-13'},
        {'holiday': 'lockdown_3_1', 'ds': '2020-03-17', 'lower_window': 0, 'ds_upper': '2020-05-10'},
        {'holiday': 'lockdown_3_2', 'ds': '2020-10-30', 'lower_window': 0, 'ds_upper': '2020-11-27'},
        {'holiday': 'lockdown_3_3', 'ds': '2021-02-26', 'lower_window': 0, 'ds_upper': '2021-03-01'},
        {'holiday': 'lockdown_3_4', 'ds': '2021-03-05', 'lower_window': 0, 'ds_upper': '2021-04-02'},
        {'holiday': 'lockdown_3_5', 'ds': '2021-05-03', 'lower_window': 0, 'ds_upper': '2021-05-18'},
    ])
    for t_col in ['ds', 'ds_upper']:
        df_workplace_lockdowns[t_col] = pandas.to_datetime(df_workplace_lockdowns[t_col], format = '%Y-%m-%d')
    # upper_window = number of days from the period start to its end date.
    df_workplace_lockdowns['upper_window'] = (df_workplace_lockdowns['ds_upper'] - df_workplace_lockdowns['ds']).dt.days
    df_workplace_lockdowns = df_workplace_lockdowns.sort_values(by = 'ds', axis = 'index', ascending = True)

    return df_mrd, df_workplace_lockdowns

In [6]:
# Modelling frame (ds / y) plus the lockdown periods used as Prophet holidays.
df_mrd, df_workplace_lockdowns = make_dataset(df_processed = df)

In [7]:
def train_test_split(df_mrd: pandas.DataFrame, test_size: int = 12) -> Tuple[pandas.DataFrame, pandas.DataFrame]:
    """Split a frame into a leading training part and a trailing test part.

    The test part is ``df_mrd.tail(test_size)``; the training part is every
    row before it. The split is positional, so it is correct even when the
    index contains repeated labels.

    Parameters
    ----------
    df_mrd
        Frame to split; row order is preserved.
    test_size
        Number of trailing rows to hold out (passed to ``DataFrame.tail``).

    Returns
    -------
    (df_train, df_test)
    """
    df_test = df_mrd.tail(test_size)
    # Derive the cut from the actual size of the held-out part so zero,
    # negative and oversized ``test_size`` follow ``tail()`` semantics.
    split_at = len(df_mrd) - len(df_test)
    df_train = df_mrd.iloc[:split_at].copy()
    return df_train, df_test

In [8]:
# Hold out the last 12 rows as the test set.
df_train, df_test = train_test_split(df_mrd = df_mrd, test_size = 12)

In [9]:
def train_model(df: pandas.DataFrame, df_holidays: pandas.DataFrame, param_grid: dict, eval_metric: str = 'rmse'):
    """Grid-search Prophet hyper-parameters with cross-validation.

    Every combination in ``param_grid`` is fitted on ``df``, cross-validated,
    and scored with ``eval_metric``; the combination with the lowest score is
    refitted and cross-validated once more.

    Parameters
    ----------
    df
        Training frame passed to ``Prophet.fit``.
    df_holidays
        Frame passed to Prophet as ``holidays``.
    param_grid
        Mapping of Prophet constructor argument name -> list of candidate
        values. Keys that also appear in the fixed model settings below
        (e.g. ``yearly_seasonality``) override those settings.
    eval_metric
        Column of the ``performance_metrics`` output to minimise.

    Returns
    -------
    (auto_model, auto_model_p, best_params)
        The model refitted on the best combination, its performance-metrics
        frame, and the winning hyper-parameter dict.

    Raises
    ------
    ValueError
        If ``param_grid`` yields no combinations (a key has an empty list).
    """
    # Generate all combinations of parameters
    param_names = list(param_grid.keys())
    all_params = [dict(zip(param_names, values)) for values in itertools.product(*param_grid.values())]
    if not all_params:
        raise ValueError('param_grid yields no hyper-parameter combinations; every key needs at least one value.')
    eval_metric_values = []  # Store the eval_metric_values for each params here

    # other params
    model_params = {
        'growth': 'linear', 'seasonality_mode': 'additive', 'holidays': df_holidays,
        "changepoint_range": 0.9, "yearly_seasonality": "auto"}
    cv_params = {'horizon': '30 days', 'parallel': 'processes'}
    pm_params = {'rolling_window': 1}

    print(f'Total hyper-parameter set count: {len(all_params)}')

    # Use cross validation to evaluate all parameters
    for iteration_count, hyper_params in enumerate(all_params, start = 1):
        print(f'Set: {iteration_count}')
        print(f'{hyper_params}')

        # Merge instead of double-unpacking: a grid key that is also a fixed
        # setting would otherwise raise "multiple values for keyword argument".
        m = Prophet(**{**model_params, **hyper_params}).fit(df)
        df_cv = cross_validation(m, **cv_params)
        df_p = performance_metrics(df_cv, **pm_params)
        score = df_p[eval_metric].values[0]
        eval_metric_values.append(score)

        print(f'{eval_metric.upper()}: {score}')
        print()

    best_params = all_params[numpy.argmin(eval_metric_values)]

    print('Training model on the best hyper-parameter set.')
    print(f'{best_params}')

    auto_model = Prophet(**{**model_params, **best_params}).fit(df)

    print('Cross-Validating best model.')
    auto_model_cv = cross_validation(auto_model, **cv_params)
    auto_model_p = performance_metrics(auto_model_cv, **pm_params)

    return auto_model, auto_model_p, best_params

In [10]:
# Candidate values per Prophet argument; 4 x 2 x 2 = 16 combinations in total.
param_grid = {
    'changepoint_prior_scale': [0.1, 0.5, 0.75, 0.9],
    'seasonality_prior_scale': [0.1, 1.0],
    'yearly_seasonality': [True, False],
}

# Search the grid on the training split, with the lockdown periods as holidays.
auto_model, auto_model_p, best_params = train_model(
    df = df_train,
    df_holidays = df_workplace_lockdowns,
    param_grid = param_grid,
)

Total hyper-parameter set count: 8
Set: 1
{'changepoint_prior_scale': 0.5, 'seasonality_prior_scale': 0.1, 'holidays_prior_scale': 0.1}


In [None]:
def test_model(df_test: pandas.DataFrame, model: Prophet) -> Tuple[pandas.DataFrame, float]:
    """Predict on the held-out frame and score the predictions.

    Parameters
    ----------
    df_test
        Frame passed to ``model.predict``; its ``y`` column is the ground truth.
    model
        Fitted model exposing ``predict``.

    Returns
    -------
    (df_predicted, rmse)
        The prediction frame returned by the model and the root mean squared
        error between ``df_test['y']`` and ``df_predicted['yhat']``.
    """
    df_predicted: pandas.DataFrame = model.predict(df_test)
    # Take the square root explicitly: the `squared=False` shortcut was
    # deprecated and later removed from scikit-learn's mean_squared_error.
    mse = mean_squared_error(y_true = df_test['y'], y_pred = df_predicted['yhat'])
    rmse: float = float(numpy.sqrt(mse))
    return df_predicted, rmse

In [None]:
# Score the selected model on the held-out rows.
df_predicted, rmse = test_model(df_test = df_test, model = auto_model)

In [None]:
# Display the prediction frame returned for the test rows.
df_predicted

In [None]:
# Display the test-set RMSE.
rmse

In [None]:
# Serialise before opening the file: open(..., 'w') truncates the target
# immediately, so converting first keeps a previously saved model intact if
# serialisation fails.
model_json = model_to_json(auto_model)
with open('france_prophet_lockdowns_optimized_model.json', 'w') as f:
    f.write(model_json)

In [None]:
# Dates to forecast, as the `ds` column of a one-column frame.
df_future = pandas.DataFrame({'ds': ['2023-03-01', '2023-04-01', '2023-05-01']})

In [None]:
# Forecast the future dates with the selected model.
df_future_prediction: pandas.DataFrame = auto_model.predict(df = df_future)

In [None]:
# Display the forecast frame for the future dates.
df_future_prediction