In [9]:
# pip install greykite

In [10]:


%matplotlib inline

In [11]:
import pandas as pd 
import numpy as np

In [12]:
import plotly
import plotly.graph_objs as go
# Use the 'notebook' renderer as plotly's default for every figure shown below.
plotly.io.renderers.default = 'notebook'

from collections import defaultdict

from greykite.framework.templates.autogen.forecast_config import ForecastConfig
from greykite.framework.templates.autogen.forecast_config import MetadataParam
from greykite.framework.templates.autogen.forecast_config import ModelComponentsParam
from greykite.framework.templates.forecaster import Forecaster
from greykite.framework.templates.model_templates import ModelTemplateEnum
from greykite.framework.utils.result_summary import summarize_grid_search_results



In [13]:
# Turn the Google Drive "share" link into the `uc?id=<file id>` form and read the
# CSV from it, parsing the 'date' column as datetimes.
url = 'https://drive.google.com/file/d/1ezwjtCirkOyUwU66PMb85DJhKAgcBOfz/view?usp=sharing'
file_id = url.split('/')[-2]  # second-to-last path segment of the share link
path = 'https://drive.google.com/uc?id=' + file_id
shelter_per_day = pd.read_csv(path, parse_dates=['date'])


In [14]:
# Index the frame by date while keeping 'date' as a regular column (drop=False),
# then keep only the two columns used for forecasting.
shelter_per_day = shelter_per_day.set_index('date', drop=False)
df = shelter_per_day.loc[:, ['date', 'Total']]

In [15]:
# Keep only the rows whose 'Total' is not exactly zero.
df = df.loc[df['Total'].ne(0)]

In [16]:
# Preview the first rows after the zero-total rows have been dropped.
df.head()

Unnamed: 0_level_0,date,Total
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2005-01-01,2005-01-01,393
2005-01-02,2005-01-02,451
2005-01-03,2005-01-03,466
2005-01-04,2005-01-04,469
2005-01-05,2005-01-05,498


In [17]:
# Keep the rows from 2006 through 2020-03-31 (both ends inclusive).
# Intended split per the original note: 2006 to 2018-12-31 for training and
# 2019-01-01 to 2020-03-31 held out (presumably as the test period).
# `.loc` makes the label-based row slice on the date index explicit; the
# previous leading `df[:]` was a no-op full slice.
df = df.loc['2006':'2020-03-31']

In [18]:
# Specifies dataset information
metadata = MetadataParam(
    time_col="date",
    value_col="Total",
    freq="D"
)

# Forecast horizon: number of steps (days, given freq="D") to forecast.
# NOTE(review): 2019-01-01 to 2020-03-31 spans 456 calendar days - confirm 455 is intended.
forecast_horizon = 455

# Coverage of the prediction interval (0.95 -> 95% intervals)
coverage = 0.95

forecaster = Forecaster()

# Baseline run: SILVERKITE template with no custom model components.
# The template is referenced through ModelTemplateEnum, matching the later
# customised run, instead of a bare string literal.
result = forecaster.run_forecast_config(
    df=df,
    config=ForecastConfig(
        model_template=ModelTemplateEnum.SILVERKITE.name,
        forecast_horizon=forecast_horizon,  # forecasts `forecast_horizon` (455) steps ahead
        coverage=coverage,  # 95% prediction intervals
        metadata_param=metadata
    )
)


Fitting 3 folds for each of 1 candidates, totalling 3 fits



Requested holiday 'Easter Monday [England, Wales, Northern Ireland]' does not occur in the provided countries


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


Requested holiday 'Easter Monday [England, Wales, Northern Ireland]' does not occur in the provided countries


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy(

In [19]:
# Plot the original time series in a fixed-size 750x500 figure
ts = result.timeseries
fig = ts.plot()
fig.update_layout(autosize=False, width=750, height=500)
plotly.io.show(fig)

In [20]:
# Backtest of the baseline model: plot the forecast on the holdout test set
test = result.backtest
fig = test.plot()
fig.update_layout(autosize=False, width=1000, height=500)
plotly.io.show(fig)

In [21]:
def show_fit_results(backtest):
    """Tabulate train and test evaluation metrics side by side.

    Args:
        backtest: Object exposing ``train_evaluation`` and ``test_evaluation``
            mappings from metric name to value.

    Returns:
        pandas.DataFrame with one row per metric found in
        ``train_evaluation`` and the columns ``"train"`` and ``"test"``.

    Raises:
        KeyError: If a metric in ``train_evaluation`` is missing from
            ``test_evaluation``.
    """
    test_scores = backtest.test_evaluation
    # One [train, test] pair per metric, in the order the train metrics appear.
    paired_scores = {
        metric: [train_value, test_scores[metric]]
        for metric, train_value in backtest.train_evaluation.items()
    }
    # Columns are metrics at this point; transpose so that metrics become rows.
    return pd.DataFrame(paired_scores, index=["train", "test"]).T

In [22]:
# Train vs. test metrics for the backtest stored in `test`.
show_fit_results(test)

Unnamed: 0,train,test
CORR,0.810508,0.891802
R2,0.640402,0.513603
MSE,11530.288124,28357.406124
RMSE,107.379179,168.396574
MAE,80.498933,150.257695
MedAE,61.325214,152.389482
MAPE,5.861651,10.378363
MedAPE,4.603877,10.509593
sMAPE,2.908161,5.025414
Q80,40.249467,65.132972


In [23]:
# Plot the forecast from the baseline run in a 1000x500 figure
forecast = result.forecast
fig = forecast.plot()
fig.update_layout(autosize=False, width=1000, height=500)
plotly.io.show(fig)

In [24]:
# Plot the forecast's components (via `plot_components`) in a taller 1000x1200 figure
fig = forecast.plot_components()
fig.update_layout(autosize=False, width=1000, height=1200)
plotly.io.show(fig)

## Trend change detection

In [25]:
from greykite.algo.changepoint.adalasso.changepoint_detector import ChangepointDetector

# Detection settings; the same values are reused for the model's
# "changepoints_dict" in the next cell.
trend_changepoint_params = dict(
    yearly_seasonality_order=10,
    regularization_strength=0.5,
    resample_freq="7D",
    potential_changepoint_n=25,
    yearly_seasonality_change_freq="365D",
    no_changepoint_distance_from_end="365D",
)

# Run trend changepoint detection on the 'Total' series.
model = ChangepointDetector()
res = model.find_trend_changepoints(
    df=df,
    time_col="date",
    value_col="Total",
    **trend_changepoint_params,
)

# Show the observations, detected trend changes and adaptive lasso estimate.
# plot=False presumably returns the figure without displaying it (TODO confirm);
# it is shown explicitly on the next line.
fig = model.plot(
    observation=True,
    trend_estimate=False,
    trend_change=True,
    yearly_seasonality_estimate=False,
    adaptive_lasso_estimate=True,
    plot=False,
)
plotly.io.show(fig)

In [26]:
# Growth and trend-changepoint configuration for the model.
growth = dict(growth_term="linear")
changepoints = dict(
    changepoints_dict={
        "method": "auto",
        "yearly_seasonality_order": 10,
        "regularization_strength": 0.5,
        "resample_freq": "7D",
        "potential_changepoint_n": 25,
        "yearly_seasonality_change_freq": "365D",
        "no_changepoint_distance_from_end": "365D",
    }
)

## Seasonality

In [27]:
# Yearly seasonality with order 10 is the only seasonality enabled;
# quarterly, monthly, weekly and daily seasonality are set to False (disabled).
yearly_seasonality_order = 10
seasonality = dict(
    yearly_seasonality=yearly_seasonality_order,
    quarterly_seasonality=False,
    monthly_seasonality=False,
    weekly_seasonality=False,
    daily_seasonality=False,
)

## Holidays and events (include snow)

In [28]:
# Weather data from the DataKind-DC DC-DHS-Public repository; 'date' is parsed as datetime.
weather_url = 'https://raw.githubusercontent.com/DataKind-DC/DC-DHS-Public/main/external%20data/weatherdata.csv'
weather = pd.read_csv(weather_url, parse_dates=['date'])

In [29]:
# Build the snow event table: one row per date with SNOW_value > 0, with the
# columns 'date' and 'event_name' (always "snow").
# `.loc` + `.assign` creates a new frame in one step; the previous version
# assigned a column onto a filtered slice of `weather`, which triggers pandas'
# SettingWithCopyWarning.
snow = (
    weather
    .loc[weather['SNOW_value'] > 0, ['date']]
    .assign(event_name="snow")
)

In [30]:
# Holiday and custom-event configuration: US holidays plus the snow dates.

from greykite.algo.forecast.silverkite.constants.silverkite_holiday import SilverkiteHoliday

events = dict(
    # all holidays in "holiday_lookup_countries"
    holidays_to_model_separately=SilverkiteHoliday.ALL_HOLIDAYS_IN_COUNTRIES,
    # only look up holidays in the United States
    holiday_lookup_countries=["UnitedStates"],
    # also mark the 2 days before a holiday as holiday
    holiday_pre_num_days=2,
    # also mark the 2 days after a holiday as holiday
    holiday_post_num_days=2,
    # custom daily events: the snow dates built from the weather data
    daily_event_df_dict={"snow": snow},
)

# Test together

In [31]:
# Bundle the seasonality, growth, events and changepoint settings into a single
# ModelComponentsParam. The class is already imported with the other greykite
# imports at the top of the notebook, so the duplicate import that used to sit
# in this cell has been removed.
model_components = ModelComponentsParam(
    seasonality=seasonality,
    growth=growth,
    events=events,
    changepoints=changepoints,
    autoregression=None,
    uncertainty={
        "uncertainty_dict": "auto",
    },
    custom={
        "fit_algorithm_dict": {
            "fit_algorithm": "ridge",
        },
    }
)

In [32]:
from greykite.framework.templates.autogen.forecast_config import EvaluationPeriodParam

# Cross-validation and holdout settings: 365-step test and CV horizons,
# at most 3 CV splits, and at least 4 * 365 training periods per split.
evaluation_period = EvaluationPeriodParam(
    test_horizon=365,
    cv_horizon=365,
    cv_max_splits=3,
    cv_min_train_periods=4 * 365,
)

# Full configuration: SILVERKITE template with the custom model components
# and the evaluation settings defined above.
forecast_config = ForecastConfig(
    model_template=ModelTemplateEnum.SILVERKITE.name,
    forecast_horizon=365,  # forecasts 365 steps ahead
    coverage=0.95,  # 95% prediction intervals
    metadata_param=metadata,
    model_components_param=model_components,
    evaluation_period_param=evaluation_period,
)

# Runs the forecast (replaces the earlier `result`)
result = forecaster.run_forecast_config(df=df, config=forecast_config)

Fitting 3 folds for each of 1 candidates, totalling 3 fits



DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmente

In [33]:
# Backtest of the customised model: plot the forecast on the holdout test set
test = result.backtest
fig = test.plot()
fig.update_layout(autosize=False, width=1000, height=500)
plotly.io.show(fig)

In [34]:
# Plot the forecast from the customised run in a 1000x500 figure
forecast = result.forecast
fig = forecast.plot()
fig.update_layout(autosize=False, width=1000, height=500)
plotly.io.show(fig)