# Forecasting crime with Facebook Prophet

#### 1. Import libraries and data

In [15]:
import pandas as pd 
import numpy as np 


from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import RFECV
from sklearn.linear_model import Ridge

from prophet import Prophet 
from prophet.plot import plot_plotly, plot_components_plotly
import matplotlib.pyplot as plt 
%matplotlib inline

import itertools

import warnings
warnings.simplefilter(action='ignore', category= FutureWarning)

import holidays

import plotly.graph_objs as go

from prophet.diagnostics import cross_validation, performance_metrics
from prophet.plot import plot_yearly, add_changepoints_to_plot
plt.style.use('fivethirtyeight')

In [16]:
# Read the data
df = pd.read_csv('lrpd-clean.csv')


#### 2. Reducing memory helps us do feature engineering faster and more efficiently

In [17]:
# Reduce memory like we did in data cleanup
def reduce_mem_usage(df, category=False):
  start_mem = df.memory_usage().sum() / 1024 **2
  print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

  for col in df.columns:
    col_type = df[col].dtype

    if col_type != object:
      c_min = df[col].min()
      c_max = df[col].max()
      if str(col_type)[:3] == 'int':
        if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
          df[col] = df[col].astype(np.int8)
        elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
          df[col] = df[col].astype(np.int16)
        elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
          df[col] = df[col].astype(np.int32)
        elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
          df[col] = df[col].astype(np.int64)
      else:
        if c_min  > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
          df[col] = df[col].astype(np.float16)
        elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
          df[col] = df[col].astype(np.float32)
        else:
          df[col] = df[col].astype(np.float64)
    else:
      if category:
        df[col] = df[col].astype('category')

  end_mem = df.memory_usage().sum() / 1024 ** 2
  print('Memory usage after optimizations: {:.2f} MB'.format(end_mem))
  print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
  return df

df = reduce_mem_usage(df)

Memory usage of dataframe is 10.60 MB
Memory usage after optimizations: 5.47 MB
Decreased by 48.4%


#### 3. Set correct data types

In [18]:
df["INCIDENT_DATE"] = pd.to_datetime(df["INCIDENT_DATE"])
df.dtypes

INCIDENT_NUMBER                object
INCIDENT_DATE          datetime64[ns]
LOCATION_DISTRICT             float16
OFFENSE_DESCRIPTION            object
WEAPON_TYPE                    object
ZIP                           float32
LATITUDE                      float16
LONGITUDE                     float16
WEEK_OF_MONTH                    int8
YEAR                            int16
DAY                              int8
DAY_OF_YEAR                     int16
MONTH                            int8
CRIME_TYPE                     object
RISK_TYPE                      object
RISK_TYPE_BC                     int8
dtype: object

#### 4. Prophet takes a dataframe with two columns: ds and y.
* The ds (datestamp) column should be of a format expected by Pandas, ideally YYYY-MM-DD for a date or YYYY-MM-DD HH:MM:SS for a timestamp.
*  The y column must be numeric, and represents the measurement we wish to forecast.

In [19]:
df = df.groupby(pd.Grouper(key='INCIDENT_DATE', freq='D')).size().reset_index(name='INCIDENT_COUNT')
df.columns = ['ds', 'y']

In [20]:
len(df)

2242

#### 5. Make a baseline model using Prophet to see how well the model does without adding regressors, holidays or hyperparameter tuning.

In [21]:
m = Prophet()
m.fit(df)
future = m.make_future_dataframe(periods=365)
forecast = m.predict(future)

df_cv = cross_validation(m, initial = '2000 days', period = '60 days', horizon='7 days')
df_p_baseline = performance_metrics(df_cv)
df_p_baseline.head()


13:34:33 - cmdstanpy - INFO - Chain [1] start processing
13:34:33 - cmdstanpy - INFO - Chain [1] done processing
  0%|          | 0/4 [00:00<?, ?it/s]13:34:34 - cmdstanpy - INFO - Chain [1] start processing
13:34:34 - cmdstanpy - INFO - Chain [1] done processing
 25%|██▌       | 1/4 [00:00<00:00,  4.72it/s]13:34:34 - cmdstanpy - INFO - Chain [1] start processing
13:34:34 - cmdstanpy - INFO - Chain [1] done processing
 50%|█████     | 2/4 [00:00<00:00,  5.47it/s]13:34:34 - cmdstanpy - INFO - Chain [1] start processing
13:34:34 - cmdstanpy - INFO - Chain [1] done processing
 75%|███████▌  | 3/4 [00:00<00:00,  5.47it/s]13:34:34 - cmdstanpy - INFO - Chain [1] start processing
13:34:34 - cmdstanpy - INFO - Chain [1] done processing
100%|██████████| 4/4 [00:00<00:00,  5.13it/s]


Unnamed: 0,horizon,mse,rmse,mae,mape,mdape,smape,coverage
0,1 days,107.098094,10.348821,9.878269,0.307336,0.243401,0.266595,0.75
1,2 days,29.270193,5.410193,4.610489,0.133948,0.096332,0.122737,1.0
2,3 days,40.349251,6.352106,5.148252,0.138773,0.136381,0.136281,1.0
3,4 days,21.902924,4.680056,3.377001,0.112654,0.068754,0.100913,1.0
4,5 days,74.27986,8.618576,8.002118,0.250732,0.250488,0.221255,0.75


#### 6. Lets add holidays to the model to see how well it does.

In [22]:
# After zooming into the data, we see that arround christmas and 4th of july, we have an affect because of the holidays.
np.random.seed(42)
lockdowns = pd.DataFrame([
    {'holiday': 'lockdown_1', 'ds': '2020-03-21', 'lower_window': 0, 'ds_upper': '2020-06-06', 'upper_window': 77},
    {'holiday': 'lockdown_2', 'ds': '2020-07-09', 'lower_window': 0, 'ds_upper': '2020-10-27', 'upper_window': 110},
    {'holiday': 'lockdown_3', 'ds': '2021-02-13', 'lower_window': 0, 'ds_upper': '2021-02-17', 'upper_window': 4},
    {'holiday': 'lockdown_4', 'ds': '2021-05-28', 'lower_window': 0, 'ds_upper': '2021-06-10', 'upper_window': 13},
])

m3_changepoints = (
    # 10 potential changepoints in 2.5 years
    pd.date_range('2017-06-02', '2020-01-01', periods=10).date.tolist() +
    # 15 potential changepoints in 1 year 2 months
    pd.date_range('2020-02-01', '2021-04-01', periods=15).date.tolist()
)

df['pre_covid'] = pd.to_datetime(df['ds']) < pd.to_datetime('2020-03-21')
df['post_covid'] = ~df['pre_covid']

m = Prophet(seasonality_mode='multiplicative', weekly_seasonality=False, holidays=lockdowns, changepoints=m3_changepoints)

m.add_seasonality(
    name='weekly_pre_covid',
    period=7,
    fourier_order=3,
    condition_name='pre_covid',
)
m.add_seasonality(
    name='weekly_post_covid',
    period=7,
    fourier_order=3,
    condition_name='post_covid',
)

m.add_country_holidays(country_name='US')
m.fit(df)
future = m.make_future_dataframe(periods= 365)
future['pre_covid'] = pd.to_datetime(future['ds']) < pd.to_datetime('2020-03-21')
future['post_covid'] = ~future['pre_covid']
forecast = m.predict(future)

fig = go.Figure()

fig.add_trace(go.Scatter(x=df['ds'], y=df['y'], name='Actual',))
fig.add_trace(go.Scatter(x=forecast['ds'], y=forecast['yhat'], name='Predicted',))
fig.add_trace(go.Scatter(x=forecast['ds'], y=forecast['holidays'].values, name='Holidays',))
fig.show()

13:34:35 - cmdstanpy - INFO - Chain [1] start processing
13:34:35 - cmdstanpy - INFO - Chain [1] done processing


Cross validate the model in order to see how well it does across many folds of the data.

In [23]:
df_cv = cross_validation(m, initial = '2000 days', period = '30 days', horizon='7 days')

  0%|          | 0/8 [00:00<?, ?it/s]13:34:36 - cmdstanpy - INFO - Chain [1] start processing
13:34:36 - cmdstanpy - INFO - Chain [1] done processing
 12%|█▎        | 1/8 [00:00<00:05,  1.19it/s]13:34:36 - cmdstanpy - INFO - Chain [1] start processing
13:34:37 - cmdstanpy - INFO - Chain [1] done processing
 25%|██▌       | 2/8 [00:01<00:04,  1.35it/s]13:34:37 - cmdstanpy - INFO - Chain [1] start processing
13:34:38 - cmdstanpy - INFO - Chain [1] done processing
 38%|███▊      | 3/8 [00:02<00:03,  1.37it/s]13:34:38 - cmdstanpy - INFO - Chain [1] start processing
13:34:38 - cmdstanpy - INFO - Chain [1] done processing
 50%|█████     | 4/8 [00:02<00:02,  1.41it/s]13:34:39 - cmdstanpy - INFO - Chain [1] start processing
13:34:39 - cmdstanpy - INFO - Chain [1] done processing
 62%|██████▎   | 5/8 [00:03<00:02,  1.43it/s]13:34:39 - cmdstanpy - INFO - Chain [1] start processing
13:34:40 - cmdstanpy - INFO - Chain [1] done processing
 75%|███████▌  | 6/8 [00:04<00:01,  1.37it/s]13:34:40 - cmds

This is what cross validation looks like. Yhat represents the predicted target variable and y represents the actual target variable.

In [24]:
df_cv.head()

Unnamed: 0,ds,yhat,yhat_lower,yhat_upper,y,cutoff
0,2022-07-19,44.187901,34.809597,52.393772,36,2022-07-18
1,2022-07-20,43.569919,35.194043,52.794934,59,2022-07-18
2,2022-07-21,42.757421,33.448186,51.74203,40,2022-07-18
3,2022-07-22,44.843474,35.923245,53.810615,47,2022-07-18
4,2022-07-23,35.523604,25.691501,44.869652,38,2022-07-18


Evaluation metrics for each of the cutoff's in the cross validation

In [25]:
df_p = performance_metrics(df_cv)
df_p.head(5)

Unnamed: 0,horizon,mse,rmse,mae,mape,mdape,smape,coverage
0,1 days,77.332214,8.793874,8.022613,0.255657,0.250204,0.222591,0.5
1,2 days,53.201349,7.293925,5.326048,0.134885,0.105187,0.130387,0.75
2,3 days,40.41643,6.357392,5.596831,0.140108,0.133979,0.143226,0.75
3,4 days,29.413888,5.423457,4.306001,0.112152,0.108631,0.115413,0.875
4,5 days,54.109528,7.355918,5.778633,0.21234,0.085474,0.177634,0.625


## Putting it all together

* Now we can start putting everything together by hyperparameter tuning, adding holidays, and eventually adding weather data as well.

In [26]:
df = pd.read_csv('lrpd-clean.csv')
df['INCIDENT_DATE'] = pd.to_datetime(df['INCIDENT_DATE'])
df = df.groupby(pd.Grouper(key='INCIDENT_DATE', freq='D')).size().reset_index(name='INCIDENT_COUNT')
df.columns = ['ds', 'y']
df.shape

(2242, 2)

In [27]:
df['pre_covid'] = pd.to_datetime(df['ds']) < pd.to_datetime('2020-03-21')
df['post_covid'] = ~df['pre_covid']

In [28]:
param_grid = {  
    'changepoint_prior_scale': [0.001, 0.5],
    'seasonality_prior_scale': [0.01, 10],
    'holidays_prior_scale': [0.01, 10],
    'seasonality_mode': ['additive', 'multiplicative'],
    'daily_seasonality': [True, False],
    'weekly_seasonality': [False],
    'yearly_seasonality': [True, False],
    'holidays': [lockdowns],
    'changepoints': [m3_changepoints]
}

all_params = [dict(zip(param_grid.keys(), v)) for v in itertools.product(*param_grid.values())]
rmses = []

len(all_params)

64

I commented out this cell to save time on finding the best parameters as it would take a long time to run.

If you would like to run it, uncomment it and run it. 

Make sure to comment the next cell.

In [29]:
for params in all_params:
    m = Prophet(**params)
    m.add_seasonality(
    name='weekly_pre_covid',
    period=7,
    fourier_order=3,
    condition_name='pre_covid',
    )
    m.add_seasonality(
        name='weekly_post_covid',
        period=7,
        fourier_order=3,
        condition_name='post_covid',
    )
    m.fit(df)
    df_cv = cross_validation(m, initial = '2000 days', period = '60 days', horizon='7 days', parallel="processes")
    df_p = performance_metrics(df_cv, rolling_window=1)
    rmses.append(df_p['rmse'].values[0])

# Find the best parameters
tuning_results = pd.DataFrame(all_params)
tuning_results['rmse'] = rmses
print(tuning_results)

# Python
best_params = all_params[np.argmin(rmses)]
print(best_params)

13:34:42 - cmdstanpy - INFO - Chain [1] start processing
13:34:42 - cmdstanpy - INFO - Chain [1] done processing
13:34:43 - cmdstanpy - INFO - Chain [1] start processing
13:34:43 - cmdstanpy - INFO - Chain [1] start processing
13:34:43 - cmdstanpy - INFO - Chain [1] start processing
13:34:43 - cmdstanpy - INFO - Chain [1] start processing
13:34:43 - cmdstanpy - INFO - Chain [1] done processing
13:34:43 - cmdstanpy - INFO - Chain [1] done processing
13:34:43 - cmdstanpy - INFO - Chain [1] done processing
13:34:44 - cmdstanpy - INFO - Chain [1] done processing
13:34:44 - cmdstanpy - INFO - Chain [1] start processing
13:34:44 - cmdstanpy - INFO - Chain [1] done processing
13:34:45 - cmdstanpy - INFO - Chain [1] start processing
13:34:45 - cmdstanpy - INFO - Chain [1] start processing
13:34:45 - cmdstanpy - INFO - Chain [1] start processing
13:34:45 - cmdstanpy - INFO - Chain [1] start processing
13:34:45 - cmdstanpy - INFO - Chain [1] done processing
13:34:45 - cmdstanpy - INFO - Chain [1

    changepoint_prior_scale  seasonality_prior_scale  holidays_prior_scale   
0                     0.001                     0.01                  0.01  \
1                     0.001                     0.01                  0.01   
2                     0.001                     0.01                  0.01   
3                     0.001                     0.01                  0.01   
4                     0.001                     0.01                  0.01   
..                      ...                      ...                   ...   
59                    0.500                    10.00                 10.00   
60                    0.500                    10.00                 10.00   
61                    0.500                    10.00                 10.00   
62                    0.500                    10.00                 10.00   
63                    0.500                    10.00                 10.00   

   seasonality_mode  daily_seasonality  weekly_seasonality   
0

13:38:32 - cmdstanpy - INFO - Chain [1] done processing
13:38:32 - cmdstanpy - INFO - Chain [1] done processing


#### We fit the model with the best parameters, and the holidays data.

In [30]:
m = Prophet(**best_params)
m.add_country_holidays(country_name='US')
m.add_seasonality(
    name='weekly_pre_covid',
    period=7,
    fourier_order=3,
    condition_name='pre_covid',
)
m.add_seasonality(
    name='weekly_post_covid',
    period=7,
    fourier_order=3,
    condition_name='post_covid',
)
m.fit(df)
future = m.make_future_dataframe(periods= 365)
future['pre_covid'] = pd.to_datetime(future['ds']) < pd.to_datetime('2020-03-21')
future['post_covid'] = ~future['pre_covid']
forecast = m.predict(future)
df_cv = cross_validation(m, initial = '2000 days', period = '60 days', horizon='7 days', parallel="processes")
df_p = performance_metrics(df_cv, rolling_window=1)


13:38:33 - cmdstanpy - INFO - Chain [1] start processing
13:38:33 - cmdstanpy - INFO - Chain [1] done processing
13:38:34 - cmdstanpy - INFO - Chain [1] start processing
13:38:34 - cmdstanpy - INFO - Chain [1] start processing
13:38:34 - cmdstanpy - INFO - Chain [1] start processing
13:38:34 - cmdstanpy - INFO - Chain [1] start processing
13:38:34 - cmdstanpy - INFO - Chain [1] done processing
13:38:34 - cmdstanpy - INFO - Chain [1] done processing
13:38:35 - cmdstanpy - INFO - Chain [1] done processing
13:38:35 - cmdstanpy - INFO - Chain [1] done processing


#### We add weather data from the local Little Rock airport weather station to the model

It is important to note that we will also need the weather data for the future in order to make forecasts on the future. 

We can obtain that data by using a weather data API to get the weather forecast for the horizon.

Add the regressors to the model and train it again.

In [31]:
finalDf = pd.read_csv('final-lrpd-data.csv')
list2 = ['is_holiday', 'dayofweek', 'quarter', 'month', 'year',
       'dayofyear', 'dayofmonth', 'weekofyear', 'is_weekend', 'is_weekday',
       'season', 'AWND', 'PRCP', 'SNWD', 'SNOW', 'TMAX', 'TMIN']
estimator = Ridge()
selector =  RFECV(estimator, step=10, cv=10)
selector = selector.fit(finalDf[list2], finalDf['y'])
to_keep = finalDf[list2].columns[selector.support_]

np.random.seed(42)
param_grid = {  
    'changepoint_prior_scale': [0.001, 0.5],
    'seasonality_prior_scale': [0.01, 10],
    'holidays_prior_scale': [0.01, 10],
    'seasonality_mode': ['additive', 'multiplicative'],
    'daily_seasonality': [True, False],
    'weekly_seasonality': [False],
    'yearly_seasonality': [True, False],
    'holidays': [lockdowns],
    'changepoints': [m3_changepoints]
}

all_params = [dict(zip(param_grid.keys(), v)) for v in itertools.product(*param_grid.values())]
rmses = []

for params in all_params:
    m = Prophet(**params)
    m.add_country_holidays(country_name='US')
    m.add_seasonality(
    name='weekly_pre_covid',
    period=7,
    fourier_order=3,
    condition_name='pre_covid',
    )
    m.add_seasonality(
        name='weekly_post_covid',
        period=7,
        fourier_order=3,
        condition_name='post_covid',
    )
    # add regressors to the dataframe
    for f in to_keep:    
        df[f] = finalDf[f]
        m.add_regressor(f)

    m.fit(df)
    df_cv = cross_validation(m, initial = '2000 days', period = '60 days', horizon='7 days', parallel="processes")
    df_p = performance_metrics(df_cv, rolling_window=1)
    rmses.append(df_p['rmse'].values[0])

# Find the best parameters
tuning_results = pd.DataFrame(all_params)
tuning_results['rmse'] = rmses
print(tuning_results)

# Python
best_params = all_params[np.argmin(rmses)]
print(best_params)

13:38:35 - cmdstanpy - INFO - Chain [1] start processing
13:38:35 - cmdstanpy - INFO - Chain [1] done processing
13:38:36 - cmdstanpy - INFO - Chain [1] start processing
13:38:37 - cmdstanpy - INFO - Chain [1] start processing
13:38:37 - cmdstanpy - INFO - Chain [1] start processing
13:38:37 - cmdstanpy - INFO - Chain [1] start processing
13:38:37 - cmdstanpy - INFO - Chain [1] done processing
13:38:37 - cmdstanpy - INFO - Chain [1] done processing
13:38:37 - cmdstanpy - INFO - Chain [1] done processing
13:38:37 - cmdstanpy - INFO - Chain [1] done processing
13:38:37 - cmdstanpy - INFO - Chain [1] start processing
13:38:37 - cmdstanpy - INFO - Chain [1] done processing
13:38:38 - cmdstanpy - INFO - Chain [1] start processing
13:38:38 - cmdstanpy - INFO - Chain [1] start processing
13:38:38 - cmdstanpy - INFO - Chain [1] start processing
13:38:39 - cmdstanpy - INFO - Chain [1] start processing
13:38:39 - cmdstanpy - INFO - Chain [1] done processing
13:38:39 - cmdstanpy - INFO - Chain [1

    changepoint_prior_scale  seasonality_prior_scale  holidays_prior_scale   
0                     0.001                     0.01                  0.01  \
1                     0.001                     0.01                  0.01   
2                     0.001                     0.01                  0.01   
3                     0.001                     0.01                  0.01   
4                     0.001                     0.01                  0.01   
..                      ...                      ...                   ...   
59                    0.500                    10.00                 10.00   
60                    0.500                    10.00                 10.00   
61                    0.500                    10.00                 10.00   
62                    0.500                    10.00                 10.00   
63                    0.500                    10.00                 10.00   

   seasonality_mode  daily_seasonality  weekly_seasonality   
0

13:44:38 - cmdstanpy - INFO - Chain [1] done processing


In [32]:
best_params

{'changepoint_prior_scale': 0.5,
 'seasonality_prior_scale': 0.01,
 'holidays_prior_scale': 0.01,
 'seasonality_mode': 'multiplicative',
 'daily_seasonality': True,
 'weekly_seasonality': False,
 'yearly_seasonality': False,
 'holidays':       holiday         ds  lower_window    ds_upper  upper_window
 0  lockdown_1 2020-03-21             0  2020-06-06            77
 1  lockdown_2 2020-07-09             0  2020-10-27           110
 2  lockdown_3 2021-02-13             0  2021-02-17             4
 3  lockdown_4 2021-05-28             0  2021-06-10            13,
 'changepoints': [datetime.date(2017, 6, 2),
  datetime.date(2017, 9, 14),
  datetime.date(2017, 12, 28),
  datetime.date(2018, 4, 12),
  datetime.date(2018, 7, 26),
  datetime.date(2018, 11, 7),
  datetime.date(2019, 2, 20),
  datetime.date(2019, 6, 5),
  datetime.date(2019, 9, 18),
  datetime.date(2020, 1, 1),
  datetime.date(2020, 2, 1),
  datetime.date(2020, 3, 2),
  datetime.date(2020, 4, 1),
  datetime.date(2020, 5, 2),
  

In [33]:

m = Prophet(**best_params)
m.add_country_holidays(country_name='US')
m.add_seasonality(
    name='weekly_pre_covid',
    period=7,
    fourier_order=3,
    condition_name='pre_covid',
)
m.add_seasonality(
    name='weekly_post_covid',
    period=7,
    fourier_order=3,
    condition_name='post_covid',
)
# add regressors to the dataframe
for f in to_keep:    
    df[f] = finalDf[f]
    m.add_regressor(f)

m.fit(df)

13:44:39 - cmdstanpy - INFO - Chain [1] start processing
13:44:40 - cmdstanpy - INFO - Chain [1] done processing


<prophet.forecaster.Prophet at 0x1734c8210>

In [34]:
df_cv = cross_validation(m, initial = '2000 days', period = '60 days', horizon='7 days')
df_p2 = performance_metrics(df_cv, rolling_window=1)

  0%|          | 0/4 [00:00<?, ?it/s]13:44:40 - cmdstanpy - INFO - Chain [1] start processing
13:44:41 - cmdstanpy - INFO - Chain [1] done processing
 25%|██▌       | 1/4 [00:01<00:03,  1.23s/it]13:44:42 - cmdstanpy - INFO - Chain [1] start processing
13:44:43 - cmdstanpy - INFO - Chain [1] done processing
 50%|█████     | 2/4 [00:02<00:02,  1.44s/it]13:44:43 - cmdstanpy - INFO - Chain [1] start processing
13:44:44 - cmdstanpy - INFO - Chain [1] done processing
 75%|███████▌  | 3/4 [00:04<00:01,  1.37s/it]13:44:44 - cmdstanpy - INFO - Chain [1] start processing
13:44:46 - cmdstanpy - INFO - Chain [1] done processing
100%|██████████| 4/4 [00:05<00:00,  1.37s/it]


#### Now we compare the metrics for all 3 stages of model fitting and see how they compare. 
As we can see, the model with the weather data performs the best.

In [37]:
comparison = pd.DataFrame()
comparison['baseline'] = df_p_baseline.mean(axis = 0)[1:]
comparison['covid'] = df_p.mean(axis = 0)[1:]
comparison['covid_and_weather'] = df_p2.mean(axis = 0)[1:]
print(comparison)


           baseline      covid covid_and_weather
mse       52.607482  47.663533         43.323471
rmse       6.881179   6.903878          6.582057
mae         5.89508   6.026903          5.590408
mape          0.183   0.182032          0.169657
mdape      0.147671   0.162586          0.157055
smape      0.164388   0.170241           0.15701
coverage   0.892857   0.714286          0.821429


In [38]:
# Plot the comparison of the three models
fig = go.Figure()
fig.add_trace(go.Scatter(x=comparison.index, y=comparison['baseline'].values, name='Baseline',))
fig.add_trace(go.Scatter(x=comparison.index, y=comparison['covid'].values, name='Covid',))
fig.add_trace(go.Scatter(x=comparison.index, y=comparison['covid_and_weather'].values, name='Covid and Weather',))
fig.update_layout(title='Comparison of the three models', xaxis_title='Metric', yaxis_title='Value')
fig.show()