# Predict CMSA count - basic

## Preparations

In [1]:
%%capture
get_ipython().run_cell_magic('bash', '', 'pip install psycopg2-binary\npip install xgboost\npip install graphviz \npip install workalendar')

CalledProcessError: Command 'b'pip install psycopg2-binary\npip install xgboost\npip install graphviz \npip install workalendar\n'' returned non-zero exit status 1.

In [2]:
import pandas as pd
import geopandas as gpd
import numpy as np
import os

from sqlalchemy import create_engine
import env

from datetime import datetime, timedelta, date
import pytz
from workalendar.europe import Netherlands

from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_error
from sklearn import linear_model
import xgboost as xgb
from scipy.signal import savgol_filter

import helpers_cmsa as h 
import importlib

from pyspark.sql import SparkSession
from pyspark.sql.functions import substring, length, col, expr
from pyspark.sql.types import *

import requests # for API covid

ModuleNotFoundError: No module named 'geopandas'

In [None]:
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
%matplotlib inline

import graphviz

In [None]:
# Disable pandas' chained-assignment check, so SettingWithCopyWarning is no longer raised.
# NOTE(review): this also hides genuine chained-assignment mistakes; prefer `.loc`
# assignments and re-enable the warning once the notebook no longer needs it.
pd.set_option('mode.chained_assignment', None)

In [None]:
# Reuse the active Spark session when there is one, otherwise start a new one.
spark = SparkSession.builder.getOrCreate()

## Settings

In [None]:
# SQLAlchemy engine used for every SQL query in this notebook.
# Credentials are read from the local `env` module; SSL is required by the connection.
# NOTE(review): user name and password are inserted into the URL as-is —
# presumably they contain no characters that need URL-escaping; confirm.
engine = create_engine(
    "postgresql://{user}:{password}@{host}:{port}/{database}".format(
        user = env.DATABASE_USERNAME_AZ,
        password = env.DATABASE_PASSWORD_AZ,
        host = "igordb.postgres.database.azure.com",
        port = 5432,
        database = "igor",
    ),
    connect_args = {'sslmode': 'require'},
)

In [None]:
# select moment to start for operational forecast # CHANGE EVERY WEEK
start_prediction = pd.to_datetime('2021-08-16 00:00:00').tz_localize("Europe/Amsterdam")
my_week = 33

In [None]:
# how many days to predict for operational forecast
predict_days = 7

In [None]:
# input for dummy model: how many 15mins to shift realization
my_shift = 4*24*7  

In [None]:
# input for starting of learnset 
start_learnset = pd.to_datetime('2020-09-01 00:00:00').tz_localize("Europe/Amsterdam")

In [None]:
# select which locations to forecast
my_locations = [
                'GAVM-02-Stadhouderskade', # Vondelpark
                'GAAM-01-AlbertCuypstraat',  
                'CMSA-GAKH-01',  # Kalverstraat, previously 'GKS-01-Kalverstraat'
                'GACM-02' # Nieuwendijk
               ]  

In [None]:
# set versions
current_model_version = 'xg_0_3'
current_data_version = "2_0"  # from 09/02/2021

In [None]:
# URL of the stringency-index API, covering 2020-09-01 up to and including today.
# The end date is formatted as zero-padded ISO `YYYY-MM-DD`; concatenating the
# year/month/day integers produced dates such as `2021-8-5`.
today = pd.to_datetime("today")
today_str = today.strftime("%Y-%m-%d")
covid_url = 'https://covidtrackerapi.bsg.ox.ac.uk/api/v2/stringency/date-range/2020-09-01/' + today_str

In [None]:
# define vacations: every vacation is a DataFrame with a single 'date' column
# holding consecutive calendar days, starting at the given first day
def _vacation_days(first_day, n_days):
    """Return a DataFrame whose 'date' column lists `n_days` consecutive days from `first_day`."""
    return pd.DataFrame(data = {'date': pd.date_range(first_day, periods = n_days, freq = '1d')})

kerst_19 = _vacation_days(date(2019, 12, 21), 7*2 + 2)
voorjaar_20 = _vacation_days(date(2020, 2, 15), 9)
mei_20 = _vacation_days(date(2020, 4, 25), 9)
zomer_20 = _vacation_days(date(2020, 7, 4), 7*6 + 2)
herfst_20 = _vacation_days(date(2020, 10, 10), 9)
kerst_20 = _vacation_days(date(2020, 12, 19), 7*2 + 2)
voorjaar_21 = _vacation_days(date(2021, 2, 20), 9)
mei_21 = _vacation_days(date(2021, 5, 1), 9)
zomer_21 = _vacation_days(date(2021, 7, 10), 7*6 + 2)
herfst_21 = _vacation_days(date(2021, 10, 16), 9)
kerst_21 = _vacation_days(date(2021, 12, 25), 7*2 + 2)

In [None]:
# selection of features to include for models
# (the order of the names is kept exactly as before; 'stringency_legacy' stays excluded)
x_cols = (
    ['weekday__5', 'weekday__6', 'weekend']
    + ['weekday__' + str(day) for day in range(5)]
    + ['sin_time', 'cos_time']
    + ['hour__' + str(hour) for hour in range(24)]
    + ['stringency', 'shopping_restricted']
    + ['wind_speed', 'temperature', 'global_radiation', 'cloud_cover']
    + ['vacation_dummy', 'holiday_dummy']
)

## Main

### 1. Get data

In [None]:
print('Start loading raw data') 

In [None]:
# Load the raw CMSA rows from the database into pandas, capped at 3,000,000 rows.
# NOTE(review): the query has no ORDER BY, so once the table holds more rows than
# the cap, which rows are returned is up to the database — confirm the cap is
# still above the table size or add an explicit ordering/filter.
cmsa_query = "SELECT * FROM ingested.cmsa limit 3000000"
cmsa_df_raw = pd.read_sql_query(cmsa_query, con = engine)

In [None]:
#cmsa_df_raw['location_id'].unique() 

In [None]:
# Download the stringency data from `covid_url` and load the 'data' part of the JSON response.
# A timeout keeps the notebook from hanging on a stalled connection, and
# raise_for_status() reports an HTTP error directly instead of failing later
# on a missing 'data' key.
covid_response = requests.get(url = covid_url, timeout = 60)
covid_response.raise_for_status()
covid_df_raw = pd.DataFrame(covid_response.json()['data'])

In [None]:
holidays_data_raw = Netherlands().holidays(2020) + Netherlands().holidays(2021) 

In [None]:
# Stack all vacation periods into one frame (original row labels are kept, as before).
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0.
vacation_df_raw = pd.concat([kerst_19, voorjaar_20, mei_20, zomer_20, herfst_20, kerst_20,
                             voorjaar_21, mei_21, zomer_21, herfst_21, kerst_21])

In [None]:
knmi_obs = spark.read.format("json").load("s3a://knmi-knmi/topics/knmi-observations/2021/06/*/*") # (complete history stored in 06/07)
knmi_obs_df_raw = knmi_obs.toPandas()

In [None]:
# Read the KNMI forecast files for June, July and August 2021 from S3, one Spark
# DataFrame per month, and stack them with union().
# NOTE(review): the months are hard-coded — files from later months are not
# loaded; extend the list when forecasting beyond August 2021.
# NOTE(review): `header` and `sep` look like CSV reader options; presumably they
# have no effect on the JSON reader — confirm and drop if so.
knmi_pred6 = spark.read.format("json").option("header", "true").load("s3a://knmi-knmi/topics/knmi/2021/06/*/*.json.gz", sep = ";")
knmi_pred7 = spark.read.format("json").option("header", "true").load("s3a://knmi-knmi/topics/knmi/2021/07/*/*.json.gz", sep = ";")
knmi_pred8 = spark.read.format("json").option("header", "true").load("s3a://knmi-knmi/topics/knmi/2021/08/*/*.json.gz", sep = ";")
knmi_pred = knmi_pred6.union(knmi_pred7)
knmi_pred = knmi_pred.union(knmi_pred8)
# Cap the result at 5,000,000 rows before converting it to a pandas DataFrame.
knmi_pred_df_raw = knmi_pred.limit(5000000).toPandas()

### 2. Prepare data

In [None]:
print('Start pre-processing data')

In [None]:
#importlib.reload(h)

In [None]:
cmsa_df = h.preprocess_cmsa_data(cmsa_df_raw, my_locations, start_learnset) 

In [None]:
covid_df = h.preprocess_covid_data_api(covid_df_raw)

In [None]:
holiday_df = h.preprocess_holidays_data(holidays_data_raw)

In [None]:
vacation_df = h.preprocess_vacation_data(vacation_df_raw)

In [None]:
knmi_obs_df = h.preprocess_knmi_data(knmi_obs_df_raw, start_learnset) 
knmi_pred_df = h.preprocess_metpre_data(knmi_pred_df_raw, start_learnset) 

knmi_df = h.preprocess_weather_data(knmi_obs_df, knmi_pred_df)  # later add above functions to this function

### 3. Clean data

In [None]:
print('Start cleaning data')

In [None]:
cmsa_df_clean = h.clean_cmsa_data(cmsa_df)

In [None]:
# Temporary exclusion: drops all rows from 2021-08-08 00:00 up to and including 2021-08-09 12:00.
# NOTE(review): the reason for excluding this window is not recorded here —
# presumably faulty measurements; document it or remove the filter.
cmsa_df_clean = cmsa_df_clean[(cmsa_df_clean.index < '2021-08-08 00:00:00') | (cmsa_df_clean.index > '2021-08-09 12:00:00')] # TEMPORARY

In [None]:
fig, ax = plt.subplots(1, figsize = (12,3))
cmsa_df_clean.tail(1000).plot(ax=ax)

In [None]:
# Import CSV data as back-up
cmsa_vp = pd.read_csv('data/Stadhouderskade Totaal Aantal passanten_per 15 minuten-data-2021-08-02 14_37_33.csv')
cmsa_ac = pd.read_csv('data/Albert Cuyp Totaal Aantal passanten_per 15 minuten-data-2021-08-02 14_36_51.csv')
cmsa_ws = pd.read_csv('data/Winkelstraten Totaal aantal passanten alle sensoren-data-as-seriestocolumns-2021-08-02 14_38_31-Corrected.csv', sep = ";")

In [None]:
cmsa_df_clean_fill = h.fill_cmsa_gaps_csv(cmsa_df_clean, cmsa_vp, cmsa_ac, cmsa_ws, start_learnset)
# 1: database data used, 0: csv data used, -1: value missing in both, NaN used

In [None]:
fig, ax = plt.subplots(1, figsize = (12,3))
cmsa_df_clean_fill.tail(1000).plot(ax=ax)

In [None]:
plot_loc = cmsa_df_clean.columns[0]
print(plot_loc)
fig, ax = plt.subplots(1, figsize = (12,3))
cmsa_df_clean_fill.tail(1000).plot(y = plot_loc, ax = ax, color = 'green', label = 'csv')
cmsa_df_clean.tail(1000).plot(y = plot_loc, ax = ax, color = 'orange', label = 'database')

### 4. Create model dataframes

In [None]:
df_y_train = cmsa_df_clean_fill.copy()

In [None]:
lag_df = df_y_train.shift(my_shift, freq = "infer").add_prefix('lag_')  # create lagged features
lag_df2 = df_y_train.shift(2*my_shift, freq = "infer").add_prefix('lag2_')

In [None]:
df_y_predict = h.get_future_df(start_prediction, predict_days)

In [None]:
df_X_train, x_cols_final = h.get_variables_df(df_y_train, covid_df, holiday_df, vacation_df, 
                                              knmi_df, x_cols, lag_df, lag_df2)

In [None]:
df_X_predict, x_cols_final = h.get_variables_df(df_y_predict, covid_df, holiday_df, vacation_df, 
                                                knmi_df, x_cols, lag_df, lag_df2)

In [None]:
# remove incomplete data: keep only the rows of df_y_train without any missing
# value and apply the very same row selection to the feature frame
complete_rows = df_y_train.notna().all(axis = 1)
df_X_train = df_X_train[complete_rows]
df_y_train = df_y_train[complete_rows]

### 5. Model

In [None]:
print('Start modelling')

In [None]:
# Run linear regression model
#for location in my_locations:
#    df_y_predict = h.train_predict_model_lm(df_y_train, df_X_train, df_y_predict, df_X_predict, location)

##### Current model of choice

In [None]:
# Run XGBoost model
for location in my_locations:
    df_y_predict = h.train_predict_model_xg(df_y_train, df_X_train, df_y_predict, df_X_predict, location)

### 6. Prepare output

In [None]:
print('Start preparing data')

In [None]:
final_df = h.prepare_final_dataframe(df_y_predict)

In [None]:
final_df_api = h.create_api_output(final_df, cmsa_df_raw, start_prediction, predict_days)

In [None]:
final_df_store = h.add_versions_to_store(final_df_api, current_model_version, current_data_version)

### 7. Store data

In [None]:
#print('Start storing data')

In [None]:
#final_df_api.to_sql('ingested.cmsa_predictions', con = engine, if_exists = 'replace', index = False) 

In [None]:
#final_df_store.to_sql('ingested.cmsa_prediction_store', con = engine, if_exists = 'append', index = False)  

In [None]:
#print('Finished storing data')

### 8. Check prediction

In [None]:
final_df_store.head(2)

In [None]:
final_df_store.tail(2)

In [None]:
# Prepare dataframe
df_plot_pred = final_df_api.pivot_table(index = ["datetime"], columns = "location_id", values = "total_count_predict")
df_plot_actual = df_y_train.tail(21*96)
df_plot = pd.merge(df_plot_actual, df_plot_pred, 
                   left_index = True, right_index = True, how = 'outer', suffixes = ["", "_prediction"])

In [None]:
# Get thresholds
df_static = pd.read_sql_query("SELECT * FROM ingested.static_cmsa_sensor limit 3000", con = engine)

#### First impression

In [None]:
# Plot 3 weeks realized + 1 week prediction  
for location in my_locations:    
    fig, ax = plt.subplots(1, figsize = (12,4))
    df_plot.plot(y=location, ax = ax, color = 'black')
    df_plot.plot(y=location + '_prediction', ax = ax, color = 'grey')
    thresh_low = df_static['crowd_threshold_low'][df_static['objectnummer'] == location].item() 
    thresh_high = df_static['crowd_threshold_high'][df_static['objectnummer'] == location].item() 
    plt.axhspan(0, thresh_low, color= 'green', alpha = 0.1)
    plt.axhspan(thresh_low+1, thresh_high, color= 'yellow', alpha = 0.1)
    plt.axhspan(thresh_high+1, thresh_high+200, color= 'red', alpha = 0.1)
    plt.legend(bbox_to_anchor=(1,1))

### 9. Create input prediction report

#### Get report graphs

In [None]:
# Prepare complete dataframe
df_plot_wk = df_plot_pred.reset_index()

df_plot_ac_wk = df_y_train[(df_y_train.index >= start_prediction - timedelta(days = 21)) & (df_y_train.index < start_prediction)]
df_plot_ac_wk = df_plot_ac_wk.reset_index()

df_plot_all = pd.concat([df_plot_ac_wk, df_plot_wk], axis = 0).reset_index(drop=True)

In [None]:
# Add datetime indications 
df_plot_all['week'] = df_plot_all['datetime'].dt.isocalendar().week
df_plot_all['weekday'] = df_plot_all['datetime'].dt.weekday

In [None]:
# Create separate dataframes for week and weekend
df_plot_dw = df_plot_all[df_plot_all['weekday'] < 5].reset_index(drop=True) # Monday-Friday
df_plot_we = df_plot_all[df_plot_all['weekday'] > 4].reset_index(drop=True) # Saturday-Sunday

In [None]:
# Get amount per day
df_plot_dw_g = df_plot_dw.groupby('week').sum() / 5  
df_plot_we_g = df_plot_we.groupby('week').sum() / 2 

In [None]:
df_plot_dw_g = df_plot_dw_g.reset_index() 
df_plot_we_g = df_plot_we_g.reset_index() 

In [None]:
# Determine colors in graph
df_plot_dw_g['color'] = 'green'
df_plot_dw_g['color'][df_plot_dw_g['week'] == my_week] = 'lightgreen'
df_plot_we_g['color'] = 'green'
df_plot_we_g['color'][df_plot_we_g['week'] == my_week] = 'lightgreen'

In [None]:
plt.rcParams["axes.axisbelow"] = True
plt.rcParams.update({'axes.titlesize': 14,
                     'axes.labelsize': 14, 'xtick.labelsize': 14, 'ytick.labelsize': 14,
                     'axes.labelpad': 8.0
                    })

In [None]:
save_date = date.today().strftime("%Y%m%d") + "_"

In [None]:
# One report figure per location: average count per day for weekdays (left)
# and weekend days (right), one bar per week, saved as PNG in `output/`.

# Make sure the target folder exists before the first figure is saved.
os.makedirs("output", exist_ok = True)

# The loop variable is called `location` rather than `col`: `col` is also
# imported from pyspark.sql.functions and would be overwritten by the loop.
for location in my_locations:

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize = (11,4), dpi = 100, frameon = False, sharey = True, constrained_layout = True)

    fig.suptitle(location, fontsize = 18)

    # Left panel: Monday-Friday
    ax1.bar(x = df_plot_dw_g['week'], height = df_plot_dw_g[location], color = df_plot_dw_g['color'])
    ax1.set_title('Doordeweeks (ma-vr)')
    ax1.set_xlabel("Week")
    ax1.set_ylabel("Aantal CMSA counts per dag")
    ax1.spines['right'].set_visible(False)
    ax1.spines['top'].set_visible(False)
    ax1.grid(axis = 'y', color = 'lightgrey')

    # Right panel: Saturday-Sunday (y-axis shared with the left panel)
    ax2.bar(x = df_plot_we_g['week'], height = df_plot_we_g[location], color = df_plot_we_g['color'])
    ax2.set_title('Weekend (za-zo)')
    ax2.spines['right'].set_visible(False)
    ax2.spines['top'].set_visible(False)
    ax2.grid(axis = 'y', color = 'lightgrey')
    ax2.set_xlabel("Week")

    # Legend explaining the two bar colours, placed to the right of the second panel
    gemeten = mpatches.Patch(color='green', label='Gemeten')
    voorspelling = mpatches.Patch(color='lightgreen', label='Voorspelling')
    ax2.legend(handles=[gemeten, voorspelling], bbox_to_anchor=(1.2, 0.5, 0.5, 0.5))

    my_filename = "output/weekly_report_csma_prediction_" + save_date + location + ".png"

    fig.savefig(my_filename, bbox_inches='tight')
    plt.show()
    # Release the figure so repeated runs do not accumulate open figures.
    plt.close(fig)

In [None]:
# Store (summed) prediction data for evaluation
df_plot_dw_g.to_csv('output/prediction_dw_week' + str(my_week) + '.csv')
df_plot_we_g.to_csv('output/prediction_we_week' + str(my_week) + '.csv')
df_y_predict.to_csv('output/prediction_all_week' + str(my_week) + '.csv')

#### Get report max (& thresholds)

In [None]:
# Add linear regression model for extra info on peaks
for location in my_locations:
    df_y_predict = h.train_predict_model_lm(df_y_train, df_X_train, df_y_predict, df_X_predict, location)

In [None]:
#df_y_predict.max()

In [None]:
#df_y_predict.idxmax()

##### stadhouderskade

In [None]:
loc = my_locations[0]

In [None]:
# Print the crowd thresholds of `loc`, then show the two highest predictions
# ranked by the linear model followed by the two highest ranked by XGBoost.
peak_cols = ['predict_xg_' + loc, 'predict_lm_' + loc]
peak_values = df_y_predict[peak_cols]
print(df_static.loc[df_static['objectnummer'] == loc, ['crowd_threshold_low', 'crowd_threshold_high']])
pd.concat([peak_values.nlargest(2, columns = peak_cols[1]),
           peak_values.nlargest(2, columns = peak_cols[0])])

##### albert cuyp

In [None]:
loc = my_locations[1]

In [None]:
# Print the crowd thresholds of `loc`, then show the two highest predictions
# ranked by the linear model followed by the two highest ranked by XGBoost.
peak_cols = ['predict_xg_' + loc, 'predict_lm_' + loc]
peak_values = df_y_predict[peak_cols]
print(df_static.loc[df_static['objectnummer'] == loc, ['crowd_threshold_low', 'crowd_threshold_high']])
pd.concat([peak_values.nlargest(2, columns = peak_cols[1]),
           peak_values.nlargest(2, columns = peak_cols[0])])

##### Kalverstraat

In [None]:
loc = my_locations[2]

In [None]:
# Print the crowd thresholds of `loc`, then show the two highest predictions
# ranked by the linear model followed by the two highest ranked by XGBoost.
peak_cols = ['predict_xg_' + loc, 'predict_lm_' + loc]
peak_values = df_y_predict[peak_cols]
print(df_static.loc[df_static['objectnummer'] == loc, ['crowd_threshold_low', 'crowd_threshold_high']])
pd.concat([peak_values.nlargest(2, columns = peak_cols[1]),
           peak_values.nlargest(2, columns = peak_cols[0])])

##### Nieuwendijk

In [None]:
loc = my_locations[3]

In [None]:
# Print the crowd thresholds of `loc`, then show the two highest predictions
# ranked by the linear model followed by the two highest ranked by XGBoost.
peak_cols = ['predict_xg_' + loc, 'predict_lm_' + loc]
peak_values = df_y_predict[peak_cols]
print(df_static.loc[df_static['objectnummer'] == loc, ['crowd_threshold_low', 'crowd_threshold_high']])
pd.concat([peak_values.nlargest(2, columns = peak_cols[1]),
           peak_values.nlargest(2, columns = peak_cols[0])])

#### Report text

In [None]:
print(x_cols_final)

Meegenomen in model:
- Historische CMSA data
- Covid-19 maatregelen
- Feestdagen & vakanties
- Weer

Verwachte piek Vondelpark (ingang Stadhouderskade): woensdag 18/8 aan het einde van de middag. Drukteniveau acceptabel (groen), richting druk (oranje).

Verwachte piek Albert Cuyp: zaterdag 21/8 in het midden van de middag. Drukteniveau acceptabel (groen).

Verwachte piek Kalverstraat (t.h.v 1): zaterdag 21/8 in het midden van de middag. Drukteniveau te druk (rood).

Verwachte piek Nieuwendijk: zaterdag 21/8 in het midden van de middag. Drukteniveau druk (oranje), bijna te druk (rood).


#### Look at performance previous prediction

In [None]:
# Get (summed) prediction data for evaluation
df_prev_dw = pd.read_csv('output/prediction_dw_week' + str(my_week - 1) + '.csv')
df_prev_we = pd.read_csv('output/prediction_we_week' + str(my_week - 1) + '.csv')
df_prev = pd.read_csv('output/prediction_all_week' + str(my_week - 1) + '.csv')

In [None]:
df_prev_dw.tail(1)

In [None]:
df_prev_we.tail(1)

In [None]:
# Realised counts of the week before `my_week`, i.e. the week covered by the
# previous prediction.
# The rows are selected with a boolean mask instead of a helper column:
# `df_actual = df_y_train` does not copy, so writing `df_actual['week']` used to
# add a 'week' column to `df_y_train` itself, which every later cell that uses
# `df_y_train` (backtesting, evaluation) then received as well.
# NOTE(review): ISO week numbers repeat every year — if the data spans the same
# week number in two years, both are selected.
in_previous_week = df_y_train.index.isocalendar().week == my_week - 1
df_actual = df_y_train[in_previous_week]

In [None]:
df_static[['objectnummer', 'crowd_threshold_low', 'crowd_threshold_high',]][df_static['objectnummer'].isin(my_locations)]

In [None]:
df_actual.max()

In [None]:
df_actual.idxmax(axis = 0)

### 10. Backtesting 

#### Prepare to run model - one week

In [None]:
start_learnset

In [None]:
# Look for all Mondays in the dataset
weeks = h.find_start_of_weeks(df_y_train)
last_week = weeks[-7]
print(last_week)

In [None]:
# Create a training set consisting all the data until that week and a testset containing only that week    
df_y_train_bt, df_y_test = h.create_train_and_test_set(df_y_train, last_week, start_learnset)
df_X_train_bt, df_X_test = h.create_train_and_test_set(df_X_train, last_week, start_learnset)

#### Train & evaluate models - one week

In [None]:
# Lag
df_y_test = pd.merge(df_y_test, lag_df, left_index = True, right_index = True, how = 'left')

In [None]:
# Linear regression
for location in my_locations:
    df_y_test = h.train_predict_model_lm(df_y_train_bt, df_X_train_bt, df_y_test, df_X_test, location)

In [None]:
# XGBoost
for location in my_locations:
    df_y_test = h.train_predict_model_xg(df_y_train_bt, df_X_train_bt, df_y_test, df_X_test, location)

#### Show results - one week

In [None]:
for location in my_locations:  
    df_y_test.plot(y = [location, 'lag_' + location, 'predict_lm_' + location, 'predict_xg_' + location]) 

In [None]:
#df_y_test.plot(y = [location, 'lag_' + location]) 

#### Importance plot XGB - one week

In [None]:
#df_y_train_loc = df_y_train[['GAVM-02-Stadhouderskade']]
#model = h.train_model_xg(df_X_train, df_y_train_loc)
#xgb.plot_importance(model)

In [None]:
#df_y_train_loc = df_y_train[['GAAM-01-AlbertCuypstraat']]
#model = h.train_model_xg(df_X_train, df_y_train_loc)
#xgb.plot_importance(model)

In [None]:
#df_y_train_loc = df_y_train[['CMSA-GAKH-01']]
#model = h.train_model_xg(df_X_train, df_y_train_loc)
#xgb.plot_importance(model)

In [None]:
#df_y_train_loc = df_y_train[['GACM-02']]
#model = h.train_model_xg(df_X_train, df_y_train_loc)
#xgb.plot_importance(model)

#### Run models and evaluate - multiple weeks

In [None]:
#importlib.reload(h)

In [None]:
#start_learnset = pd.to_datetime('2021-05-01 00:00:00').tz_localize("Europe/Amsterdam")

In [None]:
#df_y_train = df_y_train[df_y_train.index < '2021-07-08 00:00:00+02:00']

In [None]:
pred_lag = h.evaluate_n_last_weeks_lag(7, df_y_train, lag_df, start_learnset, my_locations)

In [None]:
pred_lm = h.train_evaluate_n_last_weeks_lm(7, df_y_train, df_X_train, start_learnset, my_locations)

In [None]:
pred_xg = h.train_evaluate_n_last_weeks_xg(7, df_y_train, df_X_train, start_learnset, my_locations) 

#### Visualize results - multiple weeks

In [None]:
# Prepare dataframe with all results
preds = pd.merge(pred_lm, pred_xg, left_index = True, right_index = True, suffixes = ("", "_2"))
preds = pd.merge(preds, pred_lag, left_index = True, right_index = True, suffixes = ("", "_3"))

In [None]:
# Add datetime indications
preds['date'] = preds.index.date
preds['hour'] = preds.index.hour
preds['weekday'] = preds.index.weekday
preds['week'] = preds.index.isocalendar().week

In [None]:
# Create aggregates
pred_d = preds.groupby('date').sum()
pred_h = preds.groupby('hour').mean()
pred_dh = preds.groupby(['weekday', 'hour']).mean()
pred_w = preds.groupby('week').mean()
pred_wd = preds.groupby('weekday').mean()

##### error per day - total

In [None]:
stats = []
for location in my_locations:
    stats.append(h.evaluate(pred_d['lag_' + location], pred_d[location], print_metrics=False))
rmse, rmse_busy = zip(*stats)
print(f"(Mean/std) Root mean squared error: {np.mean(rmse).round()}/{np.std(rmse).round()}")
print(f"(Mean/std) Root mean squared error (crowded): {np.mean(rmse_busy).round()}/{np.std(rmse_busy).round()}")

In [None]:
stats = []
for location in my_locations:
    stats.append(h.evaluate(pred_d['predict_lm_' + location], pred_d[location], print_metrics=False))
    
rmse, rmse_busy = zip(*stats)
print(f"(Mean/std) Root mean squared error: {np.mean(rmse).round()}/{np.std(rmse).round()}")
print(f"(Mean/std) Root mean squared error (crowded): {np.mean(rmse_busy).round()}/{np.std(rmse_busy).round()}")

In [None]:
stats = []
for location in my_locations:
    stats.append(h.evaluate(pred_d['predict_xg_' + location], pred_d[location], print_metrics=False))
    
rmse, rmse_busy = zip(*stats)
print(f"(Mean/std) Root mean squared error: {np.mean(rmse).round()}/{np.std(rmse).round()}")
print(f"(Mean/std) Root mean squared error (crowded): {np.mean(rmse_busy).round()}/{np.std(rmse_busy).round()}")

##### error per day - per location

In [None]:
for location in my_locations:
    print(location)
    h.evaluate(pred_d['lag_' + location], pred_d[location], print_metrics=True)

In [None]:
for location in my_locations:
    print(location)
    h.evaluate(pred_d['predict_lm_' + location], pred_d[location], print_metrics=True)

In [None]:
for location in my_locations:
    print(location)
    h.evaluate(pred_d['predict_xg_' + location], pred_d[location], print_metrics=True)

In [None]:
for location in my_locations:
    pred_d.plot(y = [location, 'predict_lm_' + location, 'predict_xg_' + location, 'lag_' + location])
    plt.legend(bbox_to_anchor=(1.05,1))

##### per day of the week

In [None]:
for location in my_locations:
    pred_wd.plot(y = [location, 'predict_lm_' + location, 'predict_xg_' + location, 'lag_' + location])
    plt.legend(bbox_to_anchor=(1.05,1))

##### mean per week

In [None]:
for location in my_locations:
    pred_w.plot.bar(y = [location, 'predict_lm_' + location, 'predict_xg_' + location, 'lag_' + location], rot = 0)
    plt.legend(bbox_to_anchor=(1.05,1))

##### one specific week

In [None]:
# Week to inspect in the backtest plot below.
# NOTE(review): this overwrites the operational `my_week` from the Settings
# section; re-running report cells after this one uses week 24 — consider a
# separate variable name.
my_week = 24

In [None]:
for location in my_locations:
    preds[preds['week'] == my_week].plot(y = [location, 'predict_lm_' + location, 'predict_xg_' + location])
    plt.legend(bbox_to_anchor=(1.05,1))

##### mean per hour of the week

In [None]:
for location in my_locations:
    pred_dh.plot(y = [location, 'predict_lm_' + location, 'predict_xg_' + location])
    plt.legend(bbox_to_anchor=(1.05,1))

##### mean per hour of the day

In [None]:
for location in my_locations:
    pred_h.plot(y = [location, 'predict_lm_' + location, 'predict_xg_' + location])
    plt.legend(bbox_to_anchor=(1.05,1))

#### Max values - multiple weeks

##### max values stadhouderskade

In [None]:
location = 'GAVM-02-Stadhouderskade'

In [None]:
# Weekly maximum of the realisation, the lag baseline and both model predictions.
# The columns are selected with a list: tuple-style selection on a GroupBy
# (`gb[a, b, c]`) is rejected by pandas >= 2.0.
preds.groupby(['week'])[[location, 'lag_' + location, 'predict_lm_' + location, 'predict_xg_' + location]].max()

##### max value moments stadhouderskade

In [None]:
# Moment of the weekly maximum of the realisation, the lag baseline and both model predictions.
# The columns are selected with a list: tuple-style selection on a GroupBy
# (`gb[a, b, c]`) is rejected by pandas >= 2.0.
preds.groupby(['week'])[[location, 'lag_' + location, 'predict_lm_' + location, 'predict_xg_' + location]].idxmax()

##### max values albert cuyp

In [None]:
location = 'GAAM-01-AlbertCuypstraat'

In [None]:
# Weekly maximum of the realisation, the lag baseline and both model predictions.
# The columns are selected with a list: tuple-style selection on a GroupBy
# (`gb[a, b, c]`) is rejected by pandas >= 2.0.
preds.groupby(['week'])[[location, 'lag_' + location, 'predict_lm_' + location, 'predict_xg_' + location]].max()

##### max value moments albert cuyp

In [None]:
# Moment of the weekly maximum of the realisation, the lag baseline and both model predictions.
# The columns are selected with a list: tuple-style selection on a GroupBy
# (`gb[a, b, c]`) is rejected by pandas >= 2.0.
preds.groupby(['week'])[[location, 'lag_' + location, 'predict_lm_' + location, 'predict_xg_' + location]].idxmax()

In [None]:
##### max values kalverstraat
# Weekly maximum of the realisation, the lag baseline and both model predictions.
# The columns are selected with a list: tuple-style selection on a GroupBy
# (`gb[a, b, c]`) is rejected by pandas >= 2.0.
location = 'CMSA-GAKH-01'
preds.groupby(['week'])[[location, 'lag_' + location, 'predict_lm_' + location, 'predict_xg_' + location]].max()

In [None]:
##### max value moments kalverstraat
# Moment of the weekly maximum of the realisation, the lag baseline and both model predictions.
# The columns are selected with a list: tuple-style selection on a GroupBy
# (`gb[a, b, c]`) is rejected by pandas >= 2.0.
preds.groupby(['week'])[[location, 'lag_' + location, 'predict_lm_' + location, 'predict_xg_' + location]].idxmax()

In [None]:
##### max values nieuwendijk
# Weekly maximum of the realisation, the lag baseline and both model predictions.
# The columns are selected with a list: tuple-style selection on a GroupBy
# (`gb[a, b, c]`) is rejected by pandas >= 2.0.
location = 'GACM-02'
preds.groupby(['week'])[[location, 'lag_' + location, 'predict_lm_' + location, 'predict_xg_' + location]].max()

In [None]:
##### max value moments nieuwendijk
# Moment of the weekly maximum of the realisation, the lag baseline and both model predictions.
# The columns are selected with a list: tuple-style selection on a GroupBy
# (`gb[a, b, c]`) is rejected by pandas >= 2.0.
preds.groupby(['week'])[[location, 'lag_' + location, 'predict_lm_' + location, 'predict_xg_' + location]].idxmax()