In [None]:
# ^^^ pyforest auto-imports - don't write above this line


# General Modelling Framework

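**Important:** before running, adjust the two `os.chdir(...)` paths in the import cell so that they point to the folder that contains the credentials module (`env`) and the folder that contains the helper functions (`helpers`).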

### Preparations

In [None]:
def install_packages():
    """(Re-)install the third-party packages this notebook relies on.

    Runs one ``pip install`` per package through the ``%%bash`` cell magic, so it
    only works inside an IPython/Jupyter kernel (``get_ipython`` must exist).
    Afterwards pandas' chained-assignment warning is switched off.
    """
    # Imported locally: this function is called before the notebook's import cell,
    # so a module-level `pd` is not guaranteed to exist yet on a fresh kernel.
    import pandas as pd

    packages = [
        'keras',
        'tensorflow',
        'imblearn',
        'mord',
        'psycopg2-binary',
        'workalendar',
        'eli5',
        'plotly',
    ]
    # One `pip install <package>` line per package, executed as a single bash script.
    install_script = '\n'.join('pip install ' + package for package in packages)
    get_ipython().run_cell_magic('bash', '', install_script)

    pd.set_option('mode.chained_assignment', None)

In [None]:
%%capture
# %%capture swallows this cell's output, so pip's install log is not shown in the notebook.
install_packages()

In [None]:
import pandas as pd
import geopandas as gpd
import numpy as np
import os

from sqlalchemy import create_engine, inspect

os.chdir("/home/jovyan/Crowd-prediction/Credentials")
import env

from pyspark.sql import SparkSession
from pyspark.sql.functions import substring, length, col, expr
from pyspark.sql.types import *

import requests

from datetime import datetime, timedelta, date
import pytz
from workalendar.europe import Netherlands

from sklearn.metrics import mean_squared_error, mean_absolute_error

os.chdir("/home/jovyan/Crowd-prediction/Operational_code")
import helpers as h

import importlib   # to reload helpers without restarting kernel: importlib.reload(h)

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
#importlib.reload(h)

### Settings

#### Database connection

In [None]:
# Build the database engine from the settings in the `env` module.
# NOTE(review): judging by the names, the second return value lists the available tables — confirm in helpers.prepare_engine.
engine_azure, table_names = h.prepare_engine(env)

#### Arguments for functions

In [None]:
# frequency of sampling for data source to predict
# (pandas frequency string: passed as `freq` to pd.date_range and to the h.preprocess_* helpers)
freq = '15min'

In [None]:
# what period to predict for operational forecast (samples)
predict_period = 8
# how many samples in a day: derived from `freq` so the two settings cannot drift apart
# (96 for the default 15-minute frequency)
n_samples_day = pd.Timedelta("1D") // pd.Timedelta(pd.tseries.frequencies.to_offset(freq))
# how many samples in a week
n_samples_week = n_samples_day*7

In [None]:
# column name(s) of variable to predict (can also be "all")
# Also used as the location argument when loading the Resono data with h.get_data.
Y_name = "Vondelpark Oost" 

# data source (for which the predictions are made)
# Passed to h.prepare_final_dataframe when the output is assembled.
data_source = 'resono'

# type of prediction (count -> regression or level -> classification)
# Predictions are only unscaled afterwards when this is "count".
target = 'count'

In [None]:
# location names of CMSA and parking data
# cmsa_location and parking_location are the location arguments for h.get_data;
# cmsa_location_new and parking_location are the column names later passed as columns to clean.
cmsa_location = 'Stadhouderskade' 
# NOTE(review): presumably h.preprocess_cmsa_data renames the CMSA column to this name — confirm in helpers.
cmsa_location_new = 'Stadhouderskade_cmsa' # To fix possible overlap resono and cmsa location names
parking_location = 'CE-P06 Byzantium' 

In [None]:
# inputs for models
# NOTE: none of these are passed to the linear-regression helpers used in this notebook
# (h.train_model_linear_regression only receives X and y), and they are reassigned in the
# backtesting section. Presumably they are meant for other model types in helpers — confirm.
batch_size = 5
epochs = 10
neurons = 10
drop_out_perc = 0.2
tune_hyperparameters = False
use_sample_weights = False
use_smote = False

In [None]:
# input for starting of learnset
# NOTE(review): train_length = 16 presumably sets the length of the training window (its unit is not
# visible here) and date_str = None presumably means "relative to now" — confirm in helpers.get_start_learnset.
start_learnset = h.get_start_learnset(train_length = 16, date_str = None)

In [None]:
# perform outlier removal ("yes" or "no")
# String flag (not a bool); forwarded unchanged to h.clean_data.
outlier_removal = "no"

In [None]:
# set versions (for storing results)
# Both values are passed to h.prepare_final_dataframe together with the predictions.
current_model_version = 'lr_0_0'
current_data_version = "1_0" 

In [None]:
# define vacations (we might want to put this in a database table in the future)
def _vacation_days(first_day, n_days):
    """Return a one-column frame ('date') holding `n_days` consecutive days from `first_day`."""
    # 'D' is the canonical daily alias; the lowercase spelling is being phased out in newer pandas.
    return pd.DataFrame(data = {'date': pd.date_range(first_day, periods = n_days, freq = 'D')})


# Each period is given as (first day, number of days); 7*k + 2 covers k weeks
# plus the two extra days of the enclosing weekend.
kerst_19 = _vacation_days(date(2019, 12, 21), 7*2 + 2)
voorjaar_20 = _vacation_days(date(2020, 2, 15), 9)
mei_20 = _vacation_days(date(2020, 4, 25), 9)
zomer_20 = _vacation_days(date(2020, 7, 4), 7*6 + 2)
herfst_20 = _vacation_days(date(2020, 10, 10), 9)
kerst_20 = _vacation_days(date(2020, 12, 19), 7*2 + 2)
voorjaar_21 = _vacation_days(date(2021, 2, 20), 9)
mei_21 = _vacation_days(date(2021, 5, 1), 9)
zomer_21 = _vacation_days(date(2021, 7, 10), 7*6 + 2)
herfst_21 = _vacation_days(date(2021, 10, 16), 9)
kerst_21 = _vacation_days(date(2021, 12, 25), 7*2 + 2)

## Main

### 1. Get data

In [None]:
print('Start loading raw data') 

In [None]:
# Raw Resono measurements for the location to predict (Y_name).
# NOTE(review): start_learnset presumably limits how far back the query reaches — confirm in helpers.get_data.
resono_df_raw = h.get_data(engine_azure, "ingested.resono", Y_name, start_learnset)

In [None]:
# Prediction window, anchored on the most recent measurement of the target in the database.

# Resono example
# First slot to predict: one `freq` step after the last measured timestamp.
start_prediction = pd.date_range(resono_df_raw["measured_at"].max(), periods = 2, freq = freq)[-1]
# Last slot to predict: the final one of `predict_period` slots, counting start_prediction as the first.
end_prediction = pd.date_range(start_prediction, periods = predict_period, freq = freq)[-1]

In [None]:
# Raw CMSA data for the configured CMSA location, loaded with the same helper and start as the Resono data.
cmsa_df_raw = h.get_data(engine_azure, "ingested.cmsa", cmsa_location, start_learnset)

In [None]:
# Raw parking data for the configured parking location, loaded with the same helper and start as the Resono data.
parking_df_raw = h.get_data(engine_azure, "ingested.parking", parking_location, start_learnset)

In [None]:
# Dutch public holidays for 2020 and 2021, concatenated in year order.
# NOTE: the years are hard-coded; extend the tuple when the learnset or forecast falls outside them.
holidays_data_raw = [
    holiday
    for year in (2020, 2021)
    for holiday in Netherlands().holidays(year)
]

In [None]:
# Stack all vacation periods into one frame (original row labels are kept, as before).
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and removed in pandas 2.0.
vacation_df_raw = pd.concat([kerst_19, voorjaar_20, mei_20, zomer_20, herfst_20, kerst_20,
                             voorjaar_21, mei_21, zomer_21, herfst_21, kerst_21])

In [None]:
# COVID stringency data between the start of the learnset and the start of the prediction window.
# NOTE(review): both dates are inserted via str(); confirm they render in the format the API expects.
covid_url = 'https://covidtrackerapi.bsg.ox.ac.uk/api/v2/stringency/date-range/'+ str(start_learnset) + "/" + str(start_prediction)
# A timeout keeps an unresponsive API from hanging the notebook indefinitely.
covid_response = requests.get(url = covid_url, timeout = 60)
# Fail here with the HTTP status instead of an opaque JSON/KeyError on the next line.
covid_response.raise_for_status()
covid_df_raw = pd.DataFrame(covid_response.json()['data'])

### 2. Prepare data

In [None]:
print('Start pre-processing data')

In [None]:
# Pre-process the Resono data; this frame is the left-hand side of the join below.
# NOTE(review): presumably resampled to `freq` and extended up to `end_prediction` — confirm in helpers.
resono_df = h.preprocess_resono_data(resono_df_raw, freq, end_prediction)

In [None]:
# Pre-process the CMSA data with the same frequency and end timestamp as the Resono data.
cmsa_df = h.preprocess_cmsa_data(cmsa_df_raw, freq, end_prediction)

In [None]:
# Pre-process the parking data with the same frequency and end timestamp as the Resono data.
# NOTE(review): the effect of remove_ceiling = True is not visible here — see helpers.preprocess_parking_data.
parking_df = h.preprocess_parking_data(parking_df_raw, freq, end_prediction, remove_ceiling = True)

In [None]:
# Pre-process the COVID stringency data fetched from the API above.
covid_df = h.preprocess_covid_data(covid_df_raw, freq, end_prediction)

In [None]:
# Pre-process the public holidays obtained from workalendar.
holiday_df = h.preprocess_holidays_data(holidays_data_raw, freq, end_prediction)

In [None]:
# Pre-process the combined vacation dates defined in the settings section.
vacation_df = h.preprocess_vacation_data(vacation_df_raw, freq, end_prediction)

In [None]:
# pre-process any other data sources here


In [None]:
# Join data
# Successive index-based left joins onto the Resono frame, one source at a time.
df = (
    resono_df
    .join(cmsa_df)
    .join(parking_df)
    .join(covid_df)
    .join(holiday_df)
    .join(vacation_df)
)

### 3. Clean data

In [None]:
print('Start cleaning data')

In [None]:
# Impute/drop missing data and substitute outliers
# Only the parking and CMSA columns are passed as columns to clean.
cols_to_clean = [parking_location, cmsa_location_new]
df = h.clean_data(
    df, target, Y_name, n_samples_day,
    cols_to_clean = cols_to_clean,
    outlier_removal = outlier_removal,
)

In [None]:
# Add time features
# NOTE(review): which features are added is defined in helpers.add_time_variables.
df = h.add_time_variables(df)

In [None]:
# Create new features from the data
# Lag features for the target; runs before the learnset filter below, so rows before
# start_learnset are still present while the lags are computed.
df = h.add_lag_variables(df, Y_name, target, predict_period, n_samples_day, n_samples_week)

In [None]:
# filter data based on start learnset
# Slice by index label: keeps the rows from start_learnset onwards.
# NOTE: reassigns `df`, so re-running this cell alone operates on the already-filtered frame.
df = df[start_learnset:]

In [None]:
# drop time slots for which missing values remain
# NOTE: any row with a NaN in any column is removed, which can leave gaps in the time index.
df = df.dropna()

In [None]:
# scale dataset
# y_scaler is kept so that predictions can be converted back with h.unscale_y later on.
df, y_scaler = h.scale_variables(df, Y_name, target, method = "standard")

### 4. Create model dataframes

In [None]:
# Training features and target.
# NOTE(review): presumably only rows before start_prediction are used — confirm in helpers.get_train_df.
df_X_train, df_y_train = h.get_train_df(df, Y_name, start_prediction) 

In [None]:
# Frame that will receive the predictions; the Y_name column is filled in after the model is trained.
# NOTE(review): presumably indexed by the predict_period time slots starting at start_prediction — confirm.
df_y_predict = h.get_future_df(start_prediction, predict_period, freq)

In [None]:
# Feature matrix: every column except the target.
# `columns=` replaces the positional axis argument, which pandas 2.0 no longer accepts.
df_X_predict = df.drop(columns = Y_name)

# For variables that are not known yet, use historical values
# (shifted by n_samples_week rows, i.e. exactly one week only when the index has no gaps).
# The loop variable is deliberately not called `col`, so that it does not overwrite
# pyspark.sql.functions.col from the import cell.
for unknown_col in cols_to_clean:
    df_X_predict[unknown_col] = df_X_predict[unknown_col].shift(n_samples_week)

# Select features for prediction period
df_X_predict = df_X_predict[start_prediction:end_prediction]

### 5. Create operational prediction

In [None]:
print('Start modelling')

In [None]:
# Fit the linear-regression model on the training data.
# NOTE: `model` is reassigned in the backtesting section further down.
model = h.train_model_linear_regression(df_X_train, df_y_train)

In [None]:
# Predict the target for the prediction window and store it in the target column.
df_y_predict[Y_name] = h.test_model_linear_regression(model, df_X_predict)

In [None]:
# unscale prediction
# Only applied for target == "count"; uses the scaler returned by h.scale_variables.
if target == "count":
    df_y_predict = h.unscale_y(df_y_predict, y_scaler)

### 6. Prepare output

In [None]:
print('Start preparing data')

In [None]:
# Combine the predictions with the raw Resono data and the source/target/version settings
# into the frame that is stored in the next section.
final_df = h.prepare_final_dataframe(df_y_predict, resono_df_raw, data_source, target, current_model_version, current_data_version)

### 7. Store data

In [None]:
print('Start storing data')

In [None]:
# Resono example

#final_df.to_sql('resono_predictions_test', con = engine_azure, if_exists = 'append', index = False)

In [None]:
print('Finished storing data')

### 8. Check operational prediction

In [None]:
final_df.head()

### 9. Backtesting 

#### Prepare training and test data frames

In [None]:
# Input for backtesting
# NOTE: this cell overrides predict_period and the model inputs set in the Settings section.
# Re-run the settings cells before re-running the operational forecast above.

# Start testing from this timestamp until the most recent time slot
start_test = "2021-04-01 00:00:00"
# What period to predict for backtesting (samples): 30 days' worth of samples
# (uses n_samples_day instead of repeating the literal 96)
predict_period = n_samples_day*30

# inputs for models
# NOTE: not passed to the linear-regression helpers used in this notebook.
prediction_window = 8
batch_size = 2
epochs = 10
neurons = 20
drop_out_perc = 0.2
learning_rate = 0.01
tune_hyperparameters = False
use_smote = True
use_sample_weights = False

# columns for which we need future values
cols_unknown = [parking_location, cmsa_location_new]

In [None]:
# Prepare data
# One call returns the prediction frame, the training target, the ground truth
# (unscaled and scaled) and the feature frames for training and prediction.
(
    df_y_predict_bt,
    df_y_train_bt,
    df_y_ground_truth_bt,
    df_y_ground_truth_bt_scaled,
    df_X_train_bt,
    df_X_predict_bt,
) = h.prepare_backtesting(
    start_test, predict_period, freq,
    df, Y_name, cols_unknown,
    n_samples_week, target, y_scaler,
)

#### Train, predict and evaluate benchmark model

In [None]:
# Benchmark predictions
# Work on a copy so the shared prediction frame stays untouched for the model run.
df_y_benchmark = df_y_predict_bt.copy()
df_y_benchmark[Y_name] = h.test_model_avg_3_weeks_bt(
    df_y_train_bt, df_y_predict_bt, df_y_ground_truth_bt_scaled,
    predict_period, n_samples_week, target,
)
# Counts were scaled before modelling; convert back before scoring against the unscaled ground truth.
if target == "count":
    df_y_benchmark = h.unscale_y(df_y_benchmark, y_scaler)
error_metrics_benchmark = h.evaluate(
    df_y_benchmark, df_y_ground_truth_bt, target, print_metrics = True
)

#### Train, predict and evaluate models

In [None]:
# Model predictions
# Work on a copy so the shared prediction frame stays untouched.
df_y_model = df_y_predict_bt.copy()
# NOTE: this replaces the operational `model` fitted earlier with one fitted on the backtesting train set.
model = h.train_model_linear_regression(df_X_train_bt, df_y_train_bt)
df_y_model[Y_name] = h.test_model_linear_regression(model, df_X_predict_bt)
# Counts were scaled before modelling; convert back before scoring against the unscaled ground truth.
if target == "count":
    df_y_model = h.unscale_y(df_y_model, y_scaler)
error_metrics_model = h.evaluate(
    df_y_model, df_y_ground_truth_bt, target, print_metrics = True
)

In [None]:
# Visualize backtesting result
# Plots the ground truth, benchmark and model predictions; the model's error metrics are passed along.
fig = h.visualize_backtesting(df_y_ground_truth_bt, df_y_benchmark, df_y_model, target, Y_name, error_metrics_model)
# Bare expression as last line so the notebook renders the figure.
fig

#### Tune (hyper-)parameters

#### Feature importance

In [None]:
# Feature importance from the coefficients of the backtesting model, labelled with the training columns.
# NOTE(review): model.coef_[0] assumes coef_ is 2-D (one row per target) — confirm for the fitted model.
# NOTE: `fig` from the backtesting plot is overwritten here.
feature_importance, fig = h.feature_importance(model.coef_[0], list(df_X_train_bt.columns))
# Bare expression as last line so the notebook renders the figure.
fig