# Resono 2h predictions 

Make 2h-ahead predictions of the visitor counts ('total_count' column in the 'ingested.resono' table) for all or a selection of Resono locations that are included in druktebeeld. 

Predictions are written to a new table **'public.resono_2h_pred_count'** or **'public.resono_2h_pred_level'** (depending on whether the visitor counts or crowd levels are predicted) with the following additional columns: 
- **'total_count_predicted'**/**'crowd_level_predicted'**: predicted total counts/crowd levels (for the next 8 time slots per location) 
- **'data_version'**: version of the data (feature set)
- **'model_version'**: version of the model (type and settings)
- **'predicted_at'**: timestamp of prediction (moment prediction was made)

### Preparations

In the code block below, change the working directory first to the folder that contains the DB credentials and then to the folder that contains the function files.

In [None]:
def install_packages():
    """(Re-)install the extra packages used by this notebook.

    The pip commands are executed through IPython's ``bash`` cell magic, so
    this function only works inside an IPython/Jupyter session. Afterwards
    the pandas option ``mode.chained_assignment`` is set to ``None``.
    """
    pip_script = 'pip install imblearn\npip install mord\npip install psycopg2-binary\npip install workalendar\npip install eli5\n pip install plotly'
    shell = get_ipython()
    shell.run_cell_magic('bash', '', pip_script)

    # pandas is imported only after the install commands above have run.
    import pandas
    pandas.set_option('mode.chained_assignment', None)

In [None]:
%%capture
# Install the dependencies; the %%capture cell magic above keeps the pip
# output out of the notebook.
install_packages()

In [None]:
#pip install scikit-learn==0.24.2  # Run if sklearn error 

In [None]:
import os
import pandas as pd
# env_az is imported from the credentials folder, so switch to that folder first.
os.chdir("/home/jovyan/Crowd-prediction/Credentials")
import env_az
# The helper modules below are imported from the resono_2h notebook folder;
# the working directory stays there for the rest of the notebook.
os.chdir("/home/jovyan/gitops/central_storage_analyses/notebooks_predictions/resono_2h")
import prediction_model_helpers_bt as h  # Universal predictions
import resono_2h_predictions_bt as resono_pred  # Resono 2h model specific
#import importlib  # For when coding

### Settings

#### Arguments for functions

In [None]:
# Sampling frequency of the data source to predict (passed as `freq` to the
# data preparation, prediction and backtesting helpers)
freq = '15min'

In [None]:
# Horizon of the operational forecast, expressed in samples
predict_period = 8
# Number of samples in one day and in one week
n_samples_day = 96
n_samples_week = 7 * n_samples_day

In [None]:
# Column name(s) of the variable(s) to predict; a list of location names,
# or the string "all"
#Y_names = "all"
Y_names = [
    "Albert Cuyp",
    "De Dam West",
    "De Dam Oost",
    "Kalverstraat Noord",
    "Kalverstraat Zuid",
    "Vondelpark Oost 1",
    "Vondelpark Oost 2",
    "Vondelpark Oost 3",
    "Vondelpark West",
    "Rembrandtplein",
    "Leidseplein",
    "Nieuwmarkt",
    "Buikslotermeerplein",
    "Rembrandtpark",
    "Westerpark Centrum",
    "Westerpark West",
    "Westerpark Oost",
    "Oosterpark",
    "Erasmuspark",
    "Flevopark",
    "Park Frankendael",
    "Park Somerlust",
    "Bijlmerplein",
    "Waterlooplein",
    "Sarphatipark",
    "Rokin",
    "Spui",
    "Damrak",
    "Nieuwendijk",
]

# Data source for which the predictions are made
data_source = "resono"

# Type of prediction: "count" -> regression, "level" -> classification
target = "count"

In [None]:
# Model input: passed as `use_smote` to resono_pred.get_resono_predictions
use_smote = True

In [None]:
# Start of the learn set, as returned by the helper for train_length = 8 and
# no explicit date string (for the unit of train_length see
# h.get_start_learnset)
start_learnset = h.get_start_learnset(
    train_length=8,
    date_str=None,
)

In [None]:
# Outlier removal switch ("yes" or "no"), passed to
# resono_pred.get_resono_predictions
outlier_removal = "yes"

In [None]:
# Version labels of the model and of the data (for storing results); both are
# passed on to resono_pred.get_resono_predictions
current_model_version = "lr_0_0"
current_data_version = "1_0"

### Get predictions

#### 1. Prepare data sets

In [None]:
# Prepare the data sets; the helper returns seven objects that are used by
# the prediction and backtesting cells further down.
(
    base_df,
    resono_df,
    resono_df_raw,
    start_prediction,
    end_prediction,
    thresholds,
    Y_names_all,
) = resono_pred.prepare_data(
    env_az,
    freq,
    predict_period,
    n_samples_day,
    Y_names,
    target,
    start_learnset,
)

#### 2. Make predictions and store in data frame

In [None]:
# --- remove in version without backtesting
# Per-location objects that the backtesting cells below read again
prepared_dfs = dict()
y_scalers = dict()
thresholds_scaled = dict()
# ---

# Prediction frames per location; they are concatenated once after the loop
# instead of growing a data frame inside the loop.
prediction_frames = []

# Predict for each location
for Y in Y_names_all:

    # Show location
    print(Y)

    # Preprocessed data frame for this location
    preprocessed_df = resono_pred.get_location_df(base_df, resono_df, Y)

    # Gather predictions for this location
    prepared_df, predictions, y_scaler, thresholds_scaled_one = resono_pred.get_resono_predictions(
        preprocessed_df, resono_df_raw, freq, predict_period, n_samples_day,
        n_samples_week, Y, data_source, target,
        outlier_removal, start_learnset, use_smote,
        current_model_version, current_data_version,
        start_prediction, end_prediction, thresholds)

    # Keep the predictions of this location for the final data frame
    prediction_frames.append(predictions)

    # --- remove in version without backtesting
    prepared_dfs[Y] = prepared_df
    y_scalers[Y] = y_scaler
    thresholds_scaled[Y] = thresholds_scaled_one
    # ---

# Data frame with the predictions of all locations. `axis` is passed by
# keyword because pandas >= 2.0 no longer accepts it positionally; without
# any location the result is an empty data frame.
if prediction_frames:
    final_df = pd.concat(prediction_frames, axis=0)
else:
    final_df = pd.DataFrame()

In [None]:
###  Store data

# if target == 'count':
    #final_df.to_sql('resono_2h_pred_count', con = engine_azure, if_exists = 'append', index = False)
# elif target == "level":
    #final_df.to_sql('resono_2h_pred_level', con = engine_azure, if_exists = 'append', index = False)

### Check operational prediction

In [None]:
# Display the data frame with the operational predictions of all locations
final_df

### Backtesting --- remove code blocks below in version without backtesting

Test model predictions for the selected locations (argument at the beginning) and time period (start_test; within the time period for which the data has been prepared).

**Important**: If you test using dates further back in time, you need to enlarge the training set for the operational predictions so that the backtesting set contains enough data as well.

In [None]:
# Input for backtesting

# Backtest from this timestamp up to the most recent time slot
start_test = "2021-05-01 00:00:00"
# Backtesting horizon in samples; this reassigns predict_period, which held
# the operational horizon until this cell is run
predict_period = 31 * 96

# Model input (reassigns the use_smote setting used for the operational run)
use_smote = True

In [None]:
# If using a NN/LSTM model, it is necessary to also install these libraries
# Related functions have to be uncommented in prediction_model_helpers.py
#pip install keras
#pip install tensorflow

In [None]:
# Perform backtesting
#
# For every location a benchmark prediction and a ridge-regression model
# prediction are evaluated against the ground truth. This cell reads
# prepared_dfs, y_scalers and thresholds_scaled, which are filled by the
# prediction loop earlier in the notebook.

# Store results
# locations, rmse_benchmarks and rmse_models are parallel lists (one entry per
# backtested location); the dicts are keyed by location name.
locations = []
rmse_benchmarks = []
rmse_models = []
figs_pred_time = dict()
figs_conf_mat = dict()
feat_imps = dict()
figs_feat_imp = dict()

# Predict for each location
for idx, Y in enumerate(Y_names_all):
    
    # Show location
    print(Y)
    
    # Prepare data
    # Uses the prepared data frame and y scaler stored for this location.
    df_y_predict_bt, df_y_train_bt, df_y_ground_truth_bt, df_y_ground_truth_bt_scaled, df_X_train_bt, df_X_predict_bt = h.prepare_backtesting(start_test, predict_period, freq, 
                                                                                   prepared_dfs[Y], Y, 
                                                                                   n_samples_week, target, y_scalers[Y])
    
    
    # Do not perform backtesting if there is not enough training data 
    # Skipping happens before anything is appended, so the result lists stay
    # aligned with each other.
    if df_X_train_bt.empty:
        print("Not enough training data: no backtesting performed.")
        continue
    
    # Benchmark predictions
    df_y_benchmark = df_y_predict_bt.copy()
    df_y_benchmark[Y] = h.test_model_avg_3_weeks_bt(df_y_train_bt, df_y_predict_bt, df_y_ground_truth_bt_scaled, predict_period, 
                                                   n_samples_week, target)
    # Only the "count" target is unscaled with the location's y scaler
    if target == "count":
        df_y_benchmark = h.unscale_y(df_y_benchmark, y_scalers[Y])
        
    error_metrics_benchmark = h.evaluate(df_y_benchmark, df_y_ground_truth_bt, target, count_to_level = True,
                                     Y_name = Y, thresholds = thresholds, print_metrics = False)
    
    rmse_benchmarks.append(error_metrics_benchmark['rmse'])
    
    # Model predictions
    df_y_model = df_y_predict_bt.copy()
    
    # thresholds_scaled is the dict with the scaled thresholds of all locations
    model = h.train_model_ridge_regression(df_X_train_bt, df_y_train_bt, Y, target, thresholds_all = thresholds_scaled, use_smote = use_smote)
    df_y_model[Y] = h.test_model_ridge_regression(model, df_X_predict_bt)
    # Same unscaling and evaluation as for the benchmark predictions
    if target == "count":
        df_y_model = h.unscale_y(df_y_model, y_scalers[Y])
    error_metrics_model = h.evaluate(df_y_model, df_y_ground_truth_bt, target, count_to_level = True,
                                 Y_name = Y, thresholds = thresholds, print_metrics = False)
    
    rmse_models.append(error_metrics_model['rmse'])
    
    # Visualize backtesting result
    fig_pred_time, fig_conf_mat = h.visualize_backtesting(df_y_ground_truth_bt, df_y_benchmark, df_y_model, target, Y, 
                                        error_metrics_model, count_to_level = True)
    figs_pred_time[Y] = fig_pred_time
    figs_conf_mat[Y] = fig_conf_mat
    
    # Feature importance
    # NOTE(review): coef_[0] takes the first entry of the fitted coefficients;
    # confirm this matches the coefficient shape of the model returned by
    # h.train_model_ridge_regression.
    feat_imp, fig_feat_imp = h.feature_importance(model.coef_[0], list(df_X_train_bt.columns))
    feat_imps[Y] = feat_imp
    figs_feat_imp[Y] = fig_feat_imp
    
    # Register the location only after all steps above have succeeded
    locations.append(Y)

In [None]:
# Backtesting results for all locations, built from the parallel lists filled
# in the backtesting loop (location names, model RMSEs, benchmark RMSEs)
df_results = h.backtesting_results_all_locations(locations, rmse_models, rmse_benchmarks)

In [None]:
# Summarized results: descriptive statistics of the backtesting results table
df_results.describe()

In [None]:
# Locations for which the benchmark model performs better
# NOTE(review): this assumes a positive 'RMSE_difference' means the model RMSE
# is higher than the benchmark RMSE; confirm against
# h.backtesting_results_all_locations.
df_results[df_results['RMSE_difference'] > 0]

#### Query results for specific location

In [None]:
# Backtesting results row(s) for a single location
df_results[df_results['Location'] == "Albert Cuyp"]

In [None]:
# First figure returned by h.visualize_backtesting for this location
figs_pred_time["Albert Cuyp"]

In [None]:
# Second figure returned by h.visualize_backtesting for this location
figs_conf_mat["Albert Cuyp"]

In [None]:
# Feature importance figure returned by h.feature_importance for this location
figs_feat_imp["Albert Cuyp"]