# Create multi-model ensembles (MME)
1. Equally weighted ensemble (same weight to each quantile).
2. MME using the degenerate EM algorithm.

In [15]:
from matplotlib.dates import date2num, num2date
from matplotlib.colors import ListedColormap
from matplotlib import dates as mdates
from matplotlib.patches import Patch
from matplotlib import pyplot as plt
from matplotlib import ticker

import pandas as pd
import numpy as np
import itertools
import sys
import re
import os

sys.path.insert(0, "..")

from global_config import config

# Root directories for generated results and for input data, looked up in the project-wide config object.
results_dir   = config.get_property('results_dir')
data_dir      = config.get_property('data_dir')

def create_df_ensemble(weights_df, forecast_df_list, name_models):
    """Combine the model forecasts into a single weighted-sum forecast.

    The forecast at position ``i`` of ``forecast_df_list`` is multiplied by
    the ``"weigth"`` entry that ``weights_df`` holds for ``name_models[i]``;
    the scaled forecasts are then added together.

    Args:
        weights_df: Table indexed by model name with a ``"weigth"`` column.
        forecast_df_list: Forecasts to combine, one per model.
        name_models: Model names, in the same order as ``forecast_df_list``.

    Returns:
        The sum of the weighted forecasts (``0`` when the list is empty).
    """
    weighted_forecasts = []
    for position, forecast_df in enumerate(forecast_df_list):
        model_weight = weights_df.loc[name_models[position]]["weigth"]
        weighted_forecasts.append(forecast_df * model_weight)
    return sum(weighted_forecasts)

In [16]:
# Every model writes its forecasts to its own folder under <results_dir>/forecast.
forecast_root = os.path.join(results_dir, "forecast")
path_to_frcst_1, path_to_frcst_2, path_to_frcst_3, path_to_frcst_4 = (
    os.path.join(forecast_root, model_dir)
    for model_dir in ("arima", "eakf_model1", "eakf_model2", "eakf_model3")
)

# Forecast dates are the dates of the processed data from row 11 onwards.
data_df         = pd.read_csv(os.path.join(data_dir, "processed_data_us.csv"), parse_dates=["date"])
dates_forecasts = data_df["date"][11:].values


In [17]:
from utils.utils_frcst import degenerate_em_weights


## Equally weighted ensemble

In [18]:

# Build the equally weighted ensemble for every forecast date and save it as
# <results_dir>/forecast/ensemble/equal_weights/<date>.csv.
name_models = ['arima', 'eakf_model1', 'eakf_model2', "eakf_model3"]

# (forecast folder, columns to discard) per model, in the same order as name_models.
frcst_sources = [
    (path_to_frcst_1, ["forecast_date"]),
    (path_to_frcst_2, ["Unnamed: 0", "type", "std", "forecast_date"]),
    (path_to_frcst_3, ["Unnamed: 0", "type", "std", "forecast_date"]),
    (path_to_frcst_4, ["Unnamed: 0", "type", "std", "forecast_date"]),
]

# Create the output folder up front; to_csv does not create missing directories.
path_to_save_ens = os.path.join(results_dir, "forecast", "ensemble", "equal_weights")
os.makedirs(path_to_save_ens, exist_ok=True)

for date_use in dates_forecasts:
    date_str = pd.to_datetime(date_use).strftime("%Y-%m-%d")

    # Keep the first six rows of each model's forecast, indexed by target date.
    f_list = [
        pd.read_csv(os.path.join(frcst_dir, f"{date_str}.csv"), parse_dates=["date", "forecast_date"])
        .drop(columns=drop_cols)
        .set_index(["date"])
        .iloc[:6]
        for frcst_dir, drop_cols in frcst_sources
    ]

    # Identical scores for the four models; the call is kept per date, as before,
    # because degenerate_em_weights is defined elsewhere.
    w_df   = degenerate_em_weights(np.array([[1, 1, 1, 1]])/4, models_name=name_models)
    ens_df = create_df_ensemble(w_df, f_list, name_models)

    ens_df.to_csv(os.path.join(path_to_save_ens, f"{date_str}.csv"))


## MME using the EM algorithm

In [19]:
# Forecasts and their scores, both stored in the evaluation folder.
evaluation_dir = os.path.join(results_dir, "forecast", "evaluation")

forecasts_df = pd.read_csv(os.path.join(evaluation_dir, "forecasts.csv"), parse_dates=["forecast_date"])
forecasts_df = forecasts_df.dropna(axis=1)  # discard every column that contains a missing value
evals_df     = pd.read_csv(os.path.join(evaluation_dir, "scores.csv"), parse_dates=["frsct_date"])

### 1. Use all past performance data to train the model

In [20]:
# MME trained on all past scores: for every forecast date and horizon, fit the
# ensemble weights on the WIS of all earlier forecast dates, then save both the
# weights and the weighted ensemble forecast.
forecast_windows = {"1m": 1, "2m": 2, "3m": 3, "4m": 4, "5m": 5, "6m": 6}
scores    = ["wis"]
dates_use = dates_forecasts[6:]

# Model labels as they appear in the `model` column of forecasts_df.
forecast_models = ['arima', 'model1', 'model2', 'model3']

for idx_date_use, date_use in enumerate(dates_use):
    # dates_use starts at position 6 of dates_forecasts.
    date_use_idx = 6 + idx_date_use
    date_str     = pd.to_datetime(date_use).strftime("%Y-%m-%d")

    # The forecasts issued on this date are the same for every horizon, so
    # they are selected once per date (first six rows per model).
    f_date_df = forecasts_df[forecasts_df["forecast_date"] == date_str]
    f_list    = [
        f_date_df[f_date_df.model == model]
        .iloc[:6]
        .set_index(["date"])
        .drop(columns=["add", "model", "forecast_date"])
        for model in forecast_models
    ]

    for fw, n_months in forecast_windows.items():
        fw_int = n_months - 1

        # Training dates: every forecast date before position date_use_idx - fw_int.
        prev_dates = pd.to_datetime(dates_forecasts[:date_use_idx - fw_int])

        evals_use_df = evals_df[evals_df.frsct_date.isin(prev_dates)]
        evals_use_df = evals_use_df[evals_use_df.eval_horizon == fw]

        # One row per past forecast date, one column per scored method.
        eval_ma_df  = pd.pivot(evals_use_df, index="frsct_date", columns="method", values="wis")
        name_models = list(eval_ma_df.keys())
        # NOTE(review): create_df_ensemble pairs f_list with name_models by
        # position; confirm the pivot's column order matches forecast_models.

        # Scale each row by its L2 norm and flip it, so a lower WIS gives a larger value.
        wis_matrix      = eval_ma_df.to_numpy()
        normalize_score = 1 - wis_matrix/np.linalg.norm(wis_matrix, axis=1, keepdims=True)

        w_df   = degenerate_em_weights(normalize_score, models_name=name_models)
        ens_df = create_df_ensemble(w_df, f_list, name_models)

        path_to_save_ens = os.path.join(results_dir, "forecast", "ensemble", "all_past", "wis", fw)
        # Fixed: this line used the misspelled name `pasth_to_save_ens` (NameError).
        path_to_save_w   = os.path.join(path_to_save_ens, "weights")

        # Creating the nested weights folder also creates path_to_save_ens.
        os.makedirs(path_to_save_w, exist_ok=True)

        w_df.to_csv(os.path.join(path_to_save_w, f"{date_str}.csv"))
        ens_df.to_csv(os.path.join(path_to_save_ens, f"{date_str}.csv"))


In [21]:
# Gather the saved "all past" weights of every forecast date and horizon into
# one table, tagged with the date and the horizon they belong to.
forecast_windows = {"1m": 1, "2m": 2, "3m": 3, "4m": 4, "5m": 5, "6m": 6}
scores = ["wis"]
dates_use = dates_forecasts[6:]

weight_frames = []
# Dates vary slowest and horizons fastest, which fixes the row order of the table.
for date_use, fw in itertools.product(dates_use, forecast_windows):
    date_str    = pd.to_datetime(date_use).strftime("%Y-%m-%d")
    weights_dir = os.path.join(results_dir, "forecast", "ensemble", "all_past", "wis", fw, "weights")

    tagged_w_df = pd.read_csv(os.path.join(weights_dir, f"{date_str}.csv")).assign(
        date_forecast=date_str,
        forecast_window=fw,
    )
    weight_frames.append(tagged_w_df)

weights_df = pd.concat(weight_frames)
weights_df.to_csv(os.path.join(results_dir, "forecast", "evaluation", "weights_AllPast.csv"))

### 2. Use the past k months of performance data to train the model
We used k from 2 to 6 months.

In [22]:
# MME trained on a sliding window: for every window length K, forecast date and
# horizon, fit the ensemble weights on the scores of the K forecast dates that
# precede the cut-off, then save both the weights and the ensemble forecast.
forecast_windows = {"1m": 1, "2m": 2, "3m": 3, "4m": 4, "5m": 5, "6m": 6}
score_use_to_ensemble = "wis"

dates_use       = dates_forecasts[6:]
past_points_use = [2, 3, 4, 5, 6]

# Model labels as they appear in the `model` column of forecasts_df.
forecast_models = ['arima', 'model1', 'model2', 'model3']

for K in past_points_use:
    for idx_date_use, date_use in enumerate(dates_use):
        # dates_use starts at position 6 of dates_forecasts.
        date_use_idx = 6 + idx_date_use
        date_str     = pd.to_datetime(date_use).strftime("%Y-%m-%d")

        # The forecasts issued on this date are the same for every horizon, so
        # they are selected once per date (first six rows per model).
        f_date_df = forecasts_df[forecasts_df["forecast_date"] == date_str]
        f_list    = [
            f_date_df[f_date_df.model == model]
            .iloc[:6]
            .set_index(["date"])
            .drop(columns=["add", "model", "forecast_date"])
            for model in forecast_models
        ]

        for fw, n_months in forecast_windows.items():
            fw_int = n_months - 1

            window_end   = date_use_idx - fw_int
            window_start = window_end - K
            if window_start < 0:
                # Fewer than K earlier forecast dates exist. A negative slice
                # start would wrap around to the end of the array, so skip
                # explicitly instead of relying on the slice being empty.
                continue

            prev_dates = pd.to_datetime(dates_forecasts[window_start:window_end])

            evals_use_df = evals_df[evals_df.frsct_date.isin(prev_dates)]
            evals_use_df = evals_use_df[evals_use_df.eval_horizon == fw]

            if evals_use_df.empty:
                # No scores for this window and horizon: nothing to train on.
                continue

            # One row per past forecast date, one column per scored method.
            eval_ma_df  = pd.pivot(evals_use_df, index="frsct_date", columns="method", values=score_use_to_ensemble)
            name_models = list(eval_ma_df.keys())
            # NOTE(review): create_df_ensemble pairs f_list with name_models by
            # position; confirm the pivot's column order matches forecast_models.

            # Scale each row by its L2 norm and flip it, so a lower score gives a larger value.
            score_matrix    = eval_ma_df.to_numpy()
            normalize_score = 1 - score_matrix/np.linalg.norm(score_matrix, axis=1, keepdims=True)

            w_df   = degenerate_em_weights(normalize_score, models_name=name_models)
            ens_df = create_df_ensemble(w_df, f_list, name_models)

            path_to_save_ens = os.path.join(results_dir, "forecast", "ensemble", f"{K}_months_past", fw)
            path_to_save_w   = os.path.join(path_to_save_ens, "weights")

            # Creating the nested weights folder also creates path_to_save_ens.
            os.makedirs(path_to_save_w, exist_ok=True)

            w_df.to_csv(os.path.join(path_to_save_w, f"{date_str}.csv"))
            ens_df.to_csv(os.path.join(path_to_save_ens, f"{date_str}.csv"))
