## Tree Based Ensemble Models for Price Returns Forecasting

### Approach
1. Set Up
2. Identifying Suitable Lags for Price, Volume & Uncertainty Indices
3. Modelling w/ Grid Search & Forecast Evaluation
   1. Random Forest (h = 1, 4, 12)
      1. Model A (With Price Returns, Price & Volume)
      2. Model B (With A + Lucey Original Price) 
      3. Model C (With A + Lucey Reddit Price)
      4. Model D (With A + LDA Reddit Price)
      5. Model E (With A + Top2Vec Reddit Price)
      6. Model F (With A + VCRIX)
      7. Policy Based Models?

TBD:
1. Use Technical Indicators Common Factors as additional Baseline
2. Add in More Horizon Values
3. Use AIC / BIC 
4. Add in Historical Forecast Plots

### Set Up

In [1]:
# NB config
%load_ext autoreload
%autoreload 2

# Load Libraries
import os
os.chdir("../../")
from typing import Dict, Optional
import pandas as pd
import numpy as np
from pprint import pprint
from pathlib import Path
from datetime import datetime
from darts import TimeSeries
from darts.metrics import (
    mape,
    mse,
)
from tqdm import tqdm
from darts import concatenate
from darts.utils import statistics as dstats
from darts.models.forecasting.random_forest import (
    RandomForest,
)
from darts.models.forecasting.gradient_boosted_model import (
    LightGBMModel,
)
from sklearn.model_selection import ParameterGrid
import warnings
warnings.filterwarnings("ignore")

### Data Preparation

In [2]:
# Data Dir
data_dir = Path("forecasting/data/modelling")

# BTC-USD data
btc_usd_fp = data_dir / "btc_usd_weekly.csv"
btc_usd_df = pd.read_csv(btc_usd_fp)

# UCRY Indices data
ucry_fp = data_dir / "ucry_indices_weekly.csv"
ucry_df = pd.read_csv(ucry_fp)


#### Create ***h***-weeks Log Price Returns Time Series

In [3]:
# h = 1 (Weekly Price Returns)
btc_usd_df["Price Returns (h=1)"] = np.log1p(btc_usd_df[["Price"]].pct_change(1))

# h = 4 (4 Week Price Returns)
btc_usd_df["Price Returns (h=4)"] = np.log1p(btc_usd_df[["Price"]].pct_change(4))

# h = 12 (12 Week Price Returns)
btc_usd_df["Price Returns (h=12)"] = np.log1p(btc_usd_df[["Price"]].pct_change(12))

# Create TimeSeries
# h = 1 (Weekly Price Returns)
btc_usd1_ts = TimeSeries.from_dataframe(btc_usd_df[["Date", "Price Returns (h=1)"]].dropna(), time_col="Date")

# h = 4 (4 Week Price Returns)
btc_usd4_ts = TimeSeries.from_dataframe(btc_usd_df[["Date", "Price Returns (h=4)"]].dropna(), time_col="Date")

# h = 12 (12 Week Price Returns)
btc_usd12_ts = TimeSeries.from_dataframe(btc_usd_df[["Date", "Price Returns (h=12)"]].dropna(), time_col="Date")

### Create Price and Volume Time Series

In [4]:
price_ts = TimeSeries.from_dataframe(btc_usd_df[["Date", "Price"]], time_col="Date")

In [5]:
vol_ts = TimeSeries.from_dataframe(btc_usd_df[["Date", "Volume"]], time_col="Date")

#### Create UCRY Indices Time Series

In [6]:
# Create TimeSeries
sel_cols = ["Date", "Index Value"]
time_col = "Date"

# Lucey Price
lucey_price = (
    TimeSeries.from_dataframe(
        ucry_df[ucry_df.Index == "Lucey-Original-Price"].reset_index()[sel_cols],
        time_col=time_col
    )
)

# Lucey Reddit Price
lucey_reddit_price = (
    TimeSeries.from_dataframe(
        ucry_df[ucry_df.Index == "Lucey-Reddit-Price"].reset_index()[sel_cols],
        time_col=time_col
    )
)

# LDA Price
lda_price = (
    TimeSeries.from_dataframe(
        ucry_df[ucry_df.Index == "LDA-Reddit-Price"].reset_index()[sel_cols],
        time_col=time_col
    )
)

# Top2Vec Price
t2v_price = (
    TimeSeries.from_dataframe(
        ucry_df[ucry_df.Index == "Top2Vec-Reddit-Price"].reset_index()[sel_cols],
        time_col=time_col
    )
)

# VCRIX
vcrix = (
    TimeSeries.from_dataframe(
        ucry_df[ucry_df.Index == "VCRIX"].reset_index()[sel_cols],
        time_col=time_col
    )
)

#### Train Test Split Date

In [7]:
# Split into Train and Test
split_date = pd.Timestamp("20190527")

### Identify Suitable Lags for UCRY Index Time Series
* STATUS: Use **t** variables to predict **t + h** variables for now

### Random Forest Forecasting Model & Evaluation

#### Define Params Grid for Grid Search

In [8]:
# Params Grid

rf_params_grid = {
    'n_estimators': [50, 100, 300],
    'max_depth': [2, 5, 10],
    'criterion': ['squared_error'],
    'max_features': [1/3, 'auto'],
    'n_jobs': [-1],
}

rf_params_list = list(ParameterGrid(rf_params_grid))
len(rf_params_list)

18

### Random Forest GridSearch Helper

In [9]:
# Run Grid Search (Runs for super long may be buggy - Revisit in the future)
# rfA_1_best_model, rfA_1_best_params = RandomForest(
#    lags=1,
#     lags_past_covariates=1
# ).gridsearch(
#     parameters=rf_params_grid,
#     series=btc_usd1_ts,
#     past_covariates=rfA_1_past_covs,
#     forecast_horizon=1,
#     stride=1,
#     start=split_date,
#     metric=mape,
#     reduction=np.mean,
#     verbose=True,
#     n_jobs=-1
# )
# 
# pprint(rfA_1_best_params)

In [10]:
# Homemade RF Grid Search

def gridsearch_RF(
    series: TimeSeries,
    past_covariates: TimeSeries,
    forecast_horizon: int, 
    lags: int,
    lags_past_covariates: int,
    verbose: bool = False,
):

    min_error = np.float("inf")
    best_params = None

    for params in tqdm(rf_params_list):
        model = RandomForest(
            lags=lags,
            lags_past_covariates=lags_past_covariates
        )
        error = model.backtest(
            series=series,
            past_covariates=past_covariates,
            forecast_horizon=forecast_horizon,
            stride=1,
            start=split_date,
            metric=mape,
            reduction=np.mean,
            verbose=verbose,
        )
        if error < min_error:
            min_error = error
            best_params = params

    print("Average error (min_error) over all historical forecasts: %.2f" % min_error)
    print("Best Params: %s" % str(best_params))

    return best_params, min_error

#### Model A (Price Returns, Price & Volume)

##### h = 1

In [11]:
# Model A (h = 1) Past Covariates
rfA_1_past_covs = concatenate(
    [btc_usd1_ts,
    price_ts.slice_intersect(btc_usd1_ts),
    vol_ts.slice_intersect(btc_usd1_ts)],
    axis=1
)

In [12]:
rfA_1_best_params, rfA_1_min_error = gridsearch_RF(
    btc_usd1_ts,
    rfA_1_past_covs,
    1,
    1,
    1
)

100%|██████████| 18/18 [03:39<00:00, 12.20s/it]

Average error (min_error) over all historical forecasts: 211.96
Best Params: {'criterion': 'squared_error', 'max_depth': 2, 'max_features': 'auto', 'n_estimators': 100, 'n_jobs': -1}





#### h = 4

In [13]:
# Model A (h = 4) Past Covariates
rfA_4_past_covs = concatenate(
    [btc_usd4_ts,
    price_ts.slice_intersect(btc_usd4_ts),
    vol_ts.slice_intersect(btc_usd4_ts)],
    axis=1
)

In [14]:
rfA_4_best_params, rfA_4_min_error = gridsearch_RF(
    btc_usd4_ts,
    rfA_4_past_covs,
    1,
    1,
    1
)

100%|██████████| 18/18 [03:11<00:00, 10.61s/it]

Average error (min_error) over all historical forecasts: 823.02
Best Params: {'criterion': 'squared_error', 'max_depth': 2, 'max_features': 0.3333333333333333, 'n_estimators': 50, 'n_jobs': -1}





#### h = 12

In [15]:
# Model A (h = 12) Past Covariates
rfA_12_past_covs = concatenate(
    [btc_usd12_ts,
    price_ts.slice_intersect(btc_usd12_ts),
    vol_ts.slice_intersect(btc_usd12_ts)],
    axis=1
)

In [16]:
rfA_12_best_params, rfA_12_min_error = gridsearch_RF(
    btc_usd12_ts,
    rfA_12_past_covs,
    1,
    1,
    1
)

100%|██████████| 18/18 [03:08<00:00, 10.46s/it]

Average error (min_error) over all historical forecasts: 92.26
Best Params: {'criterion': 'squared_error', 'max_depth': 2, 'max_features': 'auto', 'n_estimators': 300, 'n_jobs': -1}





#### Model B (Price Returns, Price & Volume + Lucey Price Index)

#### h = 1

In [19]:
# Model B (h = 1) Past Covariates
rfB_1_past_covs = concatenate(
    [btc_usd1_ts,
    price_ts.slice_intersect(btc_usd1_ts),
    vol_ts.slice_intersect(btc_usd1_ts),
    lucey_price.slice_intersect(btc_usd1_ts)],
    axis=1
)

In [20]:
rfB_1_best_params, rfB_1_min_error = gridsearch_RF(
    btc_usd1_ts,
    rfB_1_past_covs,
    1,
    1,
    1
)

100%|██████████| 18/18 [04:02<00:00, 13.48s/it]

Average error (min_error) over all historical forecasts: 245.69
Best Params: {'criterion': 'squared_error', 'max_depth': 2, 'max_features': 0.3333333333333333, 'n_estimators': 100, 'n_jobs': -1}





#### h = 4

In [21]:
# Model B (h = 4) Past Covariates
rfB_4_past_covs = concatenate(
    [btc_usd4_ts,
    price_ts.slice_intersect(btc_usd4_ts),
    vol_ts.slice_intersect(btc_usd4_ts),
    lucey_price.slice_intersect(btc_usd4_ts)],
    axis=1
)

In [22]:
rfB_4_best_params, rfB_4_min_error = gridsearch_RF(
    btc_usd4_ts,
    rfB_4_past_covs,
    1,
    1,
    1
)

100%|██████████| 18/18 [03:33<00:00, 11.86s/it]

Average error (min_error) over all historical forecasts: 609.11
Best Params: {'criterion': 'squared_error', 'max_depth': 2, 'max_features': 'auto', 'n_estimators': 50, 'n_jobs': -1}





#### h = 12

In [23]:
# Model B (h = 12) Past Covariates
rfB_12_past_covs = concatenate(
    [btc_usd12_ts,
    price_ts.slice_intersect(btc_usd12_ts),
    vol_ts.slice_intersect(btc_usd12_ts),
    lucey_price.slice_intersect(btc_usd12_ts)],
    axis=1
)

In [24]:
rfB_12_best_params, rfB_12_min_error = gridsearch_RF(
    btc_usd12_ts,
    rfB_12_past_covs,
    1,
    1,
    1
)

100%|██████████| 18/18 [03:23<00:00, 11.28s/it]

Average error (min_error) over all historical forecasts: 91.07
Best Params: {'criterion': 'squared_error', 'max_depth': 5, 'max_features': 'auto', 'n_estimators': 300, 'n_jobs': -1}





#### Model C (Price Returns, Price & Volume + Lucey Reddit Price Index)

#### h = 1

In [25]:
# Model C (h = 1) Past Covariates
rfC_1_past_covs = concatenate(
    [btc_usd1_ts,
    price_ts.slice_intersect(btc_usd1_ts),
    vol_ts.slice_intersect(btc_usd1_ts),
    lucey_reddit_price.slice_intersect(btc_usd1_ts)],
    axis=1
)

In [26]:
rfC_1_best_params, rfC_1_min_error = gridsearch_RF(
    btc_usd1_ts,
    rfC_1_past_covs,
    1,
    1,
    1
)

100%|██████████| 18/18 [04:09<00:00, 13.84s/it]

Average error (min_error) over all historical forecasts: 178.86
Best Params: {'criterion': 'squared_error', 'max_depth': 5, 'max_features': 'auto', 'n_estimators': 100, 'n_jobs': -1}





#### h = 4

In [27]:
# Model C (h = 4) Past Covariates
rfC_4_past_covs = concatenate(
    [btc_usd4_ts,
    price_ts.slice_intersect(btc_usd4_ts),
    vol_ts.slice_intersect(btc_usd4_ts),
    lucey_reddit_price.slice_intersect(btc_usd4_ts)],
    axis=1
)

In [28]:
rfC_4_best_params, rfC_4_min_error = gridsearch_RF(
    btc_usd4_ts,
    rfC_4_past_covs,
    1,
    1,
    1
)

100%|██████████| 18/18 [03:32<00:00, 11.82s/it]

Average error (min_error) over all historical forecasts: 1213.74
Best Params: {'criterion': 'squared_error', 'max_depth': 10, 'max_features': 0.3333333333333333, 'n_estimators': 50, 'n_jobs': -1}





#### h = 12

In [30]:
# Model C (h = 12) Past Covariates
rfC_12_past_covs = concatenate(
    [btc_usd12_ts,
    price_ts.slice_intersect(btc_usd12_ts),
    vol_ts.slice_intersect(btc_usd12_ts),
    lucey_reddit_price.slice_intersect(btc_usd12_ts)],
    axis=1
)

In [31]:
rfC_12_best_params, rfC_12_min_error = gridsearch_RF(
    btc_usd12_ts,
    rfC_12_past_covs,
    1,
    1,
    1
)

100%|██████████| 18/18 [03:19<00:00, 11.10s/it]

Average error (min_error) over all historical forecasts: 94.07
Best Params: {'criterion': 'squared_error', 'max_depth': 2, 'max_features': 0.3333333333333333, 'n_estimators': 50, 'n_jobs': -1}





#### Model D (Price Returns, Price & Volume + LDA Reddit Price Index)

#### h = 1

In [32]:
# Model D (h = 1) Past Covariates
rfD_1_past_covs = concatenate(
    [btc_usd1_ts,
    price_ts.slice_intersect(btc_usd1_ts),
    vol_ts.slice_intersect(btc_usd1_ts),
    lda_price.slice_intersect(btc_usd1_ts)],
    axis=1
)

In [33]:
rfD_1_best_params, rfD_1_min_error = gridsearch_RF(
    btc_usd1_ts,
    rfD_1_past_covs,
    1,
    1,
    1
)

100%|██████████| 18/18 [04:04<00:00, 13.59s/it]

Average error (min_error) over all historical forecasts: 200.91
Best Params: {'criterion': 'squared_error', 'max_depth': 5, 'max_features': 'auto', 'n_estimators': 300, 'n_jobs': -1}





#### h = 4

In [34]:
# Model D (h = 4) Past Covariates
rfD_4_past_covs = concatenate(
    [btc_usd4_ts,
    price_ts.slice_intersect(btc_usd4_ts),
    vol_ts.slice_intersect(btc_usd4_ts),
    lda_price.slice_intersect(btc_usd4_ts)],
    axis=1
)

In [35]:
rfD_4_best_params, rfD_4_min_error = gridsearch_RF(
    btc_usd4_ts,
    rfD_4_past_covs,
    1,
    1,
    1
)

100%|██████████| 18/18 [03:32<00:00, 11.82s/it]

Average error (min_error) over all historical forecasts: 785.84
Best Params: {'criterion': 'squared_error', 'max_depth': 5, 'max_features': 'auto', 'n_estimators': 100, 'n_jobs': -1}





#### h = 12

In [36]:
# Model D (h = 12) Past Covariates
rfD_12_past_covs = concatenate(
    [btc_usd12_ts,
    price_ts.slice_intersect(btc_usd12_ts),
    vol_ts.slice_intersect(btc_usd12_ts),
    lda_price.slice_intersect(btc_usd12_ts)],
    axis=1
)

In [37]:
rfD_12_best_params, rfD_12_min_error = gridsearch_RF(
    btc_usd12_ts,
    rfD_12_past_covs,
    1,
    1,
    1
)

100%|██████████| 18/18 [03:20<00:00, 11.11s/it]

Average error (min_error) over all historical forecasts: 87.88
Best Params: {'criterion': 'squared_error', 'max_depth': 2, 'max_features': 0.3333333333333333, 'n_estimators': 100, 'n_jobs': -1}





#### Model E (Price Returns, Price & Volume + Top2Vec Reddit Price Index)

#### h = 1

In [38]:
# Model E (h = 1) Past Covariates
rfE_1_past_covs = concatenate(
    [btc_usd1_ts,
    price_ts.slice_intersect(btc_usd1_ts),
    vol_ts.slice_intersect(btc_usd1_ts),
    t2v_price.slice_intersect(btc_usd1_ts)],
    axis=1
)

In [39]:
rfE_1_best_params, rfE_1_min_error = gridsearch_RF(
    btc_usd1_ts,
    rfE_1_past_covs,
    1,
    1,
    1
)

100%|██████████| 18/18 [04:07<00:00, 13.77s/it]

Average error (min_error) over all historical forecasts: 180.42
Best Params: {'criterion': 'squared_error', 'max_depth': 2, 'max_features': 'auto', 'n_estimators': 50, 'n_jobs': -1}





#### h = 4

In [40]:
# Model E (h = 4) Past Covariates
rfE_4_past_covs = concatenate(
    [btc_usd4_ts,
    price_ts.slice_intersect(btc_usd4_ts),
    vol_ts.slice_intersect(btc_usd4_ts),
    t2v_price.slice_intersect(btc_usd4_ts)],
    axis=1
)

In [41]:
rfE_4_best_params, rfE_4_min_error = gridsearch_RF(
    btc_usd4_ts,
    rfE_4_past_covs,
    1,
    1,
    1
)

100%|██████████| 18/18 [03:36<00:00, 12.01s/it]

Average error (min_error) over all historical forecasts: 805.87
Best Params: {'criterion': 'squared_error', 'max_depth': 10, 'max_features': 0.3333333333333333, 'n_estimators': 50, 'n_jobs': -1}





#### h = 12

In [42]:
# Model E (h = 12) Past Covariates
rfE_12_past_covs = concatenate(
    [btc_usd12_ts,
    price_ts.slice_intersect(btc_usd12_ts),
    vol_ts.slice_intersect(btc_usd12_ts),
    t2v_price.slice_intersect(btc_usd12_ts)],
    axis=1
)

In [43]:
rfE_12_best_params, rfE_12_min_error = gridsearch_RF(
    btc_usd12_ts,
    rfE_12_past_covs,
    1,
    1,
    1
)

100%|██████████| 18/18 [03:30<00:00, 11.70s/it]

Average error (min_error) over all historical forecasts: 88.07
Best Params: {'criterion': 'squared_error', 'max_depth': 5, 'max_features': 'auto', 'n_estimators': 100, 'n_jobs': -1}





#### Model F (Price Returns, Price & Volume + VCRIX)

#### h = 1

In [60]:
# Model F (h = 1) Past Covariates
rfF_1_past_covs = concatenate(
    [btc_usd1_ts.slice_intersect(vcrix),
    price_ts.slice_intersect(vcrix),
    vol_ts.slice_intersect(vcrix),
    vcrix.slice_intersect(btc_usd1_ts.slice_intersect(vcrix))],
    axis=1
)

In [61]:
rfF_1_best_params, rfF_1_min_error = gridsearch_RF(
    btc_usd1_ts,
    rfF_1_past_covs,
    1,
    1,
    1
)

100%|██████████| 18/18 [03:18<00:00, 11.05s/it]

Average error (min_error) over all historical forecasts: 170.98
Best Params: {'criterion': 'squared_error', 'max_depth': 5, 'max_features': 'auto', 'n_estimators': 50, 'n_jobs': -1}





#### h = 4

In [62]:
# Model F (h = 4) Past Covariates
rfF_4_past_covs = concatenate(
    [btc_usd4_ts.slice_intersect(vcrix),
    price_ts.slice_intersect(vcrix),
    vol_ts.slice_intersect(vcrix),
    vcrix.slice_intersect(btc_usd1_ts.slice_intersect(vcrix))],
    axis=1
)

In [63]:
rfF_4_best_params, rfF_4_min_error = gridsearch_RF(
    btc_usd4_ts,
    rfF_4_past_covs,
    1,
    1,
    1
)

100%|██████████| 18/18 [03:06<00:00, 10.34s/it]

Average error (min_error) over all historical forecasts: 921.30
Best Params: {'criterion': 'squared_error', 'max_depth': 10, 'max_features': 0.3333333333333333, 'n_estimators': 100, 'n_jobs': -1}





#### h = 12

In [64]:
# Model F (h = 12) Past Covariates
rfF_12_past_covs = concatenate(
    [btc_usd12_ts.slice_intersect(vcrix),
    price_ts.slice_intersect(vcrix),
    vol_ts.slice_intersect(vcrix),
    vcrix.slice_intersect(btc_usd1_ts.slice_intersect(vcrix))],
    axis=1
)

In [65]:
rfF_12_best_params, rfF_12_min_error = gridsearch_RF(
    btc_usd12_ts,
    rfF_12_past_covs,
    1,
    1,
    1
)

100%|██████████| 18/18 [02:58<00:00,  9.93s/it]

Average error (min_error) over all historical forecasts: 89.80
Best Params: {'criterion': 'squared_error', 'max_depth': 10, 'max_features': 0.3333333333333333, 'n_estimators': 100, 'n_jobs': -1}



