In [1]:
# NB config
%load_ext autoreload
%autoreload 2

# Load Libraries
import os
import ast
# Move to the repository root (two levels above the notebook) so the relative
# "forecasting/..." paths used below resolve. The target is resolved to an
# absolute path only once per kernel session: a bare os.chdir("../../") would
# climb two more directories every time this cell is re-run.
if "REPO_ROOT" not in globals():
    REPO_ROOT = os.path.abspath("../../")
os.chdir(REPO_ROOT)
from typing import Any, List, Dict, Union
import pandas as pd
import numpy as np
from pathlib import Path
from darts import TimeSeries
from darts.metrics import (
    rmse,
)
from tqdm import tqdm
from darts import concatenate
from darts.models.forecasting.random_forest import (
    RandomForest,
)
from sklearn.model_selection import ParameterGrid
import warnings

warnings.filterwarnings("ignore")

  VALID_INDEX_TYPES = (pd.DatetimeIndex, pd.RangeIndex, pd.Int64Index)
  times: Union[pd.DatetimeIndex, pd.Int64Index],
  def time_index(self) -> Union[pd.DatetimeIndex, pd.Int64Index]:
  pd.Int64Index,
  ) -> Union[pd.DatetimeIndex, pd.Int64Index]:
Importing plotly failed. Interactive plots will not work.
  SupportedIndex = Union[pd.DatetimeIndex, pd.Int64Index, pd.RangeIndex]


In [11]:
# Data Dir (relative to the working directory set in the setup cell)
data_dir = Path("forecasting/data/modelling")

# BTC-USD data
btc_usd_fp = data_dir / "btc_usd_weekly.csv"
btc_usd_df = pd.read_csv(btc_usd_fp)

# UCRY Indices data
ucry_fp = data_dir / "ucry_indices_weekly.csv"
ucry_df = pd.read_csv(ucry_fp)

# Best Model Paths
model_res_dir = Path("forecasting/data/forecasts/random_forest")
# rglob yields files in filesystem order, which is not stable across machines.
# Sorting makes the row order of the concatenated frame reproducible, which
# matters because later lookups take the first matching row by position.
model_res_dicts = sorted(model_res_dir.rglob("*results.csv"))
if not model_res_dicts:
    raise FileNotFoundError(
        f"No '*results.csv' files found under {model_res_dir.resolve()}"
    )
model_res_df = pd.concat(
    [pd.read_csv(f) for f in model_res_dicts], ignore_index=True
)
# Relabel the stored model identifiers to the names used in this notebook.
# Series.map with a dict turns any label missing from the dict into NaN.
# NOTE(review): stored label "E" is mapped to the placeholder "Goodbye", which
# no visible cell requests -- presumably this retires that run; confirm.
model_res_df["model"] = model_res_df["model"].map({
    "A": "baseline",
    "B": "A",
    "F": "B",
    "C": "C",
    "D": "D",
    "G": "E",
    "E": "Goodbye",
})
# "Unnamed: 0" is the index column written out when the results were saved to CSV.
model_res_df = model_res_df.drop(columns=["Unnamed: 0"])

In [4]:
# Log returns of the BTC price over three look-back windows.
# log1p(pct_change(h)) equals log(P_t / P_{t-h}); the first h rows are NaN.
_btc_price = btc_usd_df[["Price"]]

# h = 1 (Weekly Price Returns)
btc_usd_df["Price Returns (h=1)"] = np.log1p(_btc_price.pct_change(1))
# h = 4 (4 Week Price Returns)
btc_usd_df["Price Returns (h=4)"] = np.log1p(_btc_price.pct_change(4))
# h = 12 (12 Week Price Returns)
btc_usd_df["Price Returns (h=12)"] = np.log1p(_btc_price.pct_change(12))


def _btc_returns_ts(returns_col):
    """Wrap one returns column of ``btc_usd_df`` as a TimeSeries keyed by "Date".

    Rows where the selected columns contain NaN are dropped first.
    """
    frame = btc_usd_df[["Date", returns_col]].dropna()
    return TimeSeries.from_dataframe(frame, time_col="Date")


# Create TimeSeries, one per horizon
btc_usd1_ts = _btc_returns_ts("Price Returns (h=1)")
btc_usd4_ts = _btc_returns_ts("Price Returns (h=4)")
btc_usd12_ts = _btc_returns_ts("Price Returns (h=12)")

In [5]:
def gen_log_price_returns(
    series: pd.DataFrame, h: int, date_col: str = "Date", var_col: str = "Price"
) -> TimeSeries:
    """Build a TimeSeries of ``h``-period log returns of ``var_col``.

    The value at row t is ``log1p(pct_change(h))``, i.e. ``log(P_t / P_{t-h})``.
    Rows without a defined return (the first ``h`` rows, or rows with a missing
    date) are dropped. The input frame is left untouched.

    Args:
        series: Frame holding at least ``date_col`` and ``var_col``.
        h: Look-back in rows; must be >= 1.
        date_col: Column used as the time axis of the result.
        var_col: Column the returns are computed from.

    Returns:
        A TimeSeries with one component named ``"Price Returns (h={h})"``.

    Raises:
        ValueError: If ``h`` is smaller than 1.
    """
    if h < 1:
        # h == 0 gives all-zero returns; h < 0 would compare against future rows.
        raise ValueError(f"h must be a positive number of periods, got {h}")

    new_col_name = f"Price Returns (h={h})"
    log_returns = np.log1p(series[var_col].pct_change(h))
    # Assemble a fresh frame rather than writing a new column into the caller's
    # DataFrame, so repeated calls do not accumulate columns on shared data.
    returns_df = pd.DataFrame(
        {date_col: series[date_col], new_col_name: log_returns}
    ).dropna()
    return TimeSeries.from_dataframe(returns_df, time_col=date_col)

In [6]:
# Price and Volume columns of the BTC-USD frame, each wrapped as its own
# TimeSeries with "Date" as the time axis.
price_ts, vol_ts = (
    TimeSeries.from_dataframe(btc_usd_df[["Date", value_col]], time_col="Date")
    for value_col in ("Price", "Volume")
)

In [54]:
# Create TimeSeries
sel_cols = ["Date", "Index Value"]
time_col = "Date"


def _ucry_index_ts(index_name: str) -> TimeSeries:
    """Return the rows of ``ucry_df`` whose "Index" equals ``index_name`` as a TimeSeries.

    Only the columns in ``sel_cols`` are kept and ``time_col`` becomes the time axis.

    Raises:
        ValueError: If ``ucry_df`` has no rows for ``index_name`` (for example a
            misspelled label), instead of failing later inside the model code.
    """
    rows = ucry_df.loc[ucry_df["Index"] == index_name, sel_cols].reset_index(drop=True)
    if rows.empty:
        raise ValueError(f"No rows found in ucry_df for index {index_name!r}")
    return TimeSeries.from_dataframe(rows, time_col=time_col)


# Lucey Price
lucey_price = _ucry_index_ts("Lucey-Original-Price")

# Lucey Reddit Price
lucey_reddit_price = _ucry_index_ts("Lucey-Reddit-Price")

# LDA Price
lda_price = _ucry_index_ts("LDA-Reddit-Price")

# Top2Vec Price
t2v_price = _ucry_index_ts("Top2Vec-Reddit-Price")

# Hedge
hedge = _ucry_index_ts("BERTweet-Hedge")

# VCRIX
vcrix = _ucry_index_ts("VCRIX")

In [8]:
# Split into Train and Test
# Cut-off date (27 May 2019); passed as `start` to the historical forecasts and
# backtests in the cells below.
split_date = pd.Timestamp("20190527")

In [34]:
def hist_forecast_RF(
    series: TimeSeries,
    past_covariates: TimeSeries,
    forecast_horizon: int,
    best_params: Dict[str, Any],
    lags: int,
    lags_past_covariates: int,
    verbose: bool = False,
    split_date: pd.Timestamp = split_date,
):
    """Compute historical forecasts of ``series`` with a darts RandomForest.

    Args:
        series: Target series to forecast.
        past_covariates: Covariate series handed to the model.
        forecast_horizon: Horizon passed to ``historical_forecasts``.
        best_params: Extra keyword arguments forwarded to ``RandomForest``.
        lags: ``lags`` argument of ``RandomForest``.
        lags_past_covariates: ``lags_past_covariates`` argument of ``RandomForest``.
        verbose: Forwarded to ``historical_forecasts``.
        split_date: Passed as ``start`` to ``historical_forecasts``. The default
            is the value the global ``split_date`` had when this function was defined.

    Returns:
        Tuple ``(forecasts, model)``: the result of ``historical_forecasts`` and
        the ``RandomForest`` instance it was called on.
        NOTE(review): callers read fitted attributes off the returned model;
        whether it is left fitted depends on the installed darts version -- confirm.
    """
    forecaster = RandomForest(
        **best_params, lags=lags, lags_past_covariates=lags_past_covariates
    )

    # One forecast per time step (stride=1), beginning at split_date.
    walk_forward_args = dict(
        series=series,
        past_covariates=past_covariates,
        forecast_horizon=forecast_horizon,
        stride=1,
        start=split_date,
        verbose=verbose,
    )
    forecasts = forecaster.historical_forecasts(**walk_forward_args)

    return forecasts, forecaster

In [60]:
def get_feat_impt(model_name: str, horizon: int, past_cov_list: List[TimeSeries]):
    """Re-run the tuned Random Forest for one model/horizon and print diagnostics.

    Looks up the stored ``best_params`` for ``(model_name, horizon)`` in
    ``model_res_df``, builds the ``horizon``-period log-return target from
    ``btc_usd_df``, aligns the covariates to it, and then prints:

    * the parameters used,
    * the mean RMSE from ``RandomForest.backtest`` starting at ``split_date``,
    * the number of features and the feature importances of the estimator
      behind the model returned by ``hist_forecast_RF``.

    Note that the walk-forward evaluation runs twice: once inside
    ``hist_forecast_RF`` (to obtain the model) and once inside ``backtest``
    (to obtain the RMSE), hence two progress bars.

    Args:
        model_name: Value to match in the ``model`` column of ``model_res_df``.
        horizon: Forecast horizon; also the look-back of the log returns.
        past_cov_list: Covariate series; concatenated along axis 1 in this order.

    Raises:
        IndexError: If ``model_res_df`` has no row for this model and horizon.
    """
    # Tuned hyper-parameters are stored as the string form of a dict literal.
    is_match = (model_res_df["model"] == model_name) & (
        model_res_df["horizon"] == horizon
    )
    stored_params = model_res_df.loc[is_match, "best_params"]
    if stored_params.empty:
        # Same exception type a bare .iloc[0] would raise, but with a usable message.
        raise IndexError(
            f"No tuned parameters found for model {model_name!r} at horizon {horizon}"
        )
    _params = ast.literal_eval(stored_params.iloc[0])

    print(f"Params: {_params}")

    # Generate Features: restrict every covariate to the time span of the target
    target_cov = gen_log_price_returns(btc_usd_df, horizon)
    aligned_covs = [cov.slice_intersect(target_cov) for cov in past_cov_list]
    past_cov_list_tidy = concatenate(aligned_covs, axis=1)

    # Run Historical Forecast (only the model is needed afterwards)
    _, hist_model = hist_forecast_RF(
        series=target_cov,
        past_covariates=past_cov_list_tidy,
        forecast_horizon=horizon,
        best_params=_params,
        lags=1,
        lags_past_covariates=1,
        verbose=True,
        split_date=split_date,
    )

    # Backtest with a fresh model configured identically
    rf_model = RandomForest(lags=1, lags_past_covariates=1, **_params)
    error = rf_model.backtest(
        series=target_cov,
        past_covariates=past_cov_list_tidy,
        forecast_horizon=horizon,
        stride=1,
        start=split_date,
        metric=rmse,
        reduction=np.mean,
        verbose=True,
    )
    print(f"RMSE: {error}")

    # Feature Importances
    # NOTE(review): presumably ordered as the lagged target followed by the
    # lagged covariates in past_cov_list order -- confirm against darts.
    print(f"Number of Features: {hist_model.model.n_features_in_}")
    print(f"Feature Importances: {hist_model.model.feature_importances_}")

In [62]:
# Baseline: price and volume covariates only, h = 1
get_feat_impt("baseline", 1, [price_ts, vol_ts])

Params: {'criterion': 'squared_error', 'max_depth': 2, 'max_features': 0.3333333333333333, 'n_estimators': 100, 'n_jobs': -1, 'oob_score': True, 'random_state': 42}


  0%|          | 0/136 [00:00<?, ?it/s]

  0%|          | 0/136 [00:00<?, ?it/s]

RMSE: 0.0777166203366237
Number of Features: 3
Feature Importances: [0.3887802  0.30690305 0.30431675]


In [94]:
# Baseline, h = 2
# NOTE(review): the saved output of this cell stops after the first progress
# bar (no RMSE or feature importances were printed) -- re-run it to completion.
get_feat_impt("baseline", 2, [price_ts, vol_ts])

Params: {'criterion': 'squared_error', 'max_depth': 2, 'max_features': 0.3333333333333333, 'n_estimators': 100, 'n_jobs': -1, 'oob_score': True, 'random_state': 42}


  0%|          | 0/135 [00:00<?, ?it/s]

In [None]:
# Baseline, h = 3
# NOTE(review): this cell has no execution count and no saved output -- it has
# not been run in the saved notebook.
get_feat_impt("baseline", 3, [price_ts, vol_ts])

In [66]:
# Baseline, h = 4
get_feat_impt("baseline", 4, [price_ts, vol_ts])

Params: {'criterion': 'squared_error', 'max_depth': 2, 'max_features': 'auto', 'n_estimators': 50, 'n_jobs': -1, 'oob_score': True, 'random_state': 42}


  0%|          | 0/133 [00:00<?, ?it/s]

  0%|          | 0/133 [00:00<?, ?it/s]

RMSE: 0.1788357924547611
Number of Features: 3
Feature Importances: [1. 0. 0.]


In [67]:
# NOTE(review): identical to the previous cell (baseline, h = 4). The other
# models are run for h = 1, 2, 3, 4 and 12, so h = 12 may have been intended
# here -- confirm.
get_feat_impt("baseline", 4, [price_ts, vol_ts])

Params: {'criterion': 'squared_error', 'max_depth': 2, 'max_features': 'auto', 'n_estimators': 50, 'n_jobs': -1, 'oob_score': True, 'random_state': 42}


  0%|          | 0/133 [00:00<?, ?it/s]

  0%|          | 0/133 [00:00<?, ?it/s]

RMSE: 0.1788357924547611
Number of Features: 3
Feature Importances: [1. 0. 0.]


### Interesting Models:
#### Model A (Lucey Price) for h = 1, 2, 3, 4, 12

In [63]:
# Model A: price, volume and the Lucey-Original-Price index; h = 1
get_feat_impt("A", 1, [price_ts, vol_ts, lucey_price])

Params: {'criterion': 'squared_error', 'max_depth': 2, 'max_features': 0.3333333333333333, 'n_estimators': 100, 'n_jobs': -1, 'oob_score': True, 'random_state': 42}


  0%|          | 0/136 [00:00<?, ?it/s]

  0%|          | 0/136 [00:00<?, ?it/s]

RMSE: 0.07779467099198285
Number of Features: 4
Feature Importances: [0.23914738 0.27608274 0.27819808 0.20657181]


In [86]:
# Model A, h = 2
get_feat_impt("A", 2, [price_ts, vol_ts, lucey_price])

Params: {'criterion': 'squared_error', 'max_depth': 2, 'max_features': 0.3333333333333333, 'n_estimators': 50, 'n_jobs': -1, 'oob_score': True, 'random_state': 42}


  0%|          | 0/135 [00:00<?, ?it/s]

  0%|          | 0/135 [00:00<?, ?it/s]

RMSE: 0.12455057928328443
Number of Features: 4
Feature Importances: [0.4063012  0.1527314  0.26330312 0.17766428]


In [87]:
# Model A, h = 3
get_feat_impt("A", 3, [price_ts, vol_ts, lucey_price])

Params: {'criterion': 'squared_error', 'max_depth': 2, 'max_features': 'auto', 'n_estimators': 300, 'n_jobs': -1, 'oob_score': True, 'random_state': 42}


  0%|          | 0/134 [00:00<?, ?it/s]

  0%|          | 0/134 [00:00<?, ?it/s]

RMSE: 0.15732407642776403
Number of Features: 4
Feature Importances: [0.98977688 0.         0.00526075 0.00496238]


In [68]:
# Model A, h = 4
get_feat_impt("A", 4, [price_ts, vol_ts, lucey_price])

Params: {'criterion': 'squared_error', 'max_depth': 2, 'max_features': 'auto', 'n_estimators': 300, 'n_jobs': -1, 'oob_score': True, 'random_state': 42}


  0%|          | 0/133 [00:00<?, ?it/s]

  0%|          | 0/133 [00:00<?, ?it/s]

RMSE: 0.17750674161824984
Number of Features: 4
Feature Importances: [9.99739051e-01 0.00000000e+00 0.00000000e+00 2.60948913e-04]


In [69]:
# Model A, h = 12
get_feat_impt("A", 12, [price_ts, vol_ts, lucey_price])

Params: {'criterion': 'squared_error', 'max_depth': 2, 'max_features': 'auto', 'n_estimators': 300, 'n_jobs': -1, 'oob_score': True, 'random_state': 42}


  0%|          | 0/125 [00:00<?, ?it/s]

  0%|          | 0/125 [00:00<?, ?it/s]

RMSE: 0.35923502268501584
Number of Features: 4
Feature Importances: [1. 0. 0. 0.]


### Interesting Models:
#### Model B (VCRIX) for h = 1, 4, 12
**TODO:** Concatenation is different for this model (FIX if necessary); the calls below are currently commented out.

In [72]:
# get_feat_impt("B", 1, [vcrix,price_ts, vol_ts, ])

In [73]:
# get_feat_impt("B", 4, [price_ts, vol_ts, vcrix])

In [74]:
# get_feat_impt("B", 12, [price_ts, vol_ts, vcrix])

### Interesting Models:
#### Model C (Reddit Lucey Price) for h = 1, 2, 3, 4, 12

In [77]:
# Model C: price, volume and the Lucey-Reddit-Price index; h = 1
get_feat_impt("C", 1, [price_ts, vol_ts, lucey_reddit_price])

Params: {'criterion': 'squared_error', 'max_depth': 5, 'max_features': 0.3333333333333333, 'n_estimators': 100, 'n_jobs': -1, 'oob_score': True, 'random_state': 42}


  0%|          | 0/136 [00:00<?, ?it/s]

  0%|          | 0/136 [00:00<?, ?it/s]

RMSE: 0.07850194870249745
Number of Features: 4
Feature Importances: [0.25179375 0.22702638 0.26879421 0.25238565]


In [88]:
# Model C, h = 2
get_feat_impt("C", 2, [price_ts, vol_ts, lucey_reddit_price])

Params: {'criterion': 'squared_error', 'max_depth': 2, 'max_features': 0.3333333333333333, 'n_estimators': 50, 'n_jobs': -1, 'oob_score': True, 'random_state': 42}


  0%|          | 0/135 [00:00<?, ?it/s]

  0%|          | 0/135 [00:00<?, ?it/s]

RMSE: 0.1228124671523976
Number of Features: 4
Feature Importances: [0.41163471 0.14707532 0.26340312 0.17788685]


In [89]:
# Model C, h = 3
get_feat_impt("C", 3, [price_ts, vol_ts, lucey_reddit_price])

Params: {'criterion': 'squared_error', 'max_depth': 2, 'max_features': 'auto', 'n_estimators': 300, 'n_jobs': -1, 'oob_score': True, 'random_state': 42}


  0%|          | 0/134 [00:00<?, ?it/s]

  0%|          | 0/134 [00:00<?, ?it/s]

RMSE: 0.15684366253348267
Number of Features: 4
Feature Importances: [0.98155624 0.         0.00533332 0.01311044]


In [78]:
# Model C, h = 4
get_feat_impt("C", 4, [price_ts, vol_ts, lucey_reddit_price])

Params: {'criterion': 'squared_error', 'max_depth': 2, 'max_features': 'auto', 'n_estimators': 300, 'n_jobs': -1, 'oob_score': True, 'random_state': 42}


  0%|          | 0/133 [00:00<?, ?it/s]

  0%|          | 0/133 [00:00<?, ?it/s]

RMSE: 0.17470873853751404
Number of Features: 4
Feature Importances: [0.99122737 0.         0.         0.00877263]


In [79]:
# Model C, h = 12
get_feat_impt("C", 12, [price_ts, vol_ts, lucey_reddit_price])

Params: {'criterion': 'squared_error', 'max_depth': 2, 'max_features': 'auto', 'n_estimators': 300, 'n_jobs': -1, 'oob_score': True, 'random_state': 42}


  0%|          | 0/125 [00:00<?, ?it/s]

  0%|          | 0/125 [00:00<?, ?it/s]

RMSE: 0.35997238732261466
Number of Features: 4
Feature Importances: [1. 0. 0. 0.]


### Interesting Models:
#### Model D (Reddit LDA Price) for h = 1, 2, 3, 4, 12

In [80]:
# Model D: price, volume and the LDA-Reddit-Price index; h = 1
get_feat_impt("D", 1, [price_ts, vol_ts, lda_price])

Params: {'criterion': 'squared_error', 'max_depth': 2, 'max_features': 0.3333333333333333, 'n_estimators': 50, 'n_jobs': -1, 'oob_score': True, 'random_state': 42}


  0%|          | 0/136 [00:00<?, ?it/s]

  0%|          | 0/136 [00:00<?, ?it/s]

RMSE: 0.07632478912524142
Number of Features: 4
Feature Importances: [0.25771206 0.23298394 0.25865248 0.25065152]


In [90]:
# Model D, h = 2
get_feat_impt("D", 2, [price_ts, vol_ts, lda_price])

Params: {'criterion': 'squared_error', 'max_depth': 2, 'max_features': 0.3333333333333333, 'n_estimators': 50, 'n_jobs': -1, 'oob_score': True, 'random_state': 42}


  0%|          | 0/135 [00:00<?, ?it/s]

  0%|          | 0/135 [00:00<?, ?it/s]

RMSE: 0.12364009260363573
Number of Features: 4
Feature Importances: [0.41378543 0.14110466 0.25711264 0.18799727]


In [91]:
# Model D, h = 3
get_feat_impt("D", 3, [price_ts, vol_ts, lda_price])

Params: {'criterion': 'squared_error', 'max_depth': 2, 'max_features': 'auto', 'n_estimators': 300, 'n_jobs': -1, 'oob_score': True, 'random_state': 42}


  0%|          | 0/134 [00:00<?, ?it/s]

  0%|          | 0/134 [00:00<?, ?it/s]

RMSE: 0.15612787805091816
Number of Features: 4
Feature Importances: [0.9716971  0.         0.00330616 0.02499674]


In [81]:
# Model D, h = 4
get_feat_impt("D", 4, [price_ts, vol_ts, lda_price])

Params: {'criterion': 'squared_error', 'max_depth': 2, 'max_features': 'auto', 'n_estimators': 300, 'n_jobs': -1, 'oob_score': True, 'random_state': 42}


  0%|          | 0/133 [00:00<?, ?it/s]

  0%|          | 0/133 [00:00<?, ?it/s]

RMSE: 0.1769109175629804
Number of Features: 4
Feature Importances: [0.98659012 0.         0.         0.01340988]


In [82]:
# Model D, h = 12
get_feat_impt("D", 12, [price_ts, vol_ts, lda_price])

Params: {'criterion': 'squared_error', 'max_depth': 2, 'max_features': 'auto', 'n_estimators': 300, 'n_jobs': -1, 'oob_score': True, 'random_state': 42}


  0%|          | 0/125 [00:00<?, ?it/s]

  0%|          | 0/125 [00:00<?, ?it/s]

RMSE: 0.35985160802393995
Number of Features: 4
Feature Importances: [1. 0. 0. 0.]


### Interesting Models:
#### Model E (Hedge) for h = 1, 2, 3, 4, 12

In [83]:
# Model E: price, volume and the BERTweet-Hedge index; h = 1
get_feat_impt("E", 1, [price_ts, vol_ts, hedge])

Params: {'criterion': 'squared_error', 'max_depth': 2, 'max_features': 0.3333333333333333, 'n_estimators': 100, 'n_jobs': -1, 'oob_score': True, 'random_state': 42}


  0%|          | 0/136 [00:00<?, ?it/s]

  0%|          | 0/136 [00:00<?, ?it/s]

RMSE: 0.07667054651478186
Number of Features: 4
Feature Importances: [0.22617194 0.24804106 0.26122378 0.26456322]


In [92]:
# Model E, h = 2
get_feat_impt("E", 2, [price_ts, vol_ts, hedge])

Params: {'criterion': 'squared_error', 'max_depth': 2, 'max_features': 0.3333333333333333, 'n_estimators': 50, 'n_jobs': -1, 'oob_score': True, 'random_state': 42}


  0%|          | 0/135 [00:00<?, ?it/s]

  0%|          | 0/135 [00:00<?, ?it/s]

RMSE: 0.12138960096652898
Number of Features: 4
Feature Importances: [0.40427614 0.13590459 0.24999874 0.20982053]


In [93]:
# Model E, h = 3
get_feat_impt("E", 3, [price_ts, vol_ts, hedge])

Params: {'criterion': 'squared_error', 'max_depth': 2, 'max_features': 'auto', 'n_estimators': 300, 'n_jobs': -1, 'oob_score': True, 'random_state': 42}


  0%|          | 0/134 [00:00<?, ?it/s]

  0%|          | 0/134 [00:00<?, ?it/s]

RMSE: 0.1575044958558237
Number of Features: 4
Feature Importances: [0.99188437 0.         0.00396965 0.00414599]


In [84]:
# Model E, h = 4
get_feat_impt("E", 4, [price_ts, vol_ts, hedge])

Params: {'criterion': 'squared_error', 'max_depth': 2, 'max_features': 'auto', 'n_estimators': 300, 'n_jobs': -1, 'oob_score': True, 'random_state': 42}


  0%|          | 0/133 [00:00<?, ?it/s]

  0%|          | 0/133 [00:00<?, ?it/s]

RMSE: 0.17591040128711788
Number of Features: 4
Feature Importances: [0.9989627 0.        0.        0.0010373]


In [85]:
# Model E, h = 12
get_feat_impt("E", 12, [price_ts, vol_ts, hedge])

Params: {'criterion': 'squared_error', 'max_depth': 2, 'max_features': 'auto', 'n_estimators': 300, 'n_jobs': -1, 'oob_score': True, 'random_state': 42}


  0%|          | 0/125 [00:00<?, ?it/s]

  0%|          | 0/125 [00:00<?, ?it/s]

RMSE: 0.35988912246314664
Number of Features: 4
Feature Importances: [1. 0. 0. 0.]
