In [None]:
# Here is what I have achieved so far; keep it in mind:
# 1. `synthetic_features`: a DataFrame holding the 7-day forecast of the non-technical indicators.
# 2. `future_predictions_df`: a DataFrame holding the prediction for the first future day from each of the 4 final ARIMA-GARCH + LSTM models, plus their ensemble prediction (5 predictions in total).
# 3. `high_low_vol_forecast`: a DataFrame holding the 7-day forecast of the Bitcoin high, low, and volume.
# Now I want you to help me implement the main workflow pipeline for future prediction:
# 1. For the first future date, use the calculate_technical_indicators module above (I have stored it in a .py file) to calculate the technical indicators from the forecasted high, low, and volume together with the ensemble prediction from future_predictions_df.
# 2. We then have the full raw feature set (synthetic_features + technical features) for the first date; use a scaler (loaded from the one saved on train_data) to transform those raw features.
# 3. Next, feed those PCs to arima_forecast and garch_volatility_forecast to obtain the arima_garch_forecast (I already have arimax_model.pkl and garch_model.pkl from training to load). Then, ... (TODO: the remaining steps were left unspecified)

In [13]:
# Forecast Synthetic Non-Technical Features:

# Use models like ARIMA or VAR to predict non-technical indicators (e.g., sentiment, blockchain metrics) for the 7-day horizon.
# Recursive Loop (1 Day at a Time):

# Day 1:
# Forecast btc_price using ARIMA-GARCH (with PCs as exog_vars) + LSTM adjustment.
# Use the forecasted btc_price to calculate Day 2's technical indicators.
# Day 2–7:
# Repeat the process:
# Forecast btc_price for the next day.
# Recalculate technical indicators based on the latest forecasted price.
# Update the PCA-reduced feature set for the next prediction.
# Combine Predictions:

# At each step, compute:
# $$\text{Final BTC Price} = \text{ARIMA-GARCH Forecast} + \Delta\text{BTC Price}$$
# Output:

# The final 7-day BTC price forecast, along with updated synthetic features and technical indicators.

In [106]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import joblib
from hyperparameter_tuning import tune_hyperparameters, lstm_model_builder
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.vector_ar.var_model import VAR
from sklearn.preprocessing import StandardScaler
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_squared_error, mean_absolute_error, root_mean_squared_error
from arima_garch import fit_sarima

In [107]:
# Read the prepared CSV files; each one is indexed by its parsed "Date" column.
date_indexed = {"parse_dates": ["Date"], "index_col": "Date"}
df = pd.read_csv("../data/final/dataset.csv", **date_indexed)
test_pca_df = pd.read_csv("../data/final/test_pca_df.csv", **date_indexed)
test_residuals_df = pd.read_csv("../data/final/test_residuals_df.csv", **date_indexed)

# Pull out the two series of the residual table that later cells work with.
test_arima_garch_pred = test_residuals_df.loc[:, "SARIMA-GARCH Prediction"]
test_residual = test_residuals_df.loc[:, "Residuals"]

# Scale the test residuals with the persisted scaler. Later cells read `scaler`
# again to invert model outputs, so the name must keep pointing at this object.
scaler = joblib.load("../models/residual_scaler.pkl")
residual_column = test_residual.values.reshape(-1, 1)
test_residual_scaled = scaler.transform(residual_column)

### 7-days residuals

In [110]:
model_dir = "../models"
look_back = 20  # Number of days to look back for LSTM models

future_days = 7  # Number of days to predict
# Forecast dates begin the day after the last date in the test PCA frame.
future_dates = pd.date_range(test_pca_df.index[-1], periods=future_days + 1, freq="D")[1:]

# Fail early with a readable message instead of a reshape error inside the loop.
if len(test_residual_scaled) < look_back:
    raise ValueError(
        f"Need at least {look_back} scaled residuals to seed the LSTM window, "
        f"got {len(test_residual_scaled)}"
    )

# Dictionary to store future predictions (model name -> array of `future_days` values)
final_residuals = {}

for model_type in ["LSTM", "BiLSTM", "Attention-LSTM", "Attention-BiLSTM"]:
    print(f"Generating future predictions for {model_type}...")

    # Load the best model and parameters
    model_file = f"{model_dir}/{model_type}_best_model.pkl"
    param_file = f"{model_dir}/{model_type}_best_params.pkl"

    best_model = joblib.load(model_file)
    best_params = joblib.load(param_file)  # not used by the forecast; kept for inspection

    # Access the underlying Keras model
    keras_model = best_model.model_

    # Seed the window with the last `look_back` scaled residuals from the test set
    input_sequence = test_residual_scaled[-look_back:].reshape(1, look_back, 1)
    future_residuals = []  # Predicted (scaled) residuals, one per future day

    for _ in range(future_days):
        # verbose=0 keeps Keras from printing one progress bar per single-sample prediction
        next_residual = keras_model.predict(input_sequence, verbose=0)[0, 0]
        future_residuals.append(next_residual)

        # Slide the window: drop the oldest step and append the new prediction
        input_sequence = np.concatenate(
            [input_sequence[:, 1:, :], np.reshape(next_residual, (1, 1, 1))],
            axis=1,
        )

    # Inverse transform predicted residuals.
    # `scaler` must still be the residual scaler loaded with the datasets.
    future_residuals = np.array(future_residuals).reshape(-1, 1)
    future_residuals_inverse = scaler.inverse_transform(future_residuals).flatten()

    # Undo log transform (expm1 is the exact inverse of log1p and equals exp(x) - 1).
    # NOTE(review): this exponentiates a residual on its own rather than a price level;
    # confirm that the result is meant to be read as a relative adjustment.
    final_residuals[model_type] = np.expm1(future_residuals_inverse)

    print(f"Future predictions for {model_type}: {final_residuals[model_type]}")

# Ensemble future prediction: element-wise mean over the four models
ensemble_future_pred = np.mean(list(final_residuals.values()), axis=0)
final_residuals["Ensemble"] = ensemble_future_pred

# Convert future predictions to a DataFrame for visualization
final_residual_df = pd.DataFrame(final_residuals, index=future_dates)

final_residual_df

Generating future predictions for LSTM...
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 215ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
Future predictions for LSTM: [1.1475611 1.2598829 1.327847  1.3718042 1.4148743 1.4621227 1.5024335]
Generating future predictions for BiLSTM...
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 285ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step
[1m1/1[0m [32m━━

Unnamed: 0,LSTM,BiLSTM,Attention-LSTM,Attention-BiLSTM,Ensemble
2024-11-17 00:00:00+00:00,1.147561,1.775028,1.334735,1.743214,1.500135
2024-11-18 00:00:00+00:00,1.259883,1.99819,1.437381,1.804345,1.62495
2024-11-19 00:00:00+00:00,1.327847,2.279903,1.581084,1.810071,1.749726
2024-11-20 00:00:00+00:00,1.371804,2.636033,1.725159,1.848895,1.895473
2024-11-21 00:00:00+00:00,1.414874,3.058569,1.823609,1.883738,2.045197
2024-11-22 00:00:00+00:00,1.462123,3.537609,1.860906,1.924117,2.196188
2024-11-23 00:00:00+00:00,1.502434,4.040752,1.863879,1.961931,2.342249


### Synthetic Feature Indicator Generation

In [81]:
def fit_arima(series, steps=7, order=(5, 1, 0)):
    """Fit an ARIMA model to one series and forecast beyond its end.

    Args:
        series: The observations to fit.
        steps: Number of out-of-sample periods to forecast.
        order: The ARIMA (p, d, q) order passed to the model.

    Returns:
        A tuple ``(series_pred, forecast)``: the fitted model's predictions for
        positions 0 .. len(series) - 1, and its forecast for the next ``steps`` periods.
    """
    fitted = ARIMA(series, order=order).fit()

    # In-sample predictions covering every position of the input series
    last_position = len(series) - 1
    in_sample = fitted.predict(start=0, end=last_position)

    out_of_sample = fitted.forecast(steps=steps)
    return in_sample, out_of_sample

# def forecast_sarima(series, steps=7, order=(1, 1, 2), seasonal_order=(1, 1, 1, 7)):
#     # Fit the SARIMA model
#     model = SARIMAX(series, order=order, freq='D', seasonal_order=seasonal_order)
#     model_fit = model.fit(disp=False)

#     # Get predictions for the entire series
#     series_pred = model_fit.predict(start=0, end=len(series) - 1)
#     # Forecast future values
#     forecast = model_fit.forecast(steps=steps)
#     return series_pred, forecast

def forecast_var(data, steps=7):
    """Forecast several interdependent series jointly with a VAR model.

    Args:
        data: DataFrame with one column per series.
        steps: Number of periods to forecast.

    Returns:
        DataFrame of forecasts with the same columns as ``data`` and a default
        integer index (no dates are attached).
    """
    # Fit with up to 15 lags, using AIC as the selection criterion
    fitted = VAR(data).fit(maxlags=15, ic='aic')

    # The forecast is seeded with the last `k_ar` observed rows
    n_lags = fitted.k_ar
    seed_rows = data.values[-n_lags:]

    predicted = fitted.forecast(y=seed_rows, steps=steps)
    return pd.DataFrame(predicted, columns=data.columns)

def evaluate_forecast(train_data, sarima_pred):
    """Score predictions against observed values.

    Both inputs are flattened to 1-D first, so a column vector of shape (n, 1)
    and a flat vector of shape (n,) are compared element by element instead of
    being broadcast against each other.

    Args:
        train_data: Observed values (array-like).
        sarima_pred: Predicted values with the same number of elements.

    Returns:
        dict with float entries "RMSE", "MAE" and "MAPE" (MAPE in percent).

    Raises:
        ValueError: If the inputs differ in size, are empty, or contain
            NaN/infinite values.
    """
    actual = np.asarray(train_data, dtype=float).ravel()
    predicted = np.asarray(sarima_pred, dtype=float).ravel()

    # Check if lengths match
    if actual.size != predicted.size:
        raise ValueError("Length of train_data and sarima_pred must be the same")
    if actual.size == 0:
        raise ValueError("train_data and sarima_pred must not be empty")
    if not (np.isfinite(actual).all() and np.isfinite(predicted).all()):
        raise ValueError("train_data and sarima_pred must not contain NaN or infinite values")

    abs_errors = np.abs(actual - predicted)

    rmse = float(np.sqrt(np.mean(abs_errors ** 2)))
    mae = float(np.mean(abs_errors))

    # Guard the denominator by magnitude: adding a small epsilon to a negative
    # value (e.g. standardised data) can push it onto zero instead of away from it.
    safe_denominator = np.maximum(np.abs(actual), 1e-10)
    mape = float(np.mean(abs_errors / safe_denominator) * 100)

    return {"RMSE": rmse, "MAE": mae, "MAPE": mape}

In [94]:
# Read the stored SARIMA settings; later cells look them up by indicator name
# and read the 'order' and 'seasonal_order' entries.
with open('../results/metrics/sarima_params.json', 'r') as params_file:
    sarima_params = json.load(params_file)

In [77]:
# Define the list of technical indicators
technical_features = [
    'btc_sma_14',
    'btc_ema_14', 'btc_rsi_14', 'btc_macd', 'btc_macd_signal',
    'btc_macd_diff', 'btc_bb_high', 'btc_bb_low', 'btc_bb_mid',
    'btc_bb_width', 'btc_atr_14', 'btc_trading_volume',
    'btc_volatility_index'
]

# Everything that is neither a technical indicator nor the target price.
# The exclusion set has to be flat: testing `col not in [technical_features, 'btc_close']`
# compares each column name with the whole list object, so no technical feature is excluded.
excluded_columns = set(technical_features) | {'btc_close'}
non_technical_features = [col for col in df.columns if col not in excluded_columns]

train_data = df[non_technical_features].copy()

# One StandardScaler per column, kept so forecasts can be mapped back to original units
column_scalers = {}
scaled_columns = {}

# Fit a scaler for each column and transform the data.
# The loop variable is deliberately not called `scaler`: that name holds the residual
# scaler that the LSTM cells use for their inverse transform.
for col in non_technical_features:
    col_scaler = StandardScaler()
    scaled_columns[col] = col_scaler.fit_transform(train_data[col].values.reshape(-1, 1)).flatten()
    column_scalers[col] = col_scaler  # Save the scaler for later use

# Build the scaled frame in one step so every column gets a numeric dtype
scaled_data = pd.DataFrame(scaled_columns, index=train_data.index, columns=non_technical_features)

# Empty frame indexed by the 7 days following the last date in `df`; forecast cells fill it in
synthetic_features = pd.DataFrame(index=pd.date_range(start=df.index[-1] + pd.Timedelta(days=1), periods=7))

In [78]:
# Sentiment Indicators (arima/sarima)
sentiment_indicator = 'google_trends_bitcoin'
sentiment_param = sarima_params[sentiment_indicator]

# Standardised history of the sentiment series
y_train = scaled_data[sentiment_indicator]

# Fit using the stored order / seasonal_order for this indicator
sarima_results = fit_sarima(
    y_train,
    order=tuple(sentiment_param['order']),
    seasonal_order=tuple(sentiment_param['seasonal_order']),
)

# 7-step forecast and in-sample fit metrics
sarima_forecast = sarima_results['forecast'](steps=7)
metric = evaluate_forecast(y_train, sarima_results['train_predictions'])

# Map the forecast back to the indicator's original units before storing it
forecast_column = sarima_forecast.values.reshape(-1, 1)
sentiment_scaler = column_scalers[sentiment_indicator]
synthetic_features[sentiment_indicator] = sentiment_scaler.inverse_transform(forecast_column).flatten()

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


In [86]:
# Blockchain Indicators (var)
blockchain_indicator = ['active_addresses_blockchain', 'hash_rate_blockchain', 'miner_revenue_blockchain']

# Forecast the three standardised series jointly for 7 steps
var_forecasts = forecast_var(scaled_data[blockchain_indicator], steps=7)

# Undo each column's scaling with its own scaler and store the result
for indicator in blockchain_indicator:
    scaled_column = var_forecasts[indicator].values.reshape(-1, 1)
    restored = column_scalers[indicator].inverse_transform(scaled_column)
    synthetic_features[indicator] = restored.flatten()

  self._init_dates(dates, freq)


In [None]:
# Macro/Market Indicators (ARIMA/SARIMA)
macro_indicator = [
    'Gold_Close', 'Oil_Close', 'DJI', 'GSPC',
    'IXIC', 'NYSE FANG+', 'ARK Innovation ETF', 'CBOE Volatility Index',
    'iShares MSCI Emerging Markets ETF', 'Shanghai Composite Index',
    'USD Index (DXY)', 'EUR to USD Exchange Rate'
]

# Per-indicator fit results and in-sample metrics
sarima_results = {}
metrics = {}

for indicator in macro_indicator:
    # Stop immediately when an indicator has no stored SARIMA settings
    if indicator not in sarima_params:
        raise ValueError(f"SARIMA parameters for {indicator} not found in sarima_params.")

    indicator_params = sarima_params[indicator]
    order = tuple(indicator_params['order'])
    seasonal_order = tuple(indicator_params['seasonal_order'])

    # Standardised history of this indicator
    y_train = scaled_data[indicator]

    # Fit, forecast 7 steps ahead, and score the in-sample fit
    sarima_result = fit_sarima(y_train, order=order, seasonal_order=seasonal_order)
    sarima_forecast = sarima_result['forecast'](steps=7)
    metric = evaluate_forecast(y_train, sarima_result['train_predictions'])

    # Bring the forecast back to the indicator's original units
    forecast_column = sarima_forecast.values.reshape(-1, 1)
    inverse_forecast = column_scalers[indicator].inverse_transform(forecast_column).flatten()

    # Record everything only after all steps for this indicator have succeeded
    sarima_results[indicator] = sarima_result
    metrics[indicator] = metric
    synthetic_features[indicator] = inverse_forecast

# Report the in-sample metrics, one line per indicator
for indicator, metric in metrics.items():
    print(f"Metrics for {indicator}: {metric}")

### Predict First Date

In [108]:
model_dir = "../models"
look_back = 20  # Number of days to look back for LSTM models

future_days = 1  # Number of days to predict
# Forecast dates begin the day after the last date in the test PCA frame.
future_dates = pd.date_range(test_pca_df.index[-1], periods=future_days + 1, freq="D")[1:]

# Fail early with a readable message instead of a reshape error inside the loop.
if len(test_residual_scaled) < look_back:
    raise ValueError(
        f"Need at least {look_back} scaled residuals to seed the LSTM window, "
        f"got {len(test_residual_scaled)}"
    )

# Base level for the forecast: the last ARIMA-GARCH prediction on the test set.
# It is the same for every model, so it is read once, outside the loop.
# NOTE(review): the same base value is added to every future step; with
# future_days > 1 this would need a real multi-step ARIMA-GARCH forecast - confirm.
arima_garch_future_pred = test_arima_garch_pred.iloc[-1]

# Dictionary to store future predictions (model name -> array of `future_days` prices)
future_predictions = {}

for model_type in ["LSTM", "BiLSTM", "Attention-LSTM", "Attention-BiLSTM"]:
    print(f"Generating future predictions for {model_type}...")

    # Load the best model and parameters
    model_file = f"{model_dir}/{model_type}_best_model.pkl"
    param_file = f"{model_dir}/{model_type}_best_params.pkl"

    best_model = joblib.load(model_file)
    best_params = joblib.load(param_file)  # not used by the forecast; kept for inspection

    # Access the underlying Keras model
    keras_model = best_model.model_

    # Seed the window with the last `look_back` scaled residuals from the test set
    input_sequence = test_residual_scaled[-look_back:].reshape(1, look_back, 1)
    future_residuals = []  # Predicted (scaled) residuals, one per future day

    for _ in range(future_days):
        # verbose=0 keeps Keras from printing one progress bar per single-sample prediction
        next_residual = keras_model.predict(input_sequence, verbose=0)[0, 0]
        future_residuals.append(next_residual)

        # Slide the window: drop the oldest step and append the new prediction
        input_sequence = np.concatenate(
            [input_sequence[:, 1:, :], np.reshape(next_residual, (1, 1, 1))],
            axis=1,
        )

    # Inverse transform predicted residuals.
    # `scaler` must still be the residual scaler loaded with the datasets.
    future_residuals = np.array(future_residuals).reshape(-1, 1)
    future_residuals_inverse = scaler.inverse_transform(future_residuals).flatten()

    # Combine the ARIMA-GARCH base with the LSTM residuals. The explicit float64 cast
    # keeps the sum in double precision regardless of the dtype the network returns.
    final_future_forecast = arima_garch_future_pred + future_residuals_inverse.astype(np.float64)

    # Undo log transform (expm1 is the exact inverse of log1p and equals exp(x) - 1)
    future_predictions[model_type] = np.expm1(final_future_forecast)

    print(f"Future predictions for {model_type}: {future_predictions[model_type]}")

# Ensemble future prediction: element-wise mean over the four models
ensemble_future_pred = np.mean(list(future_predictions.values()), axis=0)
future_predictions["Ensemble"] = ensemble_future_pred

# Convert future predictions to a DataFrame for visualization
future_predictions_df = pd.DataFrame(future_predictions, index=future_dates)

future_predictions_df

Generating future predictions for LSTM...
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 210ms/step
Future predictions for LSTM: [68883.67908539]
Generating future predictions for BiLSTM...
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 295ms/step
Future predictions for BiLSTM: [89010.17242435]
Generating future predictions for Attention-LSTM...
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 237ms/step
Future predictions for Attention-LSTM: [74887.43709936]
Generating future predictions for Attention-BiLSTM...
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 352ms/step
Future predictions for Attention-BiLSTM: [87989.71142955]


Unnamed: 0,LSTM,BiLSTM,Attention-LSTM,Attention-BiLSTM,Ensemble
2024-11-17 00:00:00+00:00,68883.679085,89010.172424,74887.437099,87989.71143,80192.75001


In [95]:
btc_price = pd.read_csv("../data/raw/historical_data.csv", parse_dates=["Date"], index_col="Date")

# Extract high (High), low (Low), and volume (Volume) columns
high_low_vol = ['High', 'Low', 'Volume']

btc_train = btc_price[high_low_vol].copy()

# One StandardScaler per column, kept so forecasts can be mapped back to original units
btc_scalers = {}
scaled_btc_columns = {}

# Fit a scaler for each column and transform the data.
# The loop variable is deliberately not called `scaler`: that name holds the residual
# scaler that the LSTM cells use for their inverse transform.
for col in high_low_vol:
    col_scaler = StandardScaler()
    scaled_btc_columns[col] = col_scaler.fit_transform(btc_train[col].values.reshape(-1, 1)).flatten()
    btc_scalers[col] = col_scaler  # Save the scaler for later use

# Build the scaled frame in one step so every column gets a numeric dtype
scaled_btc = pd.DataFrame(scaled_btc_columns, index=btc_train.index, columns=high_low_vol)

In [102]:
# Fit results and metrics for the BTC high/low/volume models. They live in their own
# dictionaries so this cell does not rely on state created by another cell and its
# report lists only these three series.
hlv_sarima_results = {}
hlv_metrics = {}

forecast_horizon = 7  # days to forecast

# NOTE(review): the forecast index starts the day after the last row of `df`, while the
# models below are fitted on `btc_price`; confirm both frames end on the same date.
high_low_vol_forecast = pd.DataFrame(
    index=pd.date_range(start=df.index[-1] + pd.Timedelta(days=1), periods=forecast_horizon)
)

# Loop through the high, low and volume series
for indicator in high_low_vol:
    # Stop immediately when a series has no stored SARIMA settings
    if indicator not in sarima_params:
        raise ValueError(f"SARIMA parameters for {indicator} not found in sarima_params.")
    order = tuple(sarima_params[indicator]['order'])
    seasonal_order = tuple(sarima_params[indicator]['seasonal_order'])

    # Standardised history of the current series
    y_train = scaled_btc[indicator]

    # Fit SARIMA model
    sarima_result = fit_sarima(y_train, order=order, seasonal_order=seasonal_order)

    # Forecast for the next `forecast_horizon` steps
    sarima_forecast = sarima_result['forecast'](steps=forecast_horizon)

    # Score the in-sample fit (on standardised values)
    metric = evaluate_forecast(y_train, sarima_result['train_predictions'])

    # Inverse transform the forecast using the corresponding scaler
    inverse_forecast = btc_scalers[indicator].inverse_transform(sarima_forecast.values.reshape(-1, 1)).flatten()

    # Store results
    hlv_sarima_results[indicator] = sarima_result
    hlv_metrics[indicator] = metric

    # Assign the inverse-transformed forecast to high_low_vol_forecast
    high_low_vol_forecast[indicator] = inverse_forecast

# Print metrics for evaluation
for indicator, metric in hlv_metrics.items():
    print(f"Metrics for {indicator}: {metric}")

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


Metrics for High: {'RMSE': 0.04054129733236314, 'MAE': 0.018541818588258083, 'MAPE': 20.105934102653226}
Metrics for Low: {'RMSE': 0.04767266205687547, 'MAE': 0.02128387967773765, 'MAPE': 6.6521278132224495}
Metrics for Volume: {'RMSE': 0.4461574468890595, 'MAE': 0.1892674912479586, 'MAPE': 149.549605857712}


In [103]:
# Display the 7-day high/low/volume forecast (already inverse-transformed to original units)
high_low_vol_forecast

Unnamed: 0,High,Low,Volume
2024-11-17 00:00:00+00:00,91743.122097,91049.717734,77956240000.0
2024-11-18 00:00:00+00:00,91740.882586,90869.459341,74539240000.0
2024-11-19 00:00:00+00:00,91740.562937,90789.099831,80960630000.0
2024-11-20 00:00:00+00:00,91740.517314,90834.314226,81539350000.0
2024-11-21 00:00:00+00:00,91740.510802,90794.706787,82521130000.0
2024-11-22 00:00:00+00:00,91740.509872,90888.600434,77067500000.0
2024-11-23 00:00:00+00:00,91740.509739,90955.787994,75654150000.0


In [None]:
# Technical Indicators (calculated from forecasted btc_price)



### PCA transform future indicators

In [None]:
### PCA transformation

### ARIMA-GARCH Forecasting

In [None]:
# arima forecast

# extract residual to fit into garch

# garch forecast

# combine into arima-garch forecast

# residual = btc_price - arima-garch forecast

### LSTM Forecasting

In [None]:
# use future residual from arima-garch forecast to predict


### Sequence Learning ???