In [1]:
from models.xgboost_model import XgboostModel
from models.arima_model import ArimaModel
from models.sarima_model import SarimaModel
from models.prophet_model import ProphetModel
from models.random_forest_model import RandomForestModel
from models.lightgbm_model import LightGBMModel

import pandas as pd
import os

import warnings
warnings.filterwarnings('ignore', category=FutureWarning)

In [2]:
# Build one instance of every model wrapper compared in this notebook.
# Each constructor is called with no arguments, in the same order in which
# the models are later run.
(
    model_xgb,
    model_arima,
    model_sarima,
    model_prophet,
    model_rf,
    model_lgbm,
) = (
    XgboostModel(),
    ArimaModel(),
    SarimaModel(),
    ProphetModel(),
    RandomForestModel(),
    LightGBMModel(),
)

In [3]:
# Directory holding the saved, already-processed train/test splits.
input_dir = 'processed_data'
print(f"--- Loading processed data from '{input_dir}' directory ---")

# Read the four pickles back in a fixed order: train features, train target,
# test features, test target.
# NOTE(review): unpickling can execute arbitrary code -- only load files that
# were produced by a trusted source.
X_train, y_train, X_test, y_test = (
    pd.read_pickle(os.path.join(input_dir, pickle_name))
    for pickle_name in ('X_train.pkl', 'y_train.pkl', 'X_test.pkl', 'y_test.pkl')
)

print("Data loaded successfully.")

# --- Sanity check: report the shape of each loaded object ---
# The labels are padded by hand so the shapes line up in the printed report.
shape_report = [
    f"  X_train: {X_train.shape}",
    f"  y_train: {y_train.shape}",
    f"  X_test:  {X_test.shape}",
    f"  y_test:  {y_test.shape}",
]
print("\nData Shapes:")
print("\n".join(shape_report))

# Peek at the first rows of the training features.
print("\nFirst 5 rows of X_train:")
print(X_train.head())

--- Loading processed data from 'processed_data' directory ---
Data loaded successfully.

Data Shapes:
  X_train: (3949, 26)
  y_train: (3949,)
  X_test:  (988, 26)
  y_test:  (988,)

First 5 rows of X_train:
            Open  High   Low  Close      Volume  price_return  log_return  \
Timestamp                                                                   
2012-02-03  6.26  6.35  5.93   6.29  283.382106      0.004792    0.004781   
2012-02-04  6.29  6.50  5.94   6.50   67.694994      0.033386    0.032841   
2012-02-05  6.50  6.50  5.70   5.70   49.866684     -0.123077   -0.131336   
2012-02-06  5.70  6.15  5.20   5.90   26.362078      0.035088    0.034486   
2012-02-07  5.90  5.90  5.50   5.51  151.424746     -0.066102   -0.068388   

               RSI_14  MACD_12_26_9  MACDh_12_26_9  ...   ATRr_14         OBV  \
Timestamp                                           ...                         
2012-02-03  52.119945     -0.113606      -0.000737  ...  0.909106  243.558728   
2012-02-

In [4]:
# Loop, train, predict, evaluate
#
# For every model: fit it, forecast over the test window, score the forecast
# against y_test, and keep both the metrics and the forecast in `results`
# (keyed by model.model_name) for the plotting cell.
models_to_run = [
    model_xgb,
    model_arima,
    model_sarima,
    model_prophet,
    model_rf,
    model_lgbm
]
results = {}

# Models that are fitted on the historical target series only. They are given
# None in place of X_train to make explicit that the engineered features are
# not used for training.
TARGET_ONLY_MODELS = {"ARIMA", "SARIMA", "Prophet"}

for model in models_to_run:
    if model.model_name in TARGET_ONLY_MODELS:
        # Univariate / specialized time-series models: target series only.
        model.train(None, y_train)
    else:
        # Standard supervised ML models: engineered features -> target.
        model.train(X_train, y_train)

    # Every model is asked to forecast over the same test window.
    predictions = model.predict(X_test)

    # Attach the test index to the forecast *by position*. Going through
    # .to_numpy() first drops any index the model put on its output:
    # pd.Series(<Series>, index=...) would otherwise reindex by label and
    # silently fill non-matching labels with NaN.
    predictions_series = pd.Series(
        pd.Series(predictions).to_numpy(),
        index=y_test.index,
    )

    # Evaluate and store the index-aligned forecast (previously the aligned
    # series was built but the raw predictions were evaluated and stored).
    metrics = model.evaluate(y_test, predictions_series)
    results[model.model_name] = {
        'metrics': metrics,
        'predictions': predictions_series,
    }

print("\n----- All models have been run. -----")

Training XGBoost...
Training complete.
Predicting with XGBoost...
--- XGBoost Metrics ---
RMSE: 20967.0577
MAE: 12478.1693
Training ARIMA by finding the best parameters...
Performing stepwise search to minimize aic
 ARIMA(1,1,1)(0,0,0)[0] intercept   : AIC=63088.859, Time=0.46 sec
 ARIMA(0,1,0)(0,0,0)[0] intercept   : AIC=63102.655, Time=0.03 sec
 ARIMA(1,1,0)(0,0,0)[0] intercept   : AIC=63093.963, Time=0.07 sec
 ARIMA(0,1,1)(0,0,0)[0] intercept   : AIC=63094.032, Time=0.07 sec
 ARIMA(0,1,0)(0,0,0)[0]             : AIC=63100.799, Time=0.02 sec
 ARIMA(2,1,1)(0,0,0)[0] intercept   : AIC=63097.938, Time=0.33 sec
 ARIMA(1,1,2)(0,0,0)[0] intercept   : AIC=63097.967, Time=0.35 sec
 ARIMA(0,1,2)(0,0,0)[0] intercept   : AIC=63096.003, Time=0.27 sec
 ARIMA(2,1,0)(0,0,0)[0] intercept   : AIC=63095.920, Time=0.17 sec
 ARIMA(2,1,2)(0,0,0)[0] intercept   : AIC=63089.003, Time=1.57 sec
 ARIMA(1,1,1)(0,0,0)[0]             : AIC=63086.974, Time=0.34 sec
 ARIMA(0,1,1)(0,0,0)[0]             : AIC=63092.

13:46:37 - cmdstanpy - INFO - Chain [1] start processing
13:46:39 - cmdstanpy - INFO - Chain [1] done processing


Training complete.
Predicting with Prophet...
--- Prophet Metrics ---
RMSE: 33072.2554
MAE: 26102.2185
Training Random Forest...
Training complete.
Predicting with Random Forest...
--- Random Forest Metrics ---
RMSE: 19464.5832
MAE: 10828.3786
Training LightGBM...
Training complete.
Predicting with LightGBM...
--- LightGBM Metrics ---
RMSE: 20817.0635
MAE: 12185.6740

----- All models have been run. -----


In [None]:
import plotly.graph_objects as go

# Line colour used for each model's forecast trace. Model names that are
# missing from this map fall back to grey when the traces are built.
color_map = {
    'XGBoost': 'cyan', 'ARIMA': 'orange', 'SARIMA': 'gold',
    'Prophet': 'royalblue', 'Random Forest': 'lightgreen', 'LightGBM': 'magenta',
}

# Start from an empty figure and add traces one at a time.
fig = go.Figure()

# Actual test-set prices: a thick white line so the truth stands out.
actual_trace = go.Scatter(
    x=y_test.index,
    y=y_test,
    mode='lines',
    name='Actual Future Price',
    line={'color': 'white', 'width': 3},
)
fig.add_trace(actual_trace)

# One thin dotted line per model, all drawn against the test index.
for forecast_name, forecast_result in results.items():
    forecast_line = {
        'color': color_map.get(forecast_name, 'grey'),
        'width': 1.5,
        'dash': 'dot',
    }
    forecast_trace = go.Scatter(
        x=y_test.index,
        y=forecast_result['predictions'],
        mode='lines',
        name=f'{forecast_name} Forecast',
        line=forecast_line,
    )
    fig.add_trace(forecast_trace)

# Titles, dark theme, and a horizontal legend anchored above the plot area
# at its right edge.
legend_options = {
    'orientation': "h",
    'yanchor': "bottom",
    'y': 1.02,
    'xanchor': "right",
    'x': 1,
}
fig.update_layout(
    title='Model Forecast vs. Actual Price',
    xaxis_title='Date',
    yaxis_title='Bitcoin Price (USD)',
    template='plotly_dark',
    legend=legend_options,
)

fig.show()