In [27]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [28]:
import torch
import json
from sklearn.metrics import r2_score, root_mean_squared_error
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

from pipeline.walkforward import WFCVGenerator
from config.config_types import AppConfig
from utils.paths import CONFIG_DIR, VOL_EXPERIMENTS_DIR, DATA_DIR, PRICE_EXPERIMENTS_DIR
from models import create_model 


## OLS

In [43]:

# fix a fold
fold_num = 0
name = "exp_003_mlp_60_windows"
name = "exp_007_mlp_40_sliding"
name  = "exp_007_mlp_40_sliding"

name  = "exp_011_mlp_40"
trial = "trial_20251029_182517" 


# -------- load config --------
base = f"{PRICE_EXPERIMENTS_DIR}/{name}/{trial}/"

conifg_path = f"{base}config_snapshot.json"

with open(conifg_path, 'r') as f:
    cfg = json.load(f)

cfg = cfg["cfg"]

cfg = AppConfig.from_dict(cfg)

print(cfg.experiment)
print(cfg.walkforward)

if cfg.data["df_master"] is not None:
    df_master_path =  cfg.data["df_master"]
    df_master = pd.read_parquet(f"{DATA_DIR}/{df_master_path}")
    #print(f"provided df master: {df_master_path}\n{df_master.head()}")
else:
    df_master = None


ExperimentConfig(name='mlp_40', hyperparams_search=False, monitor='val_loss', mode='min', type='price_prediction', n_trials=10, random_state=42)
WFConfig(target_col='ret', lookback=0, ratio_train=3, ratio_val=1, ratio_test=1, step=251, lags=40, max_folds=None, scale=True, clip=False)


In [44]:

wf = WFCVGenerator(config=cfg.walkforward)
test_data = {}
data = {}
for i, fold_data in enumerate(wf.folds()):

    X_test = fold_data[4]
    y_test = fold_data[5]

    test_data[i] = [X_test, y_test]
    data[i] = fold_data
    if i == fold_num:
        break


In [40]:
X_tr, y_tr, X_val, y_val, X_test, y_test = data[fold_num] 

for arr in data[fold_num]:
    print(f"{np.mean(arr):.4f}, {np.std(arr):.4f}, {np.min(arr):.4f}, {np.max(arr):.4f}")

0.0000, 1.0000, -25.3492, 26.4821
0.0000, 1.0000, -25.3237, 26.4271
0.0421, 0.6408, -10.8975, 15.6728
0.0521, 0.6180, -10.8754, 15.6401
0.0106, 0.5195, -10.7222, 9.6848
0.0091, 0.5099, -10.7004, 7.2176


In [41]:


import statsmodels.api as sm


X_tr_const = sm.add_constant(X_tr)
X_te_const = sm.add_constant(X_test)
results = []
for j in range(y_tr.shape[1]):
    res = sm.OLS(y_tr[:, j].squeeze(), X_tr_const).fit()
    results.append(res)
    print(f"\n=== Target {j} ===")
    print(f"Rsq: {res.rsquared:.5f}")
    #print(res.summary())



=== Target 0 ===
Rsq: 0.00719


In [42]:
# create model
hparams = cfg.model.hparams
input_shape = (cfg.walkforward.lags,)
output_shape = cfg.walkforward.lookback+1
ckpt_path = f"{base}fold_{fold_num:03d}/model_best.pt"
checkpoint = torch.load(ckpt_path, map_location="cuda")
state_dict = {k.replace("_orig_mod.", ""): v for k, v in checkpoint["model_state"].items()}
model = create_model(cfg.model, input_shape, output_shape)
model.load_state_dict(state_dict)         
model.to("cuda").eval()

y_pred_test_nn = model(torch.as_tensor(X_test, dtype=torch.float32, device="cuda")).cpu().detach()
y_pred_train_nn = model(torch.as_tensor(X_tr, dtype=torch.float32, device="cuda")).cpu().detach()
y_pred_test_nn = y_pred_test_nn.cpu().numpy()
y_pred_train_nn = y_pred_train_nn.cpu().numpy()

i = 0
for dim in range(y_test.shape[1]):

    y_te_dim = y_test[:,dim]
    y_tr_dim = y_tr[:,dim]
    y_pred_test_ols = results[dim].predict(X_te_const)
    y_pred_tr_ols = results[dim].predict(X_tr_const)
    y_pred_test_nn_dim = y_pred_test_nn[:,dim]
    y_pred_train_nn_dim = y_pred_train_nn[:,dim]



    mse_test_ols = np.mean((y_te_dim - y_pred_test_ols)**2)
    mse_test_nn  = np.mean((y_te_dim - y_pred_test_nn_dim)**2)
    mse_tr_ols = np.mean((y_tr_dim - y_pred_tr_ols)**2)
    mse_tr_nn  = np.mean((y_tr_dim - y_pred_train_nn_dim)**2)

    dir_acc_test_ols = np.mean(np.sign(y_te_dim) == np.sign(y_pred_test_ols)) * 100
    dir_acc_test_nn = np.mean(np.sign(y_te_dim) == np.sign(y_pred_test_nn_dim)) * 100
    
    # Train set
    dir_acc_tr_ols = np.mean(np.sign(y_tr_dim) == np.sign(y_pred_tr_ols)) * 100
    dir_acc_tr_nn = np.mean(np.sign(y_tr_dim) == np.sign(y_pred_train_nn_dim)) * 100

    print(f'Dim: {i}')
    print("Model\tTrain MSE\tTrain Acc\tTest MSE\tTest Acc")
    print(f"OLS\t{mse_tr_ols:.5f}, \t{dir_acc_tr_ols:.2f}%, \t{mse_test_ols:.5f}, \t{dir_acc_test_ols:.2f}%")
    print(f"MLP\t{mse_tr_nn:.5f}, \t{dir_acc_tr_nn:.2f}%, \t{mse_test_nn:.5f}, \t{dir_acc_test_nn:.2f}%")
    print("-" * 50)

    i += 1
    

Dim: 0
Model	Train MSE	Train Acc	Test MSE	Test Acc
OLS	0.99281, 	52.00%, 	0.26191, 	49.57%
MLP	0.48345, 	65.20%, 	0.30437, 	50.72%
--------------------------------------------------


In [34]:


#plt.figure(figsize=(12,8))
#sns.scatterplot(x=y_test_dim, y=y_pred_ols, s=25, alpha=0.6, edgecolor=None)
#plt.xlabel("True")
#plt.ylabel("Predicted")
#plt.legend()
#plt.tight_layout()
#plt.show()


## Ridge

In [35]:
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np
import statsmodels.api as sm

# fix a fold
fold = 0

X_tr, y_tr, X_val, y_val, X_test, y_test = data[fold] 

X_tr_const = sm.add_constant(X_tr)
X_test_const = sm.add_constant(X_test)

# --- Define ridge regression (alpha = λ penalty strength) ---
ridge = Ridge(alpha=2, fit_intercept=False)  # intercept already added
ridge.fit(X_tr_const, y_tr)

y_pred_tr_ridge = ridge.predict(X_tr_const)
y_pred_te_ridge = ridge.predict(X_test_const)

r2_train = r2_score(y_tr, y_pred_tr_ridge)
r2_test = r2_score(y_test, y_pred_te_ridge)
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_te_ridge))

print(f"R² train: {r2_train:.4f}, R² test: {r2_test:.4f}, RMSE test: {rmse_test:.6f}")

# --- View coefficients (with names if available) ---
coef_names = ["const"] + [f"x{i}" for i in range(X_tr.shape[1])]
for name, coef in zip(coef_names, ridge.coef_):
    print(f"{name:<8} {coef: .6f}")


R² train: 0.0072, R² test: -0.0073, RMSE test: 0.511771
const     0.000000
x0       -0.003493
x1        0.003752
x2       -0.007206
x3       -0.014402
x4       -0.006406
x5       -0.001036
x6       -0.028039
x7       -0.027452
x8       -0.017108
x9       -0.010208
x10      -0.010113
x11       0.019849
x12      -0.000167
x13       0.001001
x14       0.006934
x15      -0.009141
x16       0.013288
x17       0.003159
x18      -0.005657
x19      -0.008736
x20      -0.012195
x21      -0.023765
x22      -0.014709
x23      -0.000548
x24       0.018021
x25      -0.001526
x26      -0.010829
x27       0.014869
x28       0.010282
x29      -0.003309
x30      -0.017251
x31       0.004291
x32      -0.010054
x33       0.003491
x34      -0.016074
x35      -0.010637
x36      -0.000879
x37       0.000836
x38      -0.036394
x39       0.006292


In [36]:
y_pred_te_ridge

array([-0.00994603,  0.02722205,  0.00691171, ...,  0.00945086,
       -0.04514044, -0.05301743], shape=(104916,))

In [37]:
from sklearn.metrics import r2_score

r2 = r2_score(y_test, y_pred_te_ridge)
print(f"R² = {r2:.4f}")

plt.figure(figsize=(12,8))
sns.scatterplot(x=y_test, y=y_pred_te_ridge, s=25, alpha=0.6, edgecolor=None)
plt.xlabel("True")
plt.ylabel("Predicted")
plt.legend()
plt.tight_layout()
plt.show()


R² = -0.0073


ValueError: Per-column arrays must each be 1-dimensional

<Figure size 1200x800 with 0 Axes>

## ARIMA