# Forecasting EURUSD volatility

In [1]:
import os
if os.getcwd().split(os.sep)[-1] == "notebooks":
    os.chdir("..")

import plotly.express as px
import numpy as np
import pandas as pd

from data import retrieve_data, add_str_dates, add_returns, resample_frequency
from utils import evaluate_model

In [2]:
underlying = "aaplususd"
underlying_name = "AAPL/USD"

## Data generation

In [3]:
raw_data = retrieve_data("aaplususd", "2025-01-01", "2025-06-01")

In [4]:
raw_data.dropna(inplace=True)

## Pre treatment

In [5]:
pre_data = add_str_dates(raw_data, unit="m")

In [6]:
pre_data.head()

Unnamed: 0_level_0,timestamp,open,high,low,close,volume,datetime
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2025-01-02 14:30:00,28930470.0,248.866,249.026,246.296,246.336,12.024,2025-01-02 14:30:00
2025-01-02 14:35:00,28930475.0,246.297,246.397,245.067,245.207,17.292,2025-01-02 14:35:00
2025-01-02 14:40:00,28930480.0,245.207,245.796,244.796,245.176,12.3,2025-01-02 14:40:00
2025-01-02 14:45:00,28930485.0,245.177,245.926,244.956,245.857,13.824,2025-01-02 14:45:00
2025-01-02 14:50:00,28930490.0,245.857,246.456,245.496,246.396,12.804,2025-01-02 14:50:00


In [7]:
px.scatter(pre_data.iloc[::5], x="datetime", y="close", title=underlying_name)

In [8]:
add_returns(pre_data)
pre_data.head()

Unnamed: 0_level_0,timestamp,open,high,low,close,volume,datetime,session_break,session_id,log_close,returns,z_score
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2025-01-02 14:30:00,28930470.0,248.866,249.026,246.296,246.336,12.024,2025-01-02 14:30:00,False,0,5.506696,0.0,
2025-01-02 14:35:00,28930475.0,246.297,246.397,245.067,245.207,17.292,2025-01-02 14:35:00,False,0,5.502103,-0.004594,
2025-01-02 14:40:00,28930480.0,245.207,245.796,244.796,245.176,12.3,2025-01-02 14:40:00,False,0,5.501976,-0.000126,
2025-01-02 14:45:00,28930485.0,245.177,245.926,244.956,245.857,13.824,2025-01-02 14:45:00,False,0,5.50475,0.002774,
2025-01-02 14:50:00,28930490.0,245.857,246.456,245.496,246.396,12.804,2025-01-02 14:50:00,False,0,5.50694,0.00219,


In [9]:
resampled_data = resample_frequency(pre_data, frequency="1h")
resampled_data.head()

Unnamed: 0_level_0,timestamp,datetime,open,high,low,close,volume,var,vol,session_break,session_id,returns
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2025-01-02 14:00:00,28930470.0,2025-01-02 14:30:00,248.866,249.026,244.796,246.846,79.548,1.2e-05,0.003505,False,0.0,0.002068
2025-01-02 15:00:00,28930500.0,2025-01-02 15:00:00,246.846,247.796,244.806,245.386,113.532,3e-06,0.001811,False,0.0,-0.005932
2025-01-02 16:00:00,28930560.0,2025-01-02 16:00:00,245.386,245.457,244.156,244.657,85.932,2e-06,0.001279,False,0.0,-0.002975
2025-01-02 17:00:00,28930620.0,2025-01-02 17:00:00,244.676,245.217,242.296,242.946,104.592,2e-06,0.001418,False,0.0,-0.007018
2025-01-02 18:00:00,28930680.0,2025-01-02 18:00:00,242.947,243.016,241.807,242.226,98.364,1e-06,0.001134,False,0.0,-0.002968


## Model HAR-RV

In [10]:
resampled_data["var_1"] = resampled_data.groupby("session_id")["var"].shift(1)
# daily = 7h
resampled_data["var_d"] = resampled_data.groupby("session_id")["var"].rolling(window=7, min_periods=5).mean().shift(1).reset_index(level=0, drop=True)
# weekly = a session
resampled_data["var_w"] = resampled_data.groupby("session_id")["var"].transform("mean").shift(1)

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

train_part_pct = 0.7
model_data = resampled_data[["var", "var_1", "var_d", "var_w"]].dropna().copy()

X = scaler.fit_transform(model_data[["var_1", "var_d", "var_w"]].values)
y = model_data["var"].values
X_train, X_test = X[:int(len(X)*train_part_pct)], X[int(len(X)*train_part_pct):]
y_train, y_test = y[:int(len(y)*train_part_pct)], y[int(len(y)*train_part_pct):]

### Linear Regression

In [17]:
from sklearn.linear_model import LinearRegression
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
y_pred = lr_model.predict(X_test)

y_pred = np.maximum(y_pred, 0)
print("Coeffs:", lr_model.coef_)
print("Intercept:", lr_model.intercept_)
evaluate_model(y_test, y_pred, model_name="Linear Regression")

Coeffs: [-1.43443073e-06 -3.24003340e-05  4.05979271e-05]
Intercept: 3.4317150227647223e-06
Linear Regression - MSE: 6.366515923863387e-13, MAE: 5.458384494146441e-07


In [67]:
from sklearn.utils import resample

coeffs = []
for _ in range(10000):
    x_sample, y_sample = resample(X, y)
    model = LinearRegression()
    model.fit(x_sample, y_sample)
    coeffs.append(model.coef_ + [model.intercept_])

std_devs = np.std(coeffs, axis=0)
means = np.mean(coeffs, axis=0)
ratios  = np.abs(std_devs / means)
print("Coefficient stability (std dev):", std_devs)
print("Coefficient stability (mean):", means)
print("Coefficient stability (std/mean):", ratios)

Coefficient stability (std dev): [4.05957189e-06 1.03337176e-05 1.13127954e-05]
Coefficient stability (mean): [ 4.59985271e-06 -2.65661003e-05  4.03639520e-05]
Coefficient stability (std/mean): [0.88254389 0.38898135 0.28026977]


### Ridge

In [38]:
from sklearn.linear_model import Ridge
alpha = 0.5
lr_model = Ridge(alpha=alpha)
lr_model.fit(X_train, y_train)
y_pred = lr_model.predict(X_test)

y_pred = np.maximum(y_pred, 0)
print("Coeffs:", lr_model.coef_)
print("Intercept:", lr_model.intercept_)
evaluate_model(y_test, y_pred, model_name="Ridge Regression")

Coeffs: [-3.12519981e-07 -2.08762730e-05  2.82692344e-05]
Intercept: 3.4488006744608897e-06
Linear Regression - MSE: 6.174107158129434e-13, MAE: 4.818314209152372e-07


In [72]:
coeffs = []
for _ in range(10000):
    x_sample, y_sample = resample(X, y)
    model = Ridge(alpha=alpha)
    model.fit(x_sample, y_sample)
    coeffs.append(model.coef_ + [model.intercept_])

std_devs = np.std(coeffs, axis=0)
means = np.mean(coeffs, axis=0)
ratios  = np.abs(std_devs / means)
print("Coefficient stability (std dev):", std_devs)
print("Coefficient stability (mean):", means)
print("Coefficient stability (std/mean):", ratios)

Coefficient stability (std dev): [5.46750730e-06 6.83266350e-06 8.54264373e-06]
Coefficient stability (mean): [ 7.08725150e-06 -1.27373413e-05  2.57129501e-05]
Coefficient stability (std/mean): [0.77145665 0.53642776 0.33223118]
