# **Packages import**

In [5]:
import pandas as pd
from arch import arch_model
import numpy as np
import seaborn as sns
import os
import matplotlib.pyplot as plt

# **Model training**

In [None]:
df_lagged = pd.read_csv("final_dataset.csv")

In [48]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


In [58]:
TARGET = "vol_future_2w"


df_model = df_lagged[["Date","vol_future_2w_1", "vol_future_2w_2", "vol_future_2w_3", "Performances glissantes 1 semaine", "rendement_lag_1", "garch_vol", "Variation %", TARGET]]

X = df_model[["vol_future_2w_1", "vol_future_2w_2", "vol_future_2w_3", "Performances glissantes 1 semaine", "rendement_lag_1", "garch_vol", "Variation %"]]
y = df_model[TARGET]

In [52]:
X

Unnamed: 0,vol_future_2w_1,vol_future_2w_2,vol_future_2w_3,Performances glissantes 1 semaine,rendement_lag_1,garch_vol,Variation %
0,0.002110,0.004182,0.006394,0.19,0.56,0.005777,1.382306
1,0.002300,0.002110,0.004182,0.23,0.19,0.005896,-0.827123
2,0.004610,0.002300,0.002110,0.23,0.23,0.005920,-0.696358
3,0.004539,0.004610,0.002300,0.61,0.23,0.005929,-1.441762
4,0.003482,0.004539,0.004610,0.20,0.61,0.005852,-0.126193
...,...,...,...,...,...,...,...
226,0.017265,0.014799,0.009153,1.87,0.94,0.006384,-3.772092
227,0.013888,0.017265,0.014799,1.57,1.87,0.007349,-2.645324
228,0.008930,0.013888,0.017265,1.18,1.57,0.007228,1.339285
229,0.003946,0.008930,0.013888,0.45,1.18,0.006714,-0.526713


## **XGBoost**

In [54]:
from xgboost import XGBRegressor

In [55]:
def dynamic_forecast_with_retraining(df, model_params, target_col='vol_future_2w'):
    """
    Iteratively predict missing values in `target_col` and retrain the model after each prediction.
    
    Parameters:
        df: DataFrame containing your last rows with NaNs
        model_params: dict of XGBRegressor parameters
        target_col: column to forecast
        lag_cols: lagged features used for prediction
        
    Returns:
        df: DataFrame with NaNs in target_col filled
    """
    df = df.copy()
    
    # Split into known and unknown indices
    known_idx = df[~df[target_col].isna()].index
    unknown_idx = df[df[target_col].isna()].index
    
    # Initial training set
    X_train_dyn = df.loc[known_idx, :]
    y_train_dyn = df.loc[known_idx, target_col]
    
    predictions = []

    for idx in unknown_idx:
        # Initialize and fit model
        xgb_dyn = XGBRegressor(
            **model_params,
            objective='reg:squarederror',
            eval_metric='rmse',
            random_state=42,
            n_jobs=-1
        )
        xgb_dyn.fit(X_train_dyn, y_train_dyn, verbose=0)

        # Predict next value
        X_pred = df.loc[idx, :].values.reshape(1, -1)
        y_pred = xgb_dyn.predict(X_pred)[0]
        
        # Save prediction
        df.loc[idx, target_col] = y_pred
        predictions.append(y_pred)
        
        # Update lagged features for next row
        next_idx = idx + 1
        if next_idx in df.index:
            df.loc[next_idx, 'vol_future_2w_1'] = df.loc[idx, target_col]
            df.loc[next_idx, 'vol_future_2w_2'] = df.loc[idx - 1, target_col] if idx - 1 in df.index else df.loc[idx, 'vol_future_2w_1']
            df.loc[next_idx, 'vol_future_2w_3'] = df.loc[idx - 2, target_col] if idx - 2 in df.index else df.loc[idx, 'vol_future_2w_2']
        
        # Extend training set with the newly predicted value
        X_train_dyn = pd.concat([X_train_dyn, df.loc[[idx], :]])
        y_train_dyn = pd.concat([y_train_dyn, df.loc[[idx], target_col]])

    return df, predictions


In [56]:
best_params = {
    'max_depth': 3,
    'min_child_weight': 5,
    'subsample': 0.8,
    'colsample_bytree': 1,
    'learning_rate': 0.01,
    'n_estimators': 1200
}

In [57]:
df_filled, preds = dynamic_forecast_with_retraining(df_model, best_params)
df_filled.tail()

Unnamed: 0,vol_future_2w_1,vol_future_2w_2,vol_future_2w_3,Performances glissantes 1 semaine,rendement_lag_1,garch_vol,Variation %,vol_future_2w
226,0.017265,0.014799,0.009153,1.87,0.94,0.006384,-3.772092,0.013888
227,0.013888,0.017265,0.014799,1.57,1.87,0.007349,-2.645324,0.00893
228,0.00893,0.013888,0.017265,1.18,1.57,0.007228,1.339285,0.003946
229,0.003946,0.00893,0.013888,0.45,1.18,0.006714,-0.526713,0.029696
230,0.029696,0.003946,0.00893,0.33,0.45,0.006186,0.548456,0.031095


In [60]:
df_filled_final = pd.concat([df_lagged[["Date"]], df_filled], axis=1)
df_filled_final

Unnamed: 0,Date,vol_future_2w_1,vol_future_2w_2,vol_future_2w_3,Performances glissantes 1 semaine,rendement_lag_1,garch_vol,Variation %,vol_future_2w
0,2021-01-29,0.002110,0.004182,0.006394,0.19,0.56,0.005777,1.382306,0.002300
1,2021-02-12,0.002300,0.002110,0.004182,0.23,0.19,0.005896,-0.827123,0.004610
2,2021-02-19,0.004610,0.002300,0.002110,0.23,0.23,0.005920,-0.696358,0.004539
3,2021-02-26,0.004539,0.004610,0.002300,0.61,0.23,0.005929,-1.441762,0.003482
4,2021-03-05,0.003482,0.004539,0.004610,0.20,0.61,0.005852,-0.126193,0.003377
...,...,...,...,...,...,...,...,...,...
226,2025-11-14,0.017265,0.014799,0.009153,1.87,0.94,0.006384,-3.772092,0.013888
227,2025-11-21,0.013888,0.017265,0.014799,1.57,1.87,0.007349,-2.645324,0.008930
228,2025-11-28,0.008930,0.013888,0.017265,1.18,1.57,0.007228,1.339285,0.003946
229,2025-12-05,0.003946,0.008930,0.013888,0.45,1.18,0.006714,-0.526713,0.029696


In [61]:
df_filled_final.to_csv("volatility_forecasted_dataset.csv", index = False)