# Libraries

In [None]:
import numpy as np
import chardet
import joblib
from collections import defaultdict
import os
import re
import json

In [None]:

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt 
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.io as pio
import plotly.colors as pc
%matplotlib inline  
import psutil
from pathlib import Path
#from Functions import *
from scipy.io import loadmat
import glob
from scipy.signal import savgol_filter
from tqdm import tqdm
from sysidentpy.model_structure_selection import FROLS
from sysidentpy.basis_function import Polynomial
from sysidentpy.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
import time
import psutil
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.multioutput import MultiOutputRegressor
import optuna
import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, r2_score

from sklearn.model_selection import KFold
from sklearn.model_selection import TimeSeriesSplit 

from optuna.pruners import MedianPruner



In [None]:
# Pin the random sources used in this notebook to one fixed seed (42)
import random

os.environ['PYTHONHASHSEED'] = '42'
np.random.seed(42)
random.seed(42)

# Functions

In [None]:

def predict_recursive_series(model, X_df, output_cols, X_feature_names, na,
                             value_threshold=5):
    """
    Predict recursively one step at a time using lag updates.

    At every step the model's own prediction is written into the
    ``<output>_lag_1`` feature of the next row, and the older lags are
    shifted by one, so errors propagate as they would in free-run simulation.

    Parameters
    ----------
    model : object with a ``predict`` method
        Trained model
    X_df : pd.DataFrame
        Input features (with lag columns, same format as training)
    output_cols : list of str
        List of output columns (e.g., ['heave', 'pitch', ...])
    X_feature_names : list
        Ordered list of input feature names expected by the model
    na : int
        Number of output lags. ``0`` means no feedback: the model is applied
        to the whole frame in one call.
    value_threshold : float, optional
        Abort the recursion as soon as any value of the current input row
        exceeds this magnitude (default 5, the previously hard-coded value).

    Returns
    -------
    y_pred_df : pd.DataFrame
        Recursive prediction results, one row per row of ``X_df``.
        All zeros if the recursion was aborted.
    x_used_df : pd.DataFrame
        Input rows actually used at each timestep (after lag updates).
        For ``na == 0`` this is a copy of ``X_df``.
    """
    import numpy as np
    import pandas as pd

    output_cols = list(output_cols)
    X_feature_names = list(X_feature_names)

    if na == 0:
        # Select/order the features exactly as the recursive branch does, so
        # extra bookkeeping columns in X_df do not reach the model.
        y_pred = model.predict(X_df[X_feature_names])
        return pd.DataFrame(y_pred, columns=output_cols), X_df.copy()

    n_steps = len(X_df)
    n_outputs = len(output_cols)

    if n_steps == 0:
        # Nothing to simulate; X_df.iloc[0] below would raise IndexError.
        return (
            pd.DataFrame(np.zeros((0, n_outputs)), columns=output_cols),
            pd.DataFrame(columns=X_feature_names),
        )

    y_pred = np.zeros((n_steps, n_outputs))
    x_used_rows = []

    # Start with the first input row (its lag columns are the initial state)
    x_row = X_df.iloc[0].copy()

    for t in range(n_steps):

        # A diverging recursion shows up as values far outside the scaled range
        if np.any(np.abs(x_row.values) > value_threshold):
            print(f"[ABORT] Found large value at step t={t}. Aborting prediction.")
            return pd.DataFrame(np.zeros((n_steps, n_outputs)), columns=output_cols), pd.DataFrame(x_used_rows)

        # Build input feature vector for model, in the order it was trained on
        x_input = pd.DataFrame([[x_row[feat] for feat in X_feature_names]], columns=X_feature_names)

        # Save the input row used at this step
        x_used_rows.append(x_input.iloc[0])

        # Single-output models return shape (1,); normalise to (1, n_outputs)
        pred = np.asarray(model.predict(x_input))
        if pred.ndim == 1:
            pred = pred.reshape(1, -1)
        pred = pred[0]

        y_pred[t] = pred

        if t < n_steps - 1:
            # Exogenous features come from the data; output lags are overwritten
            x_row_next = X_df.iloc[t + 1].copy()

            for j, col in enumerate(output_cols):
                # Shift lags: lag_n = lag_{n-1}, ..., lag_2 = lag_1
                for lag in range(na, 1, -1):
                    x_row_next[f"{col}_lag_{lag}"] = x_row[f"{col}_lag_{lag - 1}"]
                # Set lag_1 to current prediction
                x_row_next[f"{col}_lag_1"] = pred[j]

            x_row = x_row_next

    y_pred_df = pd.DataFrame(y_pred, columns=output_cols)
    x_used_df = pd.DataFrame(x_used_rows)

    return y_pred_df, x_used_df


In [None]:

import pandas as pd
import numpy as np

def build_arx_lagged_with_scalers(
    df,
    input_cols,
    output_cols,
    scaler_X_func,
    scaler_y_func,
    na=0,
    nb_past=0,
    nf_future=0,
    test_name_col='test_name',
    y_initial_mode='original'  # 'original' ➔ normal; 'zero' ➔ prepend zero rows after scaling
):
    """
    Build ARX lagged data from a single DataFrame using custom scaler functions.

    Each test case (unique value of ``test_name_col``) is scaled and lagged
    independently, so lags never cross the boundary between two test cases.

    Parameters:
    - df: original DataFrame (raw, unscaled) with test_name column
    - input_cols: list of input column names to scale
    - output_cols: list of output column names to scale
    - scaler_X_func: callable applied to df[input_cols]; its result is wrapped
      in a DataFrame with columns ``input_cols``
    - scaler_y_func: callable applied to df[output_cols]; its result is wrapped
      in a DataFrame with columns ``output_cols``
    - na: number of output lags (autoregressive)
    - nb_past: number of past input lags (exogenous input past)
    - nf_future: number of future input lags (exogenous input preview/future)
    - test_name_col: column name that identifies test cases
    - y_initial_mode:
        'original' ➔ the first max(na, nb_past) samples of each test case are
                     used only as initial conditions and produce no row;
        'zero'     ➔ max(na, nb_past) zero rows are prepended (in scaled
                     space), so every original sample produces a row.
      In both modes the last ``nf_future`` samples produce no row because
      their future inputs are not available.

    Returns:
    - X_lagged_df: lagged input DataFrame. Column order: current inputs,
      ``<out>_lag_<k>``, ``<in>_past_<k>``, ``<in>_future_<k>``, test_name_col
    - y_target_df: target output DataFrame (output_cols + test_name_col)
    - y_initial_df: DataFrame of initial output values used as initial
      conditions (empty when na == 0)

    Raises:
    - ValueError: if ``y_initial_mode`` is neither 'original' nor 'zero'
    """
    if y_initial_mode not in ('original', 'zero'):
        raise ValueError(
            f"y_initial_mode must be 'original' or 'zero', got {y_initial_mode!r}"
        )

    input_cols = list(input_cols)
    output_cols = list(output_cols)

    # Number of leading samples needed before the first complete lag window
    lag_required = max(na, nb_past)

    X_df_list = []
    Y_df_list = []
    Y_initial_list = []

    for test in df[test_name_col].unique():
        df_test = df[df[test_name_col] == test].copy()

        # Apply scaling functions first, then work in scaled space only
        X_scaled_df = pd.DataFrame(scaler_X_func(df_test[input_cols]), columns=input_cols)
        y_scaled_df = pd.DataFrame(scaler_y_func(df_test[output_cols]), columns=output_cols)

        if y_initial_mode == 'zero':
            # Prepend zero rows so the very first real sample has a full lag window
            zero_inputs = pd.DataFrame(
                np.zeros((lag_required, len(input_cols))), columns=input_cols
            )
            zero_outputs = pd.DataFrame(
                np.zeros((lag_required, len(output_cols))), columns=output_cols
            )
            X_scaled_df = pd.concat([zero_inputs, X_scaled_df], ignore_index=True)
            y_scaled_df = pd.concat([zero_outputs, y_scaled_df], ignore_index=True)

        inputs = X_scaled_df.values
        outputs = y_scaled_df.values
        n_samples = len(X_scaled_df)

        # Rows are emitted for t in [start_idx, stop_idx). Starting at
        # lag_required in BOTH modes keeps every `t - lag` index >= 0; starting
        # at 0 would make NumPy's negative indexing wrap around and read lag
        # values from the END of the series.
        start_idx = lag_required
        n_rows = max(n_samples - nf_future - start_idx, 0)
        stop_idx = start_idx + n_rows

        if na > 0:
            if y_initial_mode == 'original':
                y_initial = y_scaled_df.iloc[:lag_required].copy()
            else:  # 'zero'
                y_initial = pd.DataFrame(np.zeros((na, len(output_cols))), columns=output_cols)

            y_initial[test_name_col] = test
            Y_initial_list.append(y_initial)

        # Build every feature column as one shifted slice of the series
        columns = {}

        # Current (scaled) input values
        for i, in_col in enumerate(input_cols):
            columns[in_col] = inputs[start_idx:stop_idx, i]

        # Output past lags (autoregressive)
        for lag in range(1, na + 1):
            for i, out_col in enumerate(output_cols):
                columns[f'{out_col}_lag_{lag}'] = outputs[start_idx - lag:stop_idx - lag, i]

        # Input past lags (exogenous)
        for lag in range(1, nb_past + 1):
            for i, in_col in enumerate(input_cols):
                columns[f'{in_col}_past_{lag}'] = inputs[start_idx - lag:stop_idx - lag, i]

        # Input future lags (preview control)
        for lag in range(1, nf_future + 1):
            for i, in_col in enumerate(input_cols):
                columns[f'{in_col}_future_{lag}'] = inputs[start_idx + lag:stop_idx + lag, i]

        # Explicit index keeps the row count right even with no feature columns
        X_df = pd.DataFrame(columns, index=pd.RangeIndex(n_rows))
        Y_df = pd.DataFrame(outputs[start_idx:stop_idx], columns=output_cols)

        # Add test_name to track
        X_df[test_name_col] = test
        Y_df[test_name_col] = test

        X_df_list.append(X_df)
        Y_df_list.append(Y_df)

    if not X_df_list:
        # No test cases at all: pd.concat would raise on an empty list
        return pd.DataFrame(), pd.DataFrame(), pd.DataFrame()

    # Combine all test cases into final DataFrames
    X_lagged_df = pd.concat(X_df_list, ignore_index=True)
    y_target_df = pd.concat(Y_df_list, ignore_index=True)

    if na > 0:
        y_initial_df = pd.concat(Y_initial_list, ignore_index=True)
    else:
        y_initial_df = pd.DataFrame()

    return X_lagged_df, y_target_df, y_initial_df


# Loading data

In [None]:
# Read the prepared train / validation / test splits from disk
df_train_full = pd.read_csv('prepared_data/train_data.csv')
df_val_full = pd.read_csv('prepared_data/val_data.csv')
df_test_full = pd.read_csv('prepared_data/test_data.csv')

# Preview the first rows of every split, in train / val / test order
for _split_df in (df_train_full, df_val_full, df_test_full):
    print(_split_df.head())


In [None]:
# Test case to work on; keep only its rows from the train and validation splits
case = 'Tp6p8s_Hs2m'
df_case_train = df_train_full.loc[df_train_full['test_name'] == case].copy()
df_case_val = df_val_full.loc[df_val_full['test_name'] == case].copy()

# Scaling Data

In [None]:

# Feature and target columns
input_cols = ['eta']
output_cols = ['heave', 'pitch', 'pendulum']

# Training split restricted to those columns (kept as DataFrames)
X_train = df_train_full[input_cols]
y_train = df_train_full[output_cols]

# One scaler per side, mapping each column to [-1, 1]; fitted on training data only
scaler_X = MinMaxScaler(feature_range=(-1, 1)).fit(X_train)
scaler_y = MinMaxScaler(feature_range=(-1, 1)).fit(y_train)

# Apply the fitted scalers to every split and re-wrap the arrays as DataFrames
X_train_scaled = pd.DataFrame(scaler_X.transform(X_train), columns=input_cols)
y_train_scaled = pd.DataFrame(scaler_y.transform(y_train), columns=output_cols)

X_val_scaled = pd.DataFrame(scaler_X.transform(df_val_full[input_cols]), columns=input_cols)
y_val_scaled = pd.DataFrame(scaler_y.transform(df_val_full[output_cols]), columns=output_cols)

X_test_scaled = pd.DataFrame(scaler_X.transform(df_test_full[input_cols]), columns=input_cols)
y_test_scaled = pd.DataFrame(scaler_y.transform(df_test_full[output_cols]), columns=output_cols)


In [None]:
# Define the transformation functions using DataFrame input/output.
# The column lists are captured as default arguments at definition time: the
# globals `input_cols` / `output_cols` are rebound by later cells, and a plain
# closure would read whatever value they hold when the function is *called*.
scaler_X_func = lambda df, _cols=list(input_cols): pd.DataFrame(scaler_X.transform(df[_cols]), columns=_cols)
scaler_y_func = lambda df, _cols=list(output_cols): pd.DataFrame(scaler_y.transform(df[_cols]), columns=_cols)


In [None]:
# Input scaler for the (eta, eta_velocity) feature set
scaler_X_vel = MinMaxScaler(feature_range=(-1, 1))

# Feature columns
input_cols = ['eta','eta_velocity']

# Extract training data as a DataFrame
X_train = df_train_full[input_cols]

# Fit on training data only
scaler_X_vel.fit(X_train)

# Transformation function with DataFrame input/output. The column list is
# frozen as a default argument because `input_cols` is rebound by later cells;
# reading the global at call time would hand this scaler the wrong columns.
scaler_X_func_vel = lambda df, _cols=list(input_cols): pd.DataFrame(scaler_X_vel.transform(df[_cols]), columns=_cols)



In [None]:
# Input scaler for the (eta, eta_velocity, eta_acceleration) feature set
scaler_X_all = MinMaxScaler(feature_range=(-1, 1))

# Feature columns
input_cols = ['eta','eta_velocity','eta_acceleration']

# Extract training data as a DataFrame
X_train = df_train_full[input_cols]

# Fit on training data only
scaler_X_all.fit(X_train)

# Transformation function with DataFrame input/output. The column list is
# frozen as a default argument so that later rebinding of the global
# `input_cols` cannot change which columns this scaler receives.
scaler_X_func_all = lambda df, _cols=list(input_cols): pd.DataFrame(scaler_X_all.transform(df[_cols]), columns=_cols)



In [None]:
output_cols = ['heave']

# Output scaler for heave only, fitted on training data
scaler_y_heave = MinMaxScaler(feature_range=(-1, 1))
scaler_y_heave.fit(df_train_full[output_cols])

# Scaling function. `output_cols` is rebound by later cells, so the column
# list is frozen as a default argument instead of being read at call time.
scaler_y_func_heave = lambda df, _cols=list(output_cols): pd.DataFrame(scaler_y_heave.transform(df[_cols]), columns=_cols)

In [None]:
output_cols = ['pitch']

# Output scaler for pitch only, fitted on training data (DataFrame, not NumPy array)
scaler_y_pitch = MinMaxScaler(feature_range=(-1, 1))
scaler_y_pitch.fit(df_train_full[output_cols])

# Scaling function. `output_cols` is rebound by later cells, so the column
# list is frozen as a default argument instead of being read at call time.
scaler_y_func_pitch = lambda df, _cols=list(output_cols): pd.DataFrame(scaler_y_pitch.transform(df[_cols]), columns=_cols)

In [None]:
output_cols = ['pendulum']

# Output scaler for pendulum only, fitted on training data (DataFrame, not NumPy array)
scaler_y_pend = MinMaxScaler(feature_range=(-1, 1))
scaler_y_pend.fit(df_train_full[output_cols])

# Scaling function. `output_cols` is rebound by later cells, so the column
# list is frozen as a default argument instead of being read at call time.
scaler_y_func_pend = lambda df, _cols=list(output_cols): pd.DataFrame(scaler_y_pend.transform(df[_cols]), columns=_cols)

# Hyperparameters Optimization


In [None]:
import os

# Output folder for this section; make sure it exists before anything is written
save_folder = 'Xgboost/XGBregressor'
os.makedirs(name=save_folder, exist_ok=True)

In [None]:
# Objective function for Optuna.
# Optuna calls it once per trial to score one hyperparameter combination.
def objective_0(trial):
    """Return the penalised R² of an XGBRegressor built from the trial's parameters.

    The model is fitted on the notebook globals ``X_train`` / ``y_train_target``
    and scored on that same data, so the returned value is an in-sample R²
    minus a small complexity penalty (a larger return value is better).

    NOTE(review): no validation split, cross-validation or early stopping is
    used here, and no intermediate value is reported through ``trial.report``,
    so a pruner configured on the study has nothing to act on. Confirm that
    selecting hyperparameters on training-set R² is intended.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        ``r2 - (0.0005 * max_depth + 0.000005 * n_estimators)``
    """
    params = {
        'n_estimators':trial.suggest_int('n_estimators', 10, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'subsample': trial.suggest_float('subsample', 0.7, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'objective': 'reg:squarederror',
        'gamma': trial.suggest_float('gamma', 0, 5),
        'random_state': 42,
        'tree_method': 'auto'
    }

    # Fit on the current training matrix (module-level globals)
    model = XGBRegressor(**params)
    model.fit(
        X_train, y_train_target,
        verbose=False,
    )

    # In-sample goodness of fit
    preds = model.predict(X_train)
    score = r2_score(y_train_target, preds)

    # Complexity penalties for both tree depth and number of trees, so that
    # among equally good fits the smaller model wins
    depth_penalty = 0.0005 * params['max_depth']
    tree_penalty = 0.000005 * params['n_estimators']
    complexity_penalty = depth_penalty + tree_penalty

    return score - complexity_penalty

We will find the optimal hyperparameters per DoF across different input-feature scenarios.

## 1- Heave

In [None]:
# Heave study: input features, target column and lag orders
input_cols = ['eta', 'eta_velocity', 'eta_acceleration']
output_cols = ['heave']

# na: output lags, nb: past input lags, nf: future input lags
na, nb, nf = 2, 0, 0


In [None]:
# Build the lagged (ARX) feature/target frames for the heave study.
# Both calls share every setting except the source frame and the way the
# initial lag window is handled.
_arx_common = dict(
    input_cols=input_cols,
    output_cols=output_cols,
    scaler_X_func=scaler_X_func_all,
    scaler_y_func=scaler_y_func_heave,
    na=na,
    nb_past=nb,
    nf_future=nf,
    test_name_col='test_name',
)

# Training set: lags padded with zeros
dfx_train, dfy_train, yi_train = build_arx_lagged_with_scalers(
    df=df_case_train, y_initial_mode='zero', **_arx_common
)
X_train = dfx_train.drop(columns='test_name')
y_train_target = dfy_train[output_cols].reset_index(drop=True)

# Validation set: original initial values
dfx_val, dfy_val, yi_val = build_arx_lagged_with_scalers(
    df=df_case_val, y_initial_mode='original', **_arx_common
)



In [None]:
# Call optuna to optimize the hyperparameters.
# The sampler is seeded so the sequence of suggested parameters is repeatable
# (the global random/numpy seeds do not reach Optuna's sampler). The timeout
# can still end the run after a machine-dependent number of trials.
# Pruner: don't prune the first 5 trials (give Optuna some data first), and
# don't prune a trial before it has reported at least 100 steps.
study_heave_0 = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
    pruner=MedianPruner(n_startup_trials=5, n_warmup_steps=100)
)

study_heave_0.optimize(objective_0, n_trials=250, timeout=600)  # 250 trials or 10 min
print("Best trial:")
print("  Value (R²):", study_heave_0.best_value)
print("  Params:", study_heave_0.best_params)


## 2- Pitch


In [None]:
# Pitch study: input features, target column and lag orders
input_cols = ['eta', 'eta_velocity', 'eta_acceleration']
output_cols = ['pitch']

# na: output lags, nb: past input lags, nf: future input lags
na, nb, nf = 2, 0, 0


In [None]:
# Build lagged data with scalers for training and validation sets
# This will create lagged features and apply the scalers to the input and output columns

dfx_train,dfy_train,yi_train = build_arx_lagged_with_scalers(
        df = df_case_train,
        input_cols  = input_cols,
        output_cols   = output_cols,
        scaler_X_func   = scaler_X_func_all,
        scaler_y_func   = scaler_y_func_pitch,
        na=na,
        nb_past=nb,
        nf_future=nf,
        test_name_col='test_name',
        y_initial_mode='zero'  # pad lags with zeros
)

# objective_0 reads the globals X_train / y_train_target. Both must be
# refreshed here; otherwise the pitch study would be fitted on the feature
# matrix left over from the previous study (with heave_lag_* columns).
X_train=dfx_train.drop(columns='test_name')
y_train_target = dfy_train[output_cols].reset_index(drop=True)

dfx_val,dfy_val,yi_val = build_arx_lagged_with_scalers(
        df = df_case_val,
        input_cols  = input_cols,
        output_cols   = output_cols,
        scaler_X_func   = scaler_X_func_all,
        scaler_y_func   = scaler_y_func_pitch,
        na=na,
        nb_past=nb,
        nf_future=nf,
        test_name_col='test_name',
        y_initial_mode='original'  # use the original initial values
)



In [None]:
# Call optuna to optimize the hyperparameters.
# The sampler is seeded so the sequence of suggested parameters is repeatable
# (the global random/numpy seeds do not reach Optuna's sampler). The timeout
# can still end the run after a machine-dependent number of trials.
# Pruner: don't prune the first 5 trials (give Optuna some data first), and
# don't prune a trial before it has reported at least 100 steps.
study_pitch = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
    pruner=MedianPruner(n_startup_trials=5, n_warmup_steps=100)
)
study_pitch.optimize(objective_0, n_trials=250, timeout=600)  # 250 trials or 10 min
print("Best trial:")
print("  Value (R²):", study_pitch.best_value)
print("  Params:", study_pitch.best_params)

## 3- Pendulum

In [None]:
# Pendulum study: input features, target column and lag orders
input_cols = ['eta', 'eta_velocity', 'eta_acceleration']
output_cols = ['pendulum']

# na: output lags, nb: past input lags, nf: future input lags
na, nb, nf = 2, 0, 0
                       

In [None]:
# Build lagged data with scalers for training and validation sets
# This will create lagged features and apply the scalers to the input and output columns
dfx_train,dfy_train,yi_train = build_arx_lagged_with_scalers(
        df = df_case_train,
        input_cols  = input_cols,
        output_cols   = output_cols,
        scaler_X_func   = scaler_X_func_all,
        scaler_y_func   = scaler_y_func_pend,
        na=na,
        nb_past=nb,
        nf_future=nf,
        test_name_col='test_name',
        y_initial_mode='zero'  # pad lags with zeros
)

# objective_0 reads the globals X_train / y_train_target. Both must be
# refreshed here; otherwise the pendulum study would be fitted on the feature
# matrix left over from a previous study (with another output's lag columns).
X_train=dfx_train.drop(columns='test_name')
y_train_target = dfy_train[output_cols].reset_index(drop=True)

dfx_val,dfy_val,yi_val = build_arx_lagged_with_scalers(
        df = df_case_val,
        input_cols  = input_cols,
        output_cols   = output_cols,
        scaler_X_func   = scaler_X_func_all,
        scaler_y_func   = scaler_y_func_pend,
        na=na,
        nb_past=nb,
        nf_future=nf,
        test_name_col='test_name',
        y_initial_mode='original'  # use the original initial values
)

                                                       

In [None]:
# Call optuna to optimize the hyperparameters.
# The sampler is seeded so the sequence of suggested parameters is repeatable
# (the global random/numpy seeds do not reach Optuna's sampler). The timeout
# can still end the run after a machine-dependent number of trials.
# Pruner: don't prune the first 5 trials (give Optuna some data first), and
# don't prune a trial before it has reported at least 100 steps.
study_pendulum = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
    pruner=MedianPruner(n_startup_trials=5, n_warmup_steps=100)
)

study_pendulum.optimize(objective_0, n_trials=250, timeout=600)  # 250 trials or 10 min
print("Best trial:")
print("  Value (R²):", study_pendulum.best_value)
print("  Params:", study_pendulum.best_params)

- Saving the studies and calculating the average of each hyperparameter's values for the multivariate model

In [None]:
# Save the best parameters for each study

best_params_dict = {
    'heave': study_heave_0.best_params,
    'pitch': study_pitch.best_params,
    'pendulum': study_pendulum.best_params
}

# The hyperparameters folder is not created anywhere else in this notebook,
# so make sure it exists before opening the file for writing.
_best_params_path = "Xgboost/hyperparameters/eta_Vel_acc/best_params_eta_Vel_acc.json"
os.makedirs(os.path.dirname(_best_params_path), exist_ok=True)

with open(_best_params_path, "w") as f:
    json.dump(best_params_dict, f, indent=4)


In [None]:
# Read the stored per-DoF best parameters back from disk
with open("Xgboost/hyperparameters/eta_Vel_acc/best_params_eta_Vel_acc.json", "r") as f:
    loaded_params = json.loads(f.read())

print("Loaded params:", loaded_params)

In [None]:
# Collect, for every hyperparameter name, the values found by each study
accumulator = defaultdict(list)
for params in loaded_params.values():
    for key, val in params.items():
        accumulator[key].append(val)

# Hyperparameters whose average is rounded to an integer
int_keys = {"max_depth", "n_estimators"}

# Average across studies, one hyperparameter at a time
averaged_params = {}
for k, v in accumulator.items():
    mean_value = np.mean(v)
    if k in int_keys:
        averaged_params[k] = int(np.round(mean_value))
    else:
        averaged_params[k] = float(mean_value)

print("Averaged parameters:", averaged_params)


# Choosing Best Input Feature case

- Hyperparameters are loaded according to the desired case.

## Multivariate Model

In [None]:
# Load the per-DoF best hyperparameters from the JSON file
with open("Xgboost/hyperparameters/eta_Vel_acc/best_params_eta_Vel_acc.json", "r") as f:
    loaded_params = json.loads(f.read())

print("Loaded params:", loaded_params)

In [None]:
# Collect, for every hyperparameter name, the values found by each study
accumulator = defaultdict(list)
for params in loaded_params.values():
    for key, val in params.items():
        accumulator[key].append(val)

# Hyperparameters whose average is rounded to an integer
int_keys = {"max_depth", "n_estimators"}

# Average across studies; the result parameterises the multivariate model
xgb_params = {}
for k, v in accumulator.items():
    mean_value = np.mean(v)
    if k in int_keys:
        xgb_params[k] = int(np.round(mean_value))
    else:
        xgb_params[k] = float(mean_value)

print("Averaged parameters:", xgb_params)


In [None]:
# Single-output XGBoost regressor configured with the averaged hyperparameters
base_model = XGBRegressor(**xgb_params)

# Multi-output wrapper around the base regressor
multi_output_model = MultiOutputRegressor(estimator=base_model)

In [None]:
# Start from an empty metrics table; the lag sweep below appends one row
# to it per (na, nb, nf) combination.
metrics_df_Xgboost_3dof_ckeck_eta_vel_acc_ver2 = pd.DataFrame()

In [None]:
# Columns and lag orders swept for the multivariate model
input_cols = ['eta', 'eta_velocity', 'eta_acceleration']
output_cols = ['heave', 'pitch', 'pendulum']
na_max = [2]               # output lags
nb_max = list(range(6))    # past input lags: 0..5
nf_max = list(range(6))    # future input lags: 0..5


No version suffix = hyperparameters from the $\eta$-only study;
`ver2` = hyperparameters from the corresponding study.

In [None]:
# Lag sweep for the 3-DoF multivariate model: for every (na, nb, nf)
# combination build the lagged data, fit a MultiOutputRegressor, compute
# train/validation R² and MSE per output in original units, append one metrics
# row and save the fitted model with joblib.

# Base save folder for the fitted models
save_folder = 'Xgboost/saved_models/3dof/check_eta_Vel_acc'

# Create the directory if it doesn't exist
os.makedirs(save_folder, exist_ok=True)

print(input_cols)

# Loop over all lag combinations
for na in na_max :
    for nb in nb_max :
      for nf in nf_max:


        # Model name encodes the lag configuration; also used as the file name
        model_name = F'Xgboost_3dof_eta_vel_acc_ver2'  + '_na' + str(na) + '_nb' + str(nb) + '_nf' + str(nf)


        # Fresh multi-output wrapper around the shared base estimator
        model = MultiOutputRegressor(base_model)

        # Prepare training data (lags padded with zeros)
        print(f'-----Preprocessing case of na={str(na)} ,nb={str(nb)} and nf={str(nf)} ----')


        dfx_train,dfy_train,yi_train = build_arx_lagged_with_scalers(
                df = df_case_train,
                input_cols  = input_cols,
                output_cols   = output_cols,
                scaler_X_func   = scaler_X_func_all ,
                scaler_y_func   = scaler_y_func,
                na=na,
                nb_past=nb,
                nf_future=nf,
                test_name_col='test_name',
                y_initial_mode='zero'  # 'original' ➔ skip initial rows, 'zero' ➔ pad lags with zeros
        )

        y_train_target = dfy_train[output_cols].reset_index(drop=True)

        # Drop the bookkeeping column so only model features remain
        X_train=dfx_train.drop(columns='test_name')

        # Prepare validation data (original initial values)


        dfx_val,dfy_val,yi_val = build_arx_lagged_with_scalers(
                df = df_case_val,
                input_cols  = input_cols,
                output_cols   = output_cols,
                scaler_X_func   = scaler_X_func_all,
                scaler_y_func   = scaler_y_func,
                na=na,
                nb_past=nb,
                nf_future=nf,
                test_name_col='test_name',
                y_initial_mode='original'  # 'original' ➔ skip initial rows, 'zero' ➔ pad lags with zeros
        )

        y_val_target = dfy_val[output_cols].reset_index(drop=True)

        X_val=dfx_val.drop(columns='test_name')


        # Feature names in training order (not used again inside this loop)
        X_feature_names = X_train.columns.tolist()

        # Fit the model on the training data
        print(f'-----Training model----')

        model.fit(X_train  ,y_train_target )


        print(f'-----predicting----')

        # One-step-ahead predictions: the lag features come from the data,
        # not from earlier predictions

        y_pred_train_scaled=model.predict(X_train)

        y_pred_val_scaled=model.predict(X_val)

        # Inverse transform to original scale
        y_pred_train = scaler_y.inverse_transform(y_pred_train_scaled)
        y_pred_val = scaler_y.inverse_transform(y_pred_val_scaled)

        # Convert predictions to DataFrames for easier handling
        y_pred_train_df = pd.DataFrame(y_pred_train, columns=output_cols)
        y_pred_val_df = pd.DataFrame(y_pred_val, columns=output_cols)


        # True values in original units, row-aligned with the predictions
        y_true_train =scaler_y.inverse_transform(dfy_train[output_cols].reset_index(drop=True))
        y_true_train_df = pd.DataFrame(y_true_train, columns=output_cols)

        y_true_val = scaler_y.inverse_transform(dfy_val[output_cols].reset_index(drop=True))
        y_true_val_df = pd.DataFrame(y_true_val, columns=output_cols)



        # Identification columns of this run's metrics row
        metrics_row = {
        'model_name': model_name,
        'na': na,
        'nb': nb,
        'nf': nf,
        'test case': case,
        }

        print(f'-----Eavluating model----')
        # Warn (but continue) if the training predictions contain NaN/inf
        if np.any(np.isnan(y_pred_train_df)) or np.any(np.isinf(y_pred_train_df)):
            print("[WARN] Train predictions have NaNs or infs.")

        # Per-output metrics on train and validation data
        for col in output_cols:
            mse_train = mean_squared_error(y_true_train_df[col], y_pred_train_df[col])
            r2_train = r2_score(y_true_train_df[col], y_pred_train_df[col])

            mse_val = mean_squared_error(y_true_val_df[col], y_pred_val_df[col])
            r2_val = r2_score(y_true_val_df[col], y_pred_val_df[col])

            # Add to metrics row
            metrics_row[f'r2_train_{col}'] = r2_train
            metrics_row[f'mse_train_{col}'] = mse_train
            metrics_row[f'r2_val_{col}'] = r2_val
            metrics_row[f'mse_val_{col}'] = mse_val

        # Append the row right away so partial results survive an interrupted sweep
        metrics_df_Xgboost_3dof_ckeck_eta_vel_acc_ver2 = pd.concat([metrics_df_Xgboost_3dof_ckeck_eta_vel_acc_ver2, pd.DataFrame([metrics_row])], ignore_index=True)
        print(f'-----Saving model----')
        # Save the trained model under its model_name
        model_save_name = f"{model_name}.joblib"
        model_save_path = os.path.join(save_folder, model_save_name)

        # Save with joblib
        joblib.dump(model, model_save_path)




In [None]:
# Persist the sweep metrics as CSV.
# Target folder (created on demand) ...
save_folder = "Xgboost/metrics_outpus/3dof"
os.makedirs(save_folder, exist_ok=True)

# ... and the file inside it
file_path = os.path.join(save_folder, "metrics_df_Xgboost_3dof_ckeck_eta_vel_acc_ver2.csv")
metrics_df_Xgboost_3dof_ckeck_eta_vel_acc_ver2.to_csv(path_or_buf=file_path, index=False)

In [None]:
# Load the saved metrics tables of every input-feature scenario
_metrics_dir = 'Xgboost/metrics_outpus/3dof/'

# eta only
metrics_df_Xgboost_3dof_ckeck_eta_only = pd.read_csv(_metrics_dir + 'metrics_df_Xgboost_3dof_ckeck_eta_only.csv')

# eta + velocity
metrics_df_Xgboost_3dof_ckeck_eta_vel_ver2 = pd.read_csv(_metrics_dir + 'metrics_df_Xgboost_3dof_ckeck_eta_vel_ver2.csv')
metrics_df_Xgboost_3dof_ckeck_eta_vel = pd.read_csv(_metrics_dir + 'metrics_df_Xgboost_3dof_ckeck_eta_vel.csv')

# eta + velocity + acceleration
metrics_df_Xgboost_3dof_ckeck_eta_vel_acc_ver2 = pd.read_csv(_metrics_dir + 'metrics_df_Xgboost_3dof_ckeck_eta_vel_acc_ver2.csv')
metrics_df_Xgboost_3dof_ckeck_eta_vel_acc = pd.read_csv(_metrics_dir + 'metrics_df_Xgboost_3dof_ckeck_eta_vel_acc.csv')


In [None]:
# Plotting for the case of hyperparameters from the corresponding study.
# For every output variable, draw validation R² against nf in three side-by-side
# panels (eta only / eta_vel_ver2 / eta_vel_acc_ver2 metrics tables), one line
# per nb value, then save the figure as PNG and show it.

target_values= ['heave', 'pitch', 'pendulum']

# One figure per target output variable
for target_var in  target_values : # iterates over 'heave', 'pitch' and 'pendulum'

    df_plot_1 = metrics_df_Xgboost_3dof_ckeck_eta_only.copy()
    df_plot_2 = metrics_df_Xgboost_3dof_ckeck_eta_vel_ver2.copy()
    df_plot_3 = metrics_df_Xgboost_3dof_ckeck_eta_vel_acc_ver2.copy()

    # Values to iterate over; the nb values are taken from the eta-only table
    na_values = [2]
    nb_values = sorted(df_plot_1['nb'].unique())

    # Color map to keep consistent colors for each nb across the three panels
    color_map = {nb: px.colors.qualitative.Plotly[i % len(px.colors.qualitative.Plotly)] for i, nb in enumerate(nb_values)}

    for na_val in na_values:
        df_na_1 = df_plot_1[df_plot_1['na'] == na_val].copy()
        df_na_2 = df_plot_2[df_plot_2['na'] == na_val].copy()
        df_na_3 = df_plot_3[df_plot_3['na'] == na_val].copy()

        fig = make_subplots(
        rows=1, cols=3,
        subplot_titles=[
        f"R² Val: η only (na={na_val})", 
        f"R² Val: η + ẋ (na={na_val})", 
        f"R² Val: η + ẋ + ẍ (na={na_val})"
        ],
        shared_yaxes=True
        )


        for nb_val in nb_values:
            # Rows of this nb, ordered by nf so the line is drawn left to right
            df_nb_1 = df_na_1[df_na_1['nb'] == nb_val].copy().sort_values(by='nf')
            df_nb_2 = df_na_2[df_na_2['nb'] == nb_val].copy().sort_values(by='nf')
            df_nb_3 = df_na_3[df_na_3['nb'] == nb_val].copy().sort_values(by='nf')

            color = color_map[nb_val]

            # Add val R² for eta only (this trace carries the legend entry)
            fig.add_trace(
                go.Scatter(
                    x=df_nb_1['nf'],
                    y=df_nb_1[f'r2_val_{target_var}'],
                    mode='lines+markers',
                    name=f'nb={nb_val}',
                    legendgroup=f'nb={nb_val}',
                    line=dict(color=color)
                ),
                row=1, col=1
            )

            # Add val R² for eta + vel (legend entry hidden, same legend group)
            fig.add_trace(
                go.Scatter(
                    x=df_nb_2['nf'],
                    y=df_nb_2[f'r2_val_{target_var}'],
                    mode='lines+markers',
                    name=f'nb={nb_val}',
                    legendgroup=f'nb={nb_val}',
                    showlegend=False,
                    line=dict(color=color)
                ),
                row=1, col=2
            )

            # Add val R² for eta + vel + acc (legend entry hidden, same legend group)
            fig.add_trace(
                go.Scatter(
                    x=df_nb_3['nf'],
                    y=df_nb_3[f'r2_val_{target_var}'],
                    mode='lines+markers',
                    name=f'nb={nb_val}',
                    legendgroup=f'nb={nb_val}',
                    showlegend=False,
                    line=dict(color=color)
                ),
                row=1, col=3
            )


        # NOTE(review): the title says "η Only Tuned Hyperparameters", but panels
        # 2 and 3 plot the *_ver2 tables and this cell's header comment says
        # "corresponding study" — confirm which wording is intended.
        fig.update_layout(
            title_text = f"{target_var} Validation R² for the Multivariate Model Using η Only Tuned Hyperparameters (na = {na_val})",

            height=500,
            width=1100,
            template="plotly_white",
            legend_title="nb (Input Lag)"
            )



        # Create the directory if it doesn't exist
        save_dir = "Results/XGBoostHyper/3dof"
        os.makedirs(save_dir, exist_ok=True)

        # File name depends on target_var only, so every na_val writes to the
        # same file (harmless while na_values has a single entry)
        filename = f"{target_var}.png"
        save_path = os.path.join(save_dir, filename)

        # Save figure with same width & height as the layout
        pio.write_image(fig, save_path, format='png', width=1100, height=500)


        fig.show()


In [None]:
# Plot validation R² of the multivariate (3-DOF) models for the runs that use
# the hyperparameters tuned for each corresponding input set (the "_ver2"
# metrics frames), one figure per output variable.
#
# FIX: this cell is titled "Corresponding Hyperparameters" and saves to
# "<target>_ver2.png", but it used to plot the non-"_ver2" frames, which the
# uni-variate cells label as the "η Only Tuned Hyperparameters" runs. The
# frames now match the title/filename, as in the heave and pitch sections.
# NOTE(review): confirm that the "_ver2" frames are the intended data here.

target_values = ['heave', 'pitch', 'pendulum']

for target_var in target_values:

    # Column 1: eta only, column 2: eta + velocity, column 3: eta + velocity + acceleration
    df_plot_1 = metrics_df_Xgboost_3dof_ckeck_eta_only.copy()
    df_plot_2 = metrics_df_Xgboost_3dof_ckeck_eta_vel_ver2.copy()
    df_plot_3 = metrics_df_Xgboost_3dof_ckeck_eta_vel_acc_ver2.copy()

    # Only na = 2 is plotted; the nb values are taken from the eta-only frame
    na_values = [2]
    nb_values = sorted(df_plot_1['nb'].unique())

    # Fixed colour per nb so the same nb looks identical in all three panels
    color_map = {nb: px.colors.qualitative.Plotly[i % len(px.colors.qualitative.Plotly)] for i, nb in enumerate(nb_values)}

    for na_val in na_values:
        df_na_1 = df_plot_1[df_plot_1['na'] == na_val].copy()
        df_na_2 = df_plot_2[df_plot_2['na'] == na_val].copy()
        df_na_3 = df_plot_3[df_plot_3['na'] == na_val].copy()

        fig = make_subplots(
            rows=1, cols=3,
            subplot_titles=[
                f"R² Val: η only (na={na_val})",
                f"R² Val: η + ẋ (na={na_val})",
                f"R² Val: η + ẋ + ẍ (na={na_val})"
            ],
            shared_yaxes=True
        )

        for nb_val in nb_values:
            # One line per nb, with nf on the x axis
            df_nb_1 = df_na_1[df_na_1['nb'] == nb_val].copy().sort_values(by='nf')
            df_nb_2 = df_na_2[df_na_2['nb'] == nb_val].copy().sort_values(by='nf')
            df_nb_3 = df_na_3[df_na_3['nb'] == nb_val].copy().sort_values(by='nf')

            color = color_map[nb_val]

            # Validation R² for eta only (this trace carries the legend entry)
            fig.add_trace(
                go.Scatter(
                    x=df_nb_1['nf'],
                    y=df_nb_1[f'r2_val_{target_var}'],
                    mode='lines+markers',
                    name=f'nb={nb_val}',
                    legendgroup=f'nb={nb_val}',
                    line=dict(color=color)
                ),
                row=1, col=1
            )

            # Validation R² for eta + velocity (legend entry suppressed, same group)
            fig.add_trace(
                go.Scatter(
                    x=df_nb_2['nf'],
                    y=df_nb_2[f'r2_val_{target_var}'],
                    mode='lines+markers',
                    name=f'nb={nb_val}',
                    legendgroup=f'nb={nb_val}',
                    showlegend=False,
                    line=dict(color=color)
                ),
                row=1, col=2
            )

            # Validation R² for eta + velocity + acceleration
            fig.add_trace(
                go.Scatter(
                    x=df_nb_3['nf'],
                    y=df_nb_3[f'r2_val_{target_var}'],
                    mode='lines+markers',
                    name=f'nb={nb_val}',
                    legendgroup=f'nb={nb_val}',
                    showlegend=False,
                    line=dict(color=color)
                ),
                row=1, col=3
            )

        fig.update_layout(
            title_text=f"{target_var} Validation R² for the Multivariate Model Using Corresponding Hyperparameters (na = {na_val})",
            height=500,
            width=1100,
            template="plotly_white",
            legend_title="nb (Input Lag)"
        )

        fig.update_xaxes(title_text="nf (Disturbance Lags)", row=1, col=1)
        fig.update_xaxes(title_text="nf (Disturbance Lags)", row=1, col=2)
        fig.update_xaxes(title_text="nf (Disturbance Lags)", row=1, col=3)
        fig.update_yaxes(title_text="R² Score", row=1, col=1)

        # Create the output directory if it doesn't exist
        save_dir = "Results/XGBoostHyper/3dof"
        os.makedirs(save_dir, exist_ok=True)

        filename = f"{target_var}_ver2.png"
        save_path = os.path.join(save_dir, filename)

        # Export with the same width and height as the on-screen layout
        pio.write_image(fig, save_path, format='png', width=1100, height=500)

        fig.show()


## Uni-variate Model

### 1-Heave

In [None]:
# Load the best-parameter dictionary stored in the eta_Vel_acc JSON file.
# The loaded object is indexed by output name; the 'heave' entry is printed
# here as a quick check of what was read.
with open("Xgboost/hyperparameters/eta_Vel_acc/best_params_eta_Vel_acc.json", "r") as f:
    loaded_params = json.load(f)

print("Loaded params:", loaded_params['heave'])

In [None]:
# Settings for the heave-only sweep: wave inputs (eta and its first two
# derivatives), a single output, and the lag grid iterated by the training cell.
input_cols = ['eta', 'eta_velocity', 'eta_acceleration']
output_cols = ['heave']

# na is fixed at 2; nb and nf each run over 0..7
na_max = [2]
nb_max = list(range(8))
nf_max = list(range(8))

# Hyperparameters passed to XGBRegressor for every model in the sweep
best_params = loaded_params['heave']

In [None]:
# Start from an empty metrics table; the training loop appends one row per
# (na, nb, nf) combination. Re-running this cell discards rows already collected.
metrics_df_Xgboost_heave_ckeck_eta_vel_acc_ver2 = pd.DataFrame()

In [None]:
# Heave-only sweep: for every (na, nb, nf) lag combination, build the lagged
# ARX features, fit an XGBRegressor with `best_params`, score it on the train
# and validation sets in the original (unscaled) units, append the scores to
# the metrics table and dump the fitted model with joblib.

# Folder that receives one .joblib file per trained model
# NOTE(review): the folder name contains a double underscore ("vel__acc");
# left unchanged because other code may load models from this exact path.
save_folder = 'Xgboost/saved_models/heave/check_eta_vel__acc_ver2'
os.makedirs(save_folder, exist_ok=True)

print(input_cols)

for na in na_max:
    for nb in nb_max:
        for nf in nf_max:

            # Unique name encoding the lag configuration
            model_name = f'Xgboost_heave_only_eta_vel_acc_ver2_na{na}_nb{nb}_nf{nf}'

            # Fresh, unfitted model for this configuration
            model = XGBRegressor(**best_params)

            print(f'-----Preprocessing case of na={str(na)} ,nb={str(nb)} and nf={str(nf)} ----')

            # Arguments shared by the train and validation feature builders
            arx_kwargs = dict(
                input_cols=input_cols,
                output_cols=output_cols,
                scaler_X_func=scaler_X_func_all,
                scaler_y_func=scaler_y_func_heave,
                na=na,
                nb_past=nb,
                nf_future=nf,
                test_name_col='test_name',
            )

            # Training set: 'zero' pads the initial output lags with zeros
            dfx_train, dfy_train, yi_train = build_arx_lagged_with_scalers(
                df=df_case_train,
                y_initial_mode='zero',
                **arx_kwargs,
            )
            y_train_target = dfy_train[output_cols].reset_index(drop=True)
            X_train = dfx_train.drop(columns='test_name')

            # Validation set: 'original' skips the initial rows instead of padding
            dfx_val, dfy_val, yi_val = build_arx_lagged_with_scalers(
                df=df_case_val,
                y_initial_mode='original',
                **arx_kwargs,
            )
            y_val_target = dfy_val[output_cols].reset_index(drop=True)
            X_val = dfx_val.drop(columns='test_name')

            # Ordered feature names as seen by the model
            X_feature_names = X_train.columns.tolist()

            print(f'-----Training model----')
            model.fit(X_train, y_train_target)

            print(f'-----predicting----')
            # One-step predictions in scaled units, as column vectors
            y_pred_train_scaled = model.predict(X_train).reshape(-1, 1)
            y_pred_val_scaled = model.predict(X_val).reshape(-1, 1)

            # Back to the original scale
            y_pred_train = scaler_y_heave.inverse_transform(y_pred_train_scaled)
            y_pred_val = scaler_y_heave.inverse_transform(y_pred_val_scaled)
            y_pred_train_df = pd.DataFrame(y_pred_train, columns=output_cols)
            y_pred_val_df = pd.DataFrame(y_pred_val, columns=output_cols)

            # Ground truth, un-scaled the same way and positionally aligned
            y_true_train = scaler_y_heave.inverse_transform(
                dfy_train[output_cols].reset_index(drop=True)
            )
            y_true_train_df = pd.DataFrame(y_true_train, columns=output_cols)

            y_true_val = scaler_y_heave.inverse_transform(
                dfy_val[output_cols].reset_index(drop=True)
            )
            y_true_val_df = pd.DataFrame(y_true_val, columns=output_cols)

            # Identification columns of the metrics row
            metrics_row = {
                'model_name': model_name,
                'na': na,
                'nb': nb,
                'nf': nf,
                'test case': case,
            }

            print(f'-----Eavluating model----')
            # Only warn on non-finite train predictions; scoring still runs
            if np.any(np.isnan(y_pred_train_df)) or np.any(np.isinf(y_pred_train_df)):
                print("[WARN] Train predictions have NaNs or infs.")

            for col in output_cols:
                mse_train = mean_squared_error(y_true_train_df[col], y_pred_train_df[col])
                r2_train = r2_score(y_true_train_df[col], y_pred_train_df[col])

                mse_val = mean_squared_error(y_true_val_df[col], y_pred_val_df[col])
                r2_val = r2_score(y_true_val_df[col], y_pred_val_df[col])

                # Insertion order fixes the column order of the metrics table
                metrics_row.update({
                    f'r2_train_{col}': r2_train,
                    f'mse_train_{col}': mse_train,
                    f'r2_val_{col}': r2_val,
                    f'mse_val_{col}': mse_val,
                })

            # Append this configuration's scores as a new row
            metrics_df_Xgboost_heave_ckeck_eta_vel_acc_ver2 = pd.concat(
                [metrics_df_Xgboost_heave_ckeck_eta_vel_acc_ver2, pd.DataFrame([metrics_row])],
                ignore_index=True,
            )

            print(f'-----Saving model----')
            # Persist the fitted model under its configuration name
            model_save_name = f"{model_name}.joblib"
            model_save_path = os.path.join(save_folder, model_save_name)
            joblib.dump(model, model_save_path)




In [None]:
# Persist the heave sweep metrics to CSV so they can be reloaded for plotting
# without re-training.
# NOTE: `save_folder` is reassigned here (the training cell used it for the
# model folder); re-run the training cell's first lines before saving models again.
save_folder = "Xgboost/metrics_outpus/heave"

# Create the folder if it doesn't exist
os.makedirs(save_folder, exist_ok=True)
# Full path of the CSV file
file_path = os.path.join(save_folder, "metrics_df_Xgboost_heave_ckeck_eta_vel_acc_ver2.csv")
# Write without the index so the file round-trips cleanly through read_csv
metrics_df_Xgboost_heave_ckeck_eta_vel_acc_ver2.to_csv(file_path, index=False)

In [None]:
# Reload the saved heave metrics tables used by the plotting cells below.
# The plotting cells pair the non-"_ver2" frames with the "η Only Tuned
# Hyperparameters" title and the "_ver2" frames with the "Corresponding
# Hyperparameters" title.

# eta only
metrics_df_Xgboost_heave_ckeck_eta_only= pd.read_csv('Xgboost/metrics_outpus/heave/metrics_df_Xgboost_heave_ckeck_eta_only.csv')

# eta + velocity
metrics_df_Xgboost_heave_ckeck_eta_vel= pd.read_csv('Xgboost/metrics_outpus/heave/metrics_df_Xgboost_heave_ckeck_eta_vel.csv')
metrics_df_Xgboost_heave_ckeck_eta_vel_ver2= pd.read_csv('Xgboost/metrics_outpus/heave/metrics_df_Xgboost_heave_ckeck_eta_vel_ver2.csv')

# eta + velocity + acceleration
metrics_df_Xgboost_heave_ckeck_eta_vel_acc= pd.read_csv('Xgboost/metrics_outpus/heave/metrics_df_Xgboost_heave_ckeck_eta_vel_acc.csv')
metrics_df_Xgboost_heave_ckeck_eta_vel_acc_ver2= pd.read_csv('Xgboost/metrics_outpus/heave/metrics_df_Xgboost_heave_ckeck_eta_vel_acc_ver2.csv')

In [None]:
# Heave, uni-variate model: validation R² versus nf for the runs that reuse the
# hyperparameters from the eta-only study. One panel per input set, one line
# per nb value.
target_values = ['heave']

for target_var in target_values:

    # Panel 1: eta only, panel 2: eta + velocity, panel 3: eta + velocity + acceleration
    df_plot_1 = metrics_df_Xgboost_heave_ckeck_eta_only.copy()
    df_plot_2 = metrics_df_Xgboost_heave_ckeck_eta_vel.copy()
    df_plot_3 = metrics_df_Xgboost_heave_ckeck_eta_vel_acc.copy()

    # Only na = 2 is drawn; the nb values come from the eta-only table
    na_values = [2]
    nb_values = sorted(df_plot_1['nb'].unique())

    # Same colour for a given nb in every panel (palette wraps around if needed)
    color_map = {
        nb: px.colors.qualitative.Plotly[i % len(px.colors.qualitative.Plotly)]
        for i, nb in enumerate(nb_values)
    }

    for na_val in na_values:
        # Restrict each table to the current na
        df_na_1, df_na_2, df_na_3 = (
            frame[frame['na'] == na_val].copy()
            for frame in (df_plot_1, df_plot_2, df_plot_3)
        )

        fig = make_subplots(
            rows=1,
            cols=3,
            subplot_titles=[
                f"R² Val: η only (na={na_val})",
                f"R² Val: η + ẋ (na={na_val})",
                f"R² Val: η + ẋ + ẍ (na={na_val})",
            ],
            shared_yaxes=True,
        )

        for nb_val in nb_values:
            # Rows for this nb, ordered along the x axis (nf)
            df_nb_1, df_nb_2, df_nb_3 = (
                frame[frame['nb'] == nb_val].copy().sort_values(by='nf')
                for frame in (df_na_1, df_na_2, df_na_3)
            )

            color = color_map[nb_val]

            # One trace per panel; only the first panel's trace shows in the legend
            for col_idx, df_nb in enumerate((df_nb_1, df_nb_2, df_nb_3), start=1):
                scatter_kwargs = dict(
                    x=df_nb['nf'],
                    y=df_nb[f'r2_val_{target_var}'],
                    mode='lines+markers',
                    name=f'nb={nb_val}',
                    legendgroup=f'nb={nb_val}',
                    line=dict(color=color),
                )
                if col_idx > 1:
                    scatter_kwargs['showlegend'] = False
                fig.add_trace(go.Scatter(**scatter_kwargs), row=1, col=col_idx)

        fig.update_layout(
            title_text=f"{target_var} Validation R² for the Uni-variate Model Using η Only Tuned Hyperparameters (na = {na_val})",
            height=500,
            width=1100,
            template="plotly_white",
            legend_title="nb (Input Lag)",
        )

        # Axis titles: x on every panel, y only on the left-most one
        for col_idx in (1, 2, 3):
            fig.update_xaxes(title_text="nf (Disturbance Lags)", row=1, col=col_idx)
        fig.update_yaxes(title_text="R² Score", row=1, col=1)

        # Output location for the exported PNG
        save_dir = "Results/XGBoostHyper/1dof"
        os.makedirs(save_dir, exist_ok=True)

        filename = f"{target_var}.png"
        save_path = os.path.join(save_dir, filename)

        # Export at the same size as the on-screen layout
        pio.write_image(fig, save_path, format='png', width=1100, height=500)

        fig.show()


In [None]:
# Heave, uni-variate model: validation R² versus nf for the runs that use the
# hyperparameters from each input set's own study (the "_ver2" tables). One
# panel per input set, one line per nb value.
target_values = ['heave']

for target_var in target_values:

    # Panel 1: eta only, panel 2: eta + velocity, panel 3: eta + velocity + acceleration
    df_plot_1 = metrics_df_Xgboost_heave_ckeck_eta_only.copy()
    df_plot_2 = metrics_df_Xgboost_heave_ckeck_eta_vel_ver2.copy()
    df_plot_3 = metrics_df_Xgboost_heave_ckeck_eta_vel_acc_ver2.copy()

    # Only na = 2 is drawn; the nb values come from the eta-only table
    na_values = [2]
    nb_values = sorted(df_plot_1['nb'].unique())

    # Same colour for a given nb in every panel (palette wraps around if needed)
    color_map = {
        nb: px.colors.qualitative.Plotly[i % len(px.colors.qualitative.Plotly)]
        for i, nb in enumerate(nb_values)
    }

    for na_val in na_values:
        # Restrict each table to the current na
        df_na_1, df_na_2, df_na_3 = (
            frame[frame['na'] == na_val].copy()
            for frame in (df_plot_1, df_plot_2, df_plot_3)
        )

        fig = make_subplots(
            rows=1,
            cols=3,
            subplot_titles=[
                f"R² Val: η only (na={na_val})",
                f"R² Val: η + ẋ (na={na_val})",
                f"R² Val: η + ẋ + ẍ (na={na_val})",
            ],
            shared_yaxes=True,
        )

        for nb_val in nb_values:
            # Rows for this nb, ordered along the x axis (nf)
            df_nb_1, df_nb_2, df_nb_3 = (
                frame[frame['nb'] == nb_val].copy().sort_values(by='nf')
                for frame in (df_na_1, df_na_2, df_na_3)
            )

            color = color_map[nb_val]

            # One trace per panel; only the first panel's trace shows in the legend
            for col_idx, df_nb in enumerate((df_nb_1, df_nb_2, df_nb_3), start=1):
                scatter_kwargs = dict(
                    x=df_nb['nf'],
                    y=df_nb[f'r2_val_{target_var}'],
                    mode='lines+markers',
                    name=f'nb={nb_val}',
                    legendgroup=f'nb={nb_val}',
                    line=dict(color=color),
                )
                if col_idx > 1:
                    scatter_kwargs['showlegend'] = False
                fig.add_trace(go.Scatter(**scatter_kwargs), row=1, col=col_idx)

        fig.update_layout(
            title_text=f"{target_var} Validation R² for the Uni-variate Model Using Corresponding Hyperparameters (na = {na_val})",
            height=500,
            width=1100,
            template="plotly_white",
            legend_title="nb (Input Lag)",
        )

        # Axis titles: x on every panel, y only on the left-most one
        for col_idx in (1, 2, 3):
            fig.update_xaxes(title_text="nf (Disturbance Lags)", row=1, col=col_idx)
        fig.update_yaxes(title_text="R² Score", row=1, col=1)

        # Output location for the exported PNG
        save_dir = "Results/XGBoostHyper/1dof"
        os.makedirs(save_dir, exist_ok=True)

        filename = f"{target_var}_ver2.png"
        save_path = os.path.join(save_dir, filename)

        # Export at the same size as the on-screen layout
        pio.write_image(fig, save_path, format='png', width=1100, height=500)
        fig.show()


### 2-Pitch

In [None]:
# Load the best-parameter dictionary stored in the eta_Vel_acc JSON file.
# The loaded object is indexed by output name; the 'pitch' entry is printed
# here as a quick check of what was read.

with open("Xgboost/hyperparameters/eta_Vel_acc/best_params_eta_Vel_acc.json", "r") as f:
    loaded_params = json.load(f)

print("Loaded params:", loaded_params['pitch'])

In [None]:
# Settings for the pitch-only sweep: wave inputs (eta and its first two
# derivatives), a single output, and the lag grid iterated by the training cell.
input_cols = ['eta', 'eta_velocity', 'eta_acceleration']
output_cols = ['pitch']

# na is fixed at 2; nb and nf each run over 0..7
na_max = [2]
nb_max = list(range(8))
nf_max = list(range(8))

# Hyperparameters passed to XGBRegressor for every model in the sweep
best_params = loaded_params['pitch']

In [None]:
# Start from an empty metrics table; the training loop appends one row per
# (na, nb, nf) combination. Re-running this cell discards rows already collected.
metrics_df_Xgboost_pitch_ckeck_eta_Vel_acc_ver2 = pd.DataFrame()

In [None]:
# Pitch-only sweep: for every (na, nb, nf) lag combination, build the lagged
# ARX features, fit an XGBRegressor with `best_params`, score it on the train
# and validation sets in the original (unscaled) units, append the scores to
# the metrics table and dump the fitted model with joblib.

# Folder that receives one .joblib file per trained model
# NOTE(review): the folder is spelled "pich" (not "pitch"); left unchanged
# because other code may load models from this exact path.
save_folder = 'Xgboost/saved_models/pich/check_eta_vel_acc_ver2'
os.makedirs(save_folder, exist_ok=True)

print(input_cols)

for na in na_max:
    for nb in nb_max:
        for nf in nf_max:

            # Unique name encoding the lag configuration
            model_name = f'Xgboost_pitch_only_eta_vel_acc_ver2_na{na}_nb{nb}_nf{nf}'

            # Fresh, unfitted model for this configuration
            model = XGBRegressor(**best_params)

            print(f'-----Preprocessing case of na={str(na)} ,nb={str(nb)} and nf={str(nf)} ----')

            # Arguments shared by the train and validation feature builders
            arx_kwargs = dict(
                input_cols=input_cols,
                output_cols=output_cols,
                scaler_X_func=scaler_X_func_all,
                scaler_y_func=scaler_y_func_pitch,
                na=na,
                nb_past=nb,
                nf_future=nf,
                test_name_col='test_name',
            )

            # Training set: 'zero' pads the initial output lags with zeros
            dfx_train, dfy_train, yi_train = build_arx_lagged_with_scalers(
                df=df_case_train,
                y_initial_mode='zero',
                **arx_kwargs,
            )
            y_train_target = dfy_train[output_cols].reset_index(drop=True)
            X_train = dfx_train.drop(columns='test_name')

            # Validation set: 'original' skips the initial rows instead of padding
            dfx_val, dfy_val, yi_val = build_arx_lagged_with_scalers(
                df=df_case_val,
                y_initial_mode='original',
                **arx_kwargs,
            )
            y_val_target = dfy_val[output_cols].reset_index(drop=True)
            X_val = dfx_val.drop(columns='test_name')

            # Ordered feature names as seen by the model
            X_feature_names = X_train.columns.tolist()

            print(f'-----Training model----')
            model.fit(X_train, y_train_target)

            print(f'-----predicting----')
            # One-step predictions in scaled units, as column vectors
            y_pred_train_scaled = model.predict(X_train).reshape(-1, 1)
            y_pred_val_scaled = model.predict(X_val).reshape(-1, 1)

            # Back to the original scale
            y_pred_train = scaler_y_pitch.inverse_transform(y_pred_train_scaled)
            y_pred_val = scaler_y_pitch.inverse_transform(y_pred_val_scaled)
            y_pred_train_df = pd.DataFrame(y_pred_train, columns=output_cols)
            y_pred_val_df = pd.DataFrame(y_pred_val, columns=output_cols)

            # Ground truth, un-scaled the same way and positionally aligned
            y_true_train = scaler_y_pitch.inverse_transform(
                dfy_train[output_cols].reset_index(drop=True)
            )
            y_true_train_df = pd.DataFrame(y_true_train, columns=output_cols)

            y_true_val = scaler_y_pitch.inverse_transform(
                dfy_val[output_cols].reset_index(drop=True)
            )
            y_true_val_df = pd.DataFrame(y_true_val, columns=output_cols)

            # Identification columns of the metrics row
            metrics_row = {
                'model_name': model_name,
                'na': na,
                'nb': nb,
                'nf': nf,
                'test case': case,
            }

            print(f'-----Eavluating model----')
            # Only warn on non-finite train predictions; scoring still runs
            if np.any(np.isnan(y_pred_train_df)) or np.any(np.isinf(y_pred_train_df)):
                print("[WARN] Train predictions have NaNs or infs.")

            for col in output_cols:
                mse_train = mean_squared_error(y_true_train_df[col], y_pred_train_df[col])
                r2_train = r2_score(y_true_train_df[col], y_pred_train_df[col])

                mse_val = mean_squared_error(y_true_val_df[col], y_pred_val_df[col])
                r2_val = r2_score(y_true_val_df[col], y_pred_val_df[col])

                # Insertion order fixes the column order of the metrics table
                metrics_row.update({
                    f'r2_train_{col}': r2_train,
                    f'mse_train_{col}': mse_train,
                    f'r2_val_{col}': r2_val,
                    f'mse_val_{col}': mse_val,
                })

            # Append this configuration's scores as a new row
            metrics_df_Xgboost_pitch_ckeck_eta_Vel_acc_ver2 = pd.concat(
                [metrics_df_Xgboost_pitch_ckeck_eta_Vel_acc_ver2, pd.DataFrame([metrics_row])],
                ignore_index=True,
            )

            print(f'-----Saving model----')
            # Persist the fitted model under its configuration name
            model_save_name = f"{model_name}.joblib"
            model_save_path = os.path.join(save_folder, model_save_name)
            joblib.dump(model, model_save_path)




In [None]:
# Persist the pitch sweep metrics to CSV so they can be reloaded for plotting
# without re-training.
# NOTE: `save_folder` is reassigned here (the training cell used it for the
# model folder).
save_folder = "Xgboost/metrics_outpus/pitch"

# Create the folder if it doesn't exist
os.makedirs(save_folder, exist_ok=True)

# FIX: the file name uses lowercase "vel" so it matches the name the loading
# cell passes to pd.read_csv ("..._eta_vel_acc_ver2.csv"). The previous
# "eta_Vel" spelling only round-tripped on case-insensitive filesystems and
# raised FileNotFoundError on case-sensitive ones.
file_path = os.path.join(save_folder, "metrics_df_Xgboost_pitch_ckeck_eta_vel_acc_ver2.csv")

# Write without the index so the file round-trips cleanly through read_csv
metrics_df_Xgboost_pitch_ckeck_eta_Vel_acc_ver2.to_csv(file_path, index=False)

In [None]:
# Reload the saved pitch metrics tables used by the plotting cells below.
# The plotting cells pair the non-"_ver2" frames with the "η Only Tuned
# Hyperparameters" title and the "_ver2" frames with the "Corresponding
# Hyperparameters" title.
# NOTE: the "_ver2" variable names use "Vel" while the files on disk are read
# with lowercase "vel".

# eta only
metrics_df_Xgboost_pitch_ckeck_eta_only= pd.read_csv("Xgboost/metrics_outpus/pitch/metrics_df_Xgboost_pitch_ckeck_eta_only.csv")

# eta + velocity
metrics_df_Xgboost_pitch_ckeck_eta_vel= pd.read_csv("Xgboost/metrics_outpus/pitch/metrics_df_Xgboost_pitch_ckeck_eta_vel.csv")

# eta + velocity + acceleration
metrics_df_Xgboost_pitch_ckeck_eta_vel_acc= pd.read_csv("Xgboost/metrics_outpus/pitch/metrics_df_Xgboost_pitch_ckeck_eta_vel_acc.csv")

# eta + velocity, "_ver2" run
metrics_df_Xgboost_pitch_ckeck_eta_Vel_ver2= pd.read_csv("Xgboost/metrics_outpus/pitch/metrics_df_Xgboost_pitch_ckeck_eta_vel_ver2.csv")

# eta + velocity + acceleration, "_ver2" run
metrics_df_Xgboost_pitch_ckeck_eta_Vel_acc_ver2= pd.read_csv("Xgboost/metrics_outpus/pitch/metrics_df_Xgboost_pitch_ckeck_eta_vel_acc_ver2.csv")
    


In [None]:
# Pitch, uni-variate model: validation R² versus nf for the runs that reuse the
# hyperparameters from the eta-only study. One panel per input set, one line
# per nb value, with the y axis zoomed to [0.994, 1].
target_values = ['pitch']

for target_var in target_values:

    # Panel 1: eta only, panel 2: eta + velocity, panel 3: eta + velocity + acceleration
    df_plot_1 = metrics_df_Xgboost_pitch_ckeck_eta_only.copy()
    df_plot_2 = metrics_df_Xgboost_pitch_ckeck_eta_vel.copy()
    df_plot_3 = metrics_df_Xgboost_pitch_ckeck_eta_vel_acc.copy()

    # Only na = 2 is drawn; the nb values come from the eta-only table
    na_values = [2]
    nb_values = sorted(df_plot_1['nb'].unique())

    # Same colour for a given nb in every panel (palette wraps around if needed)
    color_map = {
        nb: px.colors.qualitative.Plotly[i % len(px.colors.qualitative.Plotly)]
        for i, nb in enumerate(nb_values)
    }

    for na_val in na_values:
        # Restrict each table to the current na
        df_na_1, df_na_2, df_na_3 = (
            frame[frame['na'] == na_val].copy()
            for frame in (df_plot_1, df_plot_2, df_plot_3)
        )

        fig = make_subplots(
            rows=1,
            cols=3,
            subplot_titles=[
                f"R² Val: η only (na={na_val})",
                f"R² Val: η + ẋ (na={na_val})",
                f"R² Val: η + ẋ + ẍ (na={na_val})",
            ],
            shared_yaxes=True,
        )

        for nb_val in nb_values:
            # Rows for this nb, ordered along the x axis (nf)
            df_nb_1, df_nb_2, df_nb_3 = (
                frame[frame['nb'] == nb_val].copy().sort_values(by='nf')
                for frame in (df_na_1, df_na_2, df_na_3)
            )

            color = color_map[nb_val]

            # One trace per panel; only the first panel's trace shows in the legend
            for col_idx, df_nb in enumerate((df_nb_1, df_nb_2, df_nb_3), start=1):
                scatter_kwargs = dict(
                    x=df_nb['nf'],
                    y=df_nb[f'r2_val_{target_var}'],
                    mode='lines+markers',
                    name=f'nb={nb_val}',
                    legendgroup=f'nb={nb_val}',
                    line=dict(color=color),
                )
                if col_idx > 1:
                    scatter_kwargs['showlegend'] = False
                fig.add_trace(go.Scatter(**scatter_kwargs), row=1, col=col_idx)

        fig.update_layout(
            title_text=f"{target_var} Validation R² for the Uni-variate Model Using η Only Tuned Hyperparameters (na = {na_val})",
            height=500,
            width=1100,
            template="plotly_white",
            legend_title="nb (Input Lag)",
        )

        # Axis titles: x on every panel, y only on the left-most one
        for col_idx in (1, 2, 3):
            fig.update_xaxes(title_text="nf (Disturbance Lags)", row=1, col=col_idx)
        fig.update_yaxes(title_text="R² Score", row=1, col=1)

        # Zoom every panel to the same narrow R² window
        for col_idx in (1, 2, 3):
            fig.update_yaxes(range=[0.994, 1], row=1, col=col_idx)

        # Output location for the exported PNG
        save_dir = "Results/XGBoostHyper/1dof"
        os.makedirs(save_dir, exist_ok=True)

        filename = f"{target_var}.png"
        save_path = os.path.join(save_dir, filename)

        # Export at the same size as the on-screen layout
        pio.write_image(fig, save_path, format='png', width=1100, height=500)

        fig.show()


In [None]:
# Validation R² versus nf for the pitch model in the "corresponding
# hyperparameters" case (the *_ver2 metrics tables), drawn as one panel per
# input-feature set and saved as a PNG.

target_values = ['pitch']

for target_var in target_values:  # 'heave', 'pitch', or 'pendulum'

    # Work on copies so the loaded metrics tables are left untouched.
    df_plot_1 = metrics_df_Xgboost_pitch_ckeck_eta_only.copy()
    df_plot_2 = metrics_df_Xgboost_pitch_ckeck_eta_Vel_ver2.copy()
    df_plot_3 = metrics_df_Xgboost_pitch_ckeck_eta_Vel_acc_ver2.copy()

    na_values = [2]
    nb_values = sorted(df_plot_1['nb'].unique())

    # Each nb keeps the same colour in all three panels.
    plotly_palette = px.colors.qualitative.Plotly
    color_map = {
        nb: plotly_palette[i % len(plotly_palette)]
        for i, nb in enumerate(nb_values)
    }
    r2_column = f'r2_val_{target_var}'

    for na_val in na_values:
        df_na_1 = df_plot_1[df_plot_1['na'] == na_val].copy()
        df_na_2 = df_plot_2[df_plot_2['na'] == na_val].copy()
        df_na_3 = df_plot_3[df_plot_3['na'] == na_val].copy()

        fig = make_subplots(
            rows=1,
            cols=3,
            subplot_titles=[
                f"R² Val: η only (na={na_val})",
                f"R² Val: η + ẋ (na={na_val})",
                f"R² Val: η + ẋ + ẍ (na={na_val})",
            ],
            shared_yaxes=True,
        )

        for nb_val in nb_values:
            df_nb_1 = df_na_1[df_na_1['nb'] == nb_val].copy().sort_values(by='nf')
            df_nb_2 = df_na_2[df_na_2['nb'] == nb_val].copy().sort_values(by='nf')
            df_nb_3 = df_na_3[df_na_3['nb'] == nb_val].copy().sort_values(by='nf')

            color = color_map[nb_val]
            trace_label = f'nb={nb_val}'

            # Panel 1: eta only, panel 2: eta + vel, panel 3: eta + vel + acc.
            for panel_idx, panel_df in enumerate((df_nb_1, df_nb_2, df_nb_3), start=1):
                scatter_kwargs = dict(
                    x=panel_df['nf'],
                    y=panel_df[r2_column],
                    mode='lines+markers',
                    name=trace_label,
                    legendgroup=trace_label,
                    line=dict(color=color),
                )
                # Only the first panel adds legend entries; the other panels
                # share them through the legend group.
                if panel_idx > 1:
                    scatter_kwargs['showlegend'] = False
                fig.add_trace(go.Scatter(**scatter_kwargs), row=1, col=panel_idx)

        fig.update_layout(
            title_text=f"{target_var} Validation R² for the Uni-variate Model Using Corresponding Hyperparameters (na = {na_val})",
            height=500,
            width=1100,
            template="plotly_white",
            legend_title="nb (Input Lag)",
        )

        for panel_idx in (1, 2, 3):
            fig.update_xaxes(title_text="nf (Disturbance Lags)", row=1, col=panel_idx)
        fig.update_yaxes(title_text="R² Score", row=1, col=1)

        # Make sure the output directory exists before exporting.
        save_dir = "Results/XGBoostHyper/1dof"
        os.makedirs(save_dir, exist_ok=True)

        filename = f"{target_var}_ver2.png"
        save_path = os.path.join(save_dir, filename)

        # Export at the same width and height as the layout.
        pio.write_image(fig, save_path, format='png', width=1100, height=500)

        fig.show()


### 3-Pendulum

In [None]:
# Read the stored hyperparameters from the eta_Vel JSON file and display the
# entry for the pendulum output.
params_path = "Xgboost/hyperparameters/eta_Vel/best_params_eta_Vel.json"
with open(params_path, "r") as f:
    loaded_params = json.load(f)

print("Loaded params:", loaded_params['pendulum'])

In [None]:
# Configuration of the pendulum lag sweep: model inputs/outputs and the lag
# orders (na, nb, nf) that the training loop iterates over.
input_cols = ['eta', 'eta_velocity']
output_cols = ['pendulum']
na_max = [2]
nb_max = list(range(8))  # 0 .. 7
nf_max = list(range(8))  # 0 .. 7

# Hyperparameters used for every model in the sweep.
best_params = loaded_params['pendulum']

In [None]:
# Start with an empty metrics table; the pendulum lag-sweep loop below appends
# one row per trained (na, nb, nf) model. Re-running this cell discards any
# rows collected so far.
metrics_df_Xgboost_pendulum_ckeck_eta_vel_ver2 = pd.DataFrame()

In [None]:
# Lag sweep for the pendulum model ("eta_vel_ver2"): for every (na, nb, nf)
# combination an XGBRegressor is fitted on the lagged training data, scored on
# the training and validation sets, logged to the metrics table and saved.
save_folder = 'Xgboost/saved_models/pendulum/check_eta_vel_ver2'
os.makedirs(save_folder, exist_ok=True)  # no-op when the folder already exists

print(input_cols)

for na in na_max:
    for nb in nb_max:
        for nf in nf_max:

            model_name = f'Xgboost_pendulum_only_eta_vel_ver2_na{na}_nb{nb}_nf{nf}'

            # A fresh, unfitted regressor for this lag combination.
            model = XGBRegressor(**best_params)

            print(f'-----Preprocessing case of na={na} ,nb={nb} and nf={nf} ----')

            # Arguments shared by the training and validation feature builders.
            lag_kwargs = dict(
                input_cols=input_cols,
                output_cols=output_cols,
                scaler_X_func=scaler_X_func_vel,
                scaler_y_func=scaler_y_func_pend,
                na=na,
                nb_past=nb,
                nf_future=nf,
                test_name_col='test_name',
            )

            # Training set: y_initial_mode='zero' pads the lags with zeros.
            dfx_train, dfy_train, yi_train = build_arx_lagged_with_scalers(
                df=df_case_train,
                y_initial_mode='zero',
                **lag_kwargs,
            )
            y_train_target = dfy_train[output_cols].reset_index(drop=True)
            X_train = dfx_train.drop(columns='test_name')

            # Validation set: y_initial_mode='original' skips the initial rows.
            dfx_val, dfy_val, yi_val = build_arx_lagged_with_scalers(
                df=df_case_val,
                y_initial_mode='original',
                **lag_kwargs,
            )
            y_val_target = dfy_val[output_cols].reset_index(drop=True)
            X_val = dfx_val.drop(columns='test_name')

            # Column order of the training features.
            X_feature_names = X_train.columns.tolist()

            print('-----Training model----')
            model.fit(X_train, y_train_target)

            print('-----predicting----')
            # The model outputs scaled values as a 1-D array; reshape to one
            # column so the y-scaler can invert them.
            y_pred_train_scaled = model.predict(X_train).reshape(-1, 1)
            y_pred_val_scaled = model.predict(X_val).reshape(-1, 1)

            # Back to the original scale.
            y_pred_train = scaler_y_pend.inverse_transform(y_pred_train_scaled)
            y_pred_val = scaler_y_pend.inverse_transform(y_pred_val_scaled)

            y_pred_train_df = pd.DataFrame(y_pred_train, columns=output_cols)
            y_pred_val_df = pd.DataFrame(y_pred_val, columns=output_cols)

            # Ground truth, un-scaled and re-indexed to line up with the
            # predictions.
            y_true_train = scaler_y_pend.inverse_transform(
                dfy_train[output_cols].reset_index(drop=True)
            )
            y_true_train_df = pd.DataFrame(y_true_train, columns=output_cols)

            y_true_val = scaler_y_pend.inverse_transform(
                dfy_val[output_cols].reset_index(drop=True)
            )
            y_true_val_df = pd.DataFrame(y_true_val, columns=output_cols)

            # Identification columns of the metrics row.
            metrics_row = {
                'model_name': model_name,
                'na': na,
                'nb': nb,
                'nf': nf,
                'test case': case,
            }

            print('-----Eavluating model----')
            if np.any(np.isnan(y_pred_train_df)) or np.any(np.isinf(y_pred_train_df)):
                print("[WARN] Train predictions have NaNs or infs.")

            for col in output_cols:
                mse_train = mean_squared_error(y_true_train_df[col], y_pred_train_df[col])
                r2_train = r2_score(y_true_train_df[col], y_pred_train_df[col])

                mse_val = mean_squared_error(y_true_val_df[col], y_pred_val_df[col])
                r2_val = r2_score(y_true_val_df[col], y_pred_val_df[col])

                metrics_row.update({
                    f'r2_train_{col}': r2_train,
                    f'mse_train_{col}': mse_train,
                    f'r2_val_{col}': r2_val,
                    f'mse_val_{col}': mse_val,
                })

            # Append after every model so finished results are kept in the
            # table as the sweep progresses.
            metrics_df_Xgboost_pendulum_ckeck_eta_vel_ver2 = pd.concat(
                [metrics_df_Xgboost_pendulum_ckeck_eta_vel_ver2, pd.DataFrame([metrics_row])],
                ignore_index=True,
            )

            print('-----Saving model----')
            model_save_name = f"{model_name}.joblib"
            model_save_path = os.path.join(save_folder, model_save_name)
            joblib.dump(model, model_save_path)




In [None]:
# Write the pendulum lag-sweep metrics table to CSV.
# Note that this rebinds `save_folder`, which the training cell also sets.
save_folder = "Xgboost/metrics_outpus/pendulum"
os.makedirs(save_folder, exist_ok=True)  # create the folder on first use

file_path = os.path.join(
    save_folder,
    "metrics_df_Xgboost_pendulum_ckeck_eta_vel_ver2.csv",
)
# index=False keeps the row index out of the file.
metrics_df_Xgboost_pendulum_ckeck_eta_vel_ver2.to_csv(path_or_buf=file_path, index=False)

In [None]:
# Reload the stored pendulum metrics tables from CSV, one per study.
pendulum_metrics_dir = "Xgboost/metrics_outpus/pendulum"

metrics_df_Xgboost_pendulum_ckeck_eta_only = pd.read_csv(
    f"{pendulum_metrics_dir}/metrics_df_Xgboost_pendulum_ckeck_eta_only.csv"
)
metrics_df_Xgboost_pendulum_ckeck_eta_vel = pd.read_csv(
    f"{pendulum_metrics_dir}/metrics_df_Xgboost_pendulum_ckeck_eta_vel.csv"
)
metrics_df_Xgboost_pendulum_ckeck_eta_vel_acc = pd.read_csv(
    f"{pendulum_metrics_dir}/metrics_df_Xgboost_pendulum_ckeck_eta_vel_acc.csv"
)
metrics_df_Xgboost_pendulum_ckeck_eta_vel_ver2 = pd.read_csv(
    f"{pendulum_metrics_dir}/metrics_df_Xgboost_pendulum_ckeck_eta_vel_ver2.csv"
)
metrics_df_Xgboost_pendulum_ckeck_eta_vel_acc_ver2 = pd.read_csv(
    f"{pendulum_metrics_dir}/metrics_df_Xgboost_pendulum_ckeck_eta_vel_acc_ver2.csv"
)

In [None]:
# Validation R² versus nf for the pendulum model in the "η only tuned
# hyperparameters" case, drawn as one panel per input-feature set and saved as
# a PNG.

target_values = ['pendulum']

for target_var in target_values:  # 'heave', 'pitch', or 'pendulum'

    # Work on copies so the loaded metrics tables are left untouched.
    df_plot_1 = metrics_df_Xgboost_pendulum_ckeck_eta_only.copy()
    df_plot_2 = metrics_df_Xgboost_pendulum_ckeck_eta_vel.copy()
    df_plot_3 = metrics_df_Xgboost_pendulum_ckeck_eta_vel_acc.copy()

    na_values = [2]
    nb_values = sorted(df_plot_1['nb'].unique())

    # Each nb keeps the same colour in all three panels.
    plotly_palette = px.colors.qualitative.Plotly
    color_map = {
        nb: plotly_palette[i % len(plotly_palette)]
        for i, nb in enumerate(nb_values)
    }
    r2_column = f'r2_val_{target_var}'

    for na_val in na_values:
        df_na_1 = df_plot_1[df_plot_1['na'] == na_val].copy()
        df_na_2 = df_plot_2[df_plot_2['na'] == na_val].copy()
        df_na_3 = df_plot_3[df_plot_3['na'] == na_val].copy()

        fig = make_subplots(
            rows=1,
            cols=3,
            subplot_titles=[
                f"R² Val: η only (na={na_val})",
                f"R² Val: η + ẋ (na={na_val})",
                f"R² Val: η + ẋ + ẍ (na={na_val})",
            ],
            shared_yaxes=True,
        )

        for nb_val in nb_values:
            df_nb_1 = df_na_1[df_na_1['nb'] == nb_val].copy().sort_values(by='nf')
            df_nb_2 = df_na_2[df_na_2['nb'] == nb_val].copy().sort_values(by='nf')
            df_nb_3 = df_na_3[df_na_3['nb'] == nb_val].copy().sort_values(by='nf')

            color = color_map[nb_val]
            trace_label = f'nb={nb_val}'

            # Panel 1: eta only, panel 2: eta + vel, panel 3: eta + vel + acc.
            for panel_idx, panel_df in enumerate((df_nb_1, df_nb_2, df_nb_3), start=1):
                scatter_kwargs = dict(
                    x=panel_df['nf'],
                    y=panel_df[r2_column],
                    mode='lines+markers',
                    name=trace_label,
                    legendgroup=trace_label,
                    line=dict(color=color),
                )
                # Only the first panel adds legend entries; the other panels
                # share them through the legend group.
                if panel_idx > 1:
                    scatter_kwargs['showlegend'] = False
                fig.add_trace(go.Scatter(**scatter_kwargs), row=1, col=panel_idx)

        fig.update_layout(
            title_text=f"{target_var} Validation R² for the Uni-variate Model Using η Only Tuned Hyperparameters (na = {na_val})",
            height=500,
            width=1100,
            template="plotly_white",
            legend_title="nb (Input Lag)",
        )

        for panel_idx in (1, 2, 3):
            fig.update_xaxes(title_text="nf (Disturbance Lags)", row=1, col=panel_idx)
        fig.update_yaxes(title_text="R² Score", row=1, col=1)

        # Make sure the output directory exists before exporting.
        save_dir = "Results/XGBoostHyper/1dof"
        os.makedirs(save_dir, exist_ok=True)

        filename = f"{target_var}.png"
        save_path = os.path.join(save_dir, filename)

        # Export at the same width and height as the layout.
        pio.write_image(fig, save_path, format='png', width=1100, height=500)

        fig.show()


In [None]:
# Validation R² versus nf for the pendulum model in the "corresponding
# hyperparameters" case (the *_ver2 metrics tables), drawn as one panel per
# input-feature set and saved as a PNG.
target_values = ['pendulum']

for target_var in target_values:  # 'heave', 'pitch', or 'pendulum'

    # Work on copies so the loaded metrics tables are left untouched.
    df_plot_1 = metrics_df_Xgboost_pendulum_ckeck_eta_only.copy()
    df_plot_2 = metrics_df_Xgboost_pendulum_ckeck_eta_vel_ver2.copy()
    df_plot_3 = metrics_df_Xgboost_pendulum_ckeck_eta_vel_acc_ver2.copy()

    na_values = [2]
    nb_values = sorted(df_plot_1['nb'].unique())

    # Each nb keeps the same colour in all three panels.
    plotly_palette = px.colors.qualitative.Plotly
    color_map = {
        nb: plotly_palette[i % len(plotly_palette)]
        for i, nb in enumerate(nb_values)
    }
    r2_column = f'r2_val_{target_var}'

    for na_val in na_values:
        df_na_1 = df_plot_1[df_plot_1['na'] == na_val].copy()
        df_na_2 = df_plot_2[df_plot_2['na'] == na_val].copy()
        df_na_3 = df_plot_3[df_plot_3['na'] == na_val].copy()

        fig = make_subplots(
            rows=1,
            cols=3,
            subplot_titles=[
                f"R² Val: η only (na={na_val})",
                f"R² Val: η + ẋ (na={na_val})",
                f"R² Val: η + ẋ + ẍ (na={na_val})",
            ],
            shared_yaxes=True,
        )

        for nb_val in nb_values:
            df_nb_1 = df_na_1[df_na_1['nb'] == nb_val].copy().sort_values(by='nf')
            df_nb_2 = df_na_2[df_na_2['nb'] == nb_val].copy().sort_values(by='nf')
            df_nb_3 = df_na_3[df_na_3['nb'] == nb_val].copy().sort_values(by='nf')

            color = color_map[nb_val]
            trace_label = f'nb={nb_val}'

            # Panel 1: eta only, panel 2: eta + vel, panel 3: eta + vel + acc.
            for panel_idx, panel_df in enumerate((df_nb_1, df_nb_2, df_nb_3), start=1):
                scatter_kwargs = dict(
                    x=panel_df['nf'],
                    y=panel_df[r2_column],
                    mode='lines+markers',
                    name=trace_label,
                    legendgroup=trace_label,
                    line=dict(color=color),
                )
                # Only the first panel adds legend entries; the other panels
                # share them through the legend group.
                if panel_idx > 1:
                    scatter_kwargs['showlegend'] = False
                fig.add_trace(go.Scatter(**scatter_kwargs), row=1, col=panel_idx)

        fig.update_layout(
            title_text=f"{target_var} Validation R² for the Uni-variate Model Using Corresponding Hyperparameters (na = {na_val})",
            height=500,
            width=1100,
            template="plotly_white",
            legend_title="nb (Input Lag)",
        )

        for panel_idx in (1, 2, 3):
            fig.update_xaxes(title_text="nf (Disturbance Lags)", row=1, col=panel_idx)
        fig.update_yaxes(title_text="R² Score", row=1, col=1)

        # Make sure the output directory exists before exporting.
        save_dir = "Results/XGBoostHyper/1dof"
        os.makedirs(save_dir, exist_ok=True)

        filename = f"{target_var}_ver2.png"
        save_path = os.path.join(save_dir, filename)

        # Export at the same width and height as the layout.
        pio.write_image(fig, save_path, format='png', width=1100, height=500)
        fig.show()


# Multivariate Model

## Finding Optimal Lags

Depending on the desired input-feature case, `input_cols` and the X scaler function are adjusted accordingly.

In [None]:
# Empty result tables for the 3-DOF lag sweep: the training loop appends one
# metrics row and one performance row per model. Re-running this cell resets
# both tables.
metrics_df_Xgboost_3dof, perf_df_Xgboost_3dof = pd.DataFrame(), pd.DataFrame()


In [None]:
# Configuration of the 3-DOF lag sweep: model inputs/outputs and the lag
# orders (na, nb, nf) that the training loop iterates over.
input_cols = ['eta', 'eta_velocity', 'eta_acceleration']
output_cols = ['heave', 'pitch', 'pendulum']
na_max = [2]
nb_max = list(range(9))   # 0 .. 8
nf_max = list(range(12))  # 0 .. 11


In [None]:
# Read the stored hyperparameter sets from the eta_Vel_acc JSON file and
# average them into a single parameter set (`xgb_params`).
params_path = "Xgboost/hyperparameters/eta_Vel_acc/best_params_eta_Vel_acc.json"
with open(params_path, "r") as f:
    loaded_params = json.load(f)

print("Loaded params:", loaded_params)

# Gather, per hyperparameter name, the values from every entry in the file.
accumulator = defaultdict(list)
for params in loaded_params.values():
    for key, val in params.items():
        accumulator[key].append(val)

# These hyperparameters are rounded to integers after averaging.
int_keys = {"max_depth", "n_estimators"}

# Mean of each hyperparameter: rounded int for `int_keys`, plain float otherwise.
xgb_params = {}
for param_name, param_values in accumulator.items():
    mean_value = np.mean(param_values)
    if param_name in int_keys:
        xgb_params[param_name] = int(np.round(mean_value))
    else:
        xgb_params[param_name] = float(mean_value)

print("Averaged parameters:", xgb_params)


Predict on only the first 100,000 data points of the training data to make the run faster.

In [None]:
# Lag sweep for the shared 3-DOF model: for every (na, nb, nf) combination a
# multi-output model is trained, evaluated with recursive prediction on the
# training and validation sets, timed, and saved to disk.
# NOTE(review): `base_model` is not defined in the cells directly above (they
# only build `xgb_params`) — confirm it is created from `xgb_params` before
# this cell runs.
# NOTE(review): the markdown note above this cell mentions predicting only the
# first 100000 training points, but no truncation of X_train is visible in
# this cell — confirm.

# Base save folder
save_folder = 'Xgboost/saved_models/3dof/ver2_extra'

# Create the directory if it doesn't exist
os.makedirs(save_folder, exist_ok=True)

# loop over lags 

for na in na_max :
    for nb in nb_max :
      for nf in nf_max:
      

        # Model name encodes the lag orders; it is also used as the file name.
        model_name = F'Xgboost_3dof_ver2_extra'  + '_na' + str(na) + '_nb' + str(nb) + '_nf' + str(nf)


        # Create the model: one regressor per output column, wrapped by
        # MultiOutputRegressor.
        model = MultiOutputRegressor(base_model)
            
        # prepare training data 
        print(f'-----Preprocessing case of na={str(na)} ,nb={str(nb)} and nf={str(nf)} ----')
    
       
        dfx_train,dfy_train,yi_train = build_arx_lagged_with_scalers(
                df = df_case_train,
                input_cols  = input_cols,
                output_cols   = output_cols,
                scaler_X_func   = scaler_X_func_all, 
                scaler_y_func   = scaler_y_func,
                na=na,
                nb_past=nb,
                nf_future=nf,
                test_name_col='test_name',
                y_initial_mode='zero'  # 'original' ➔ skip initial rows, 'zero' ➔ pad lags with zeros
        )
            
        y_train_target = dfy_train[output_cols].reset_index(drop=True)

        # 'test_name' is dropped before the features are passed to the model.
        X_train=dfx_train.drop(columns='test_name')

        # prepare validation data
       

        dfx_val,dfy_val,yi_val = build_arx_lagged_with_scalers(
                df = df_case_val,
                input_cols  = input_cols,
                output_cols   = output_cols,
                scaler_X_func   = scaler_X_func_all,
                scaler_y_func   = scaler_y_func,
                na=na,
                nb_past=nb,
                nf_future=nf,
                test_name_col='test_name',
                y_initial_mode='original'  # 'original' ➔ skip initial rows, 'zero' ➔ pad lags with zeros
        )
        
        y_val_target = dfy_val[output_cols].reset_index(drop=True)

        X_val=dfx_val.drop(columns='test_name')
      
        # ============================
        # MEASURE CPU & MEMORY USAGE
        # ============================
        print(f'-----Training model----')
        process = psutil.Process()

        # Train the model; train_time is the wall-clock duration of fit() only.
        start_time = time.perf_counter()
        model.fit(X_train  ,y_train_target )
        train_time = time.perf_counter() - start_time
        # Resident set size of this process right after training, in MB.
        memory_usage_train = process.memory_info().rss / (1024 * 1024)  # MB


        # get feature names
        X_feature_names = X_train.columns.tolist()


        print(f'-----predicting on training data----')

        # Predict on train and validation data.
        # The timer started here is stopped only after both recursive
        # predictions and the inverse scaling below, so predict_time covers
        # all of them.
        start_time = time.perf_counter()

        y_pred_train_scaled,x_used_train = predict_recursive_series(
        model=model,
        X_df=X_train,
        output_cols=output_cols,
        X_feature_names=X_feature_names,
        na=na
            )

        print(f'-----predicting on validation data----')
        
        y_pred_val_scaled , x_used_val=predict_recursive_series(
        model=model,
        X_df=X_val,
        output_cols=output_cols,
        X_feature_names=X_feature_names,
        na=na
            )



        # Inverse transform to original scale
        y_pred_train = scaler_y.inverse_transform(y_pred_train_scaled)
        y_pred_val = scaler_y.inverse_transform(y_pred_val_scaled)


        predict_time = time.perf_counter() - start_time
        # Resident set size of this process right after prediction, in MB.
        memory_usage_predict = process.memory_info().rss / (1024 * 1024)  # MB

        # Convert predictions to DataFrames for easier handling
        y_pred_train_df = pd.DataFrame(y_pred_train, columns=output_cols)
        y_pred_val_df = pd.DataFrame(y_pred_val, columns=output_cols)


        # Get true values aligned with dfy_train and dfy_val indexes
        y_true_train =scaler_y.inverse_transform(dfy_train[output_cols].reset_index(drop=True))
        y_true_train_df = pd.DataFrame(y_true_train, columns=output_cols)

        y_true_val = scaler_y.inverse_transform(dfy_val[output_cols].reset_index(drop=True))
        y_true_val_df = pd.DataFrame(y_true_val, columns=output_cols)



        # save model info to dfs
        # `case` is not assigned in this cell; it must already exist in the session.
        metrics_row = {
        'model_name': model_name,
        'na': na,
        'nb': nb,
        'nf': nf,
        'test case': case,
        }

        perf_row = {
        'model_name': model_name,
        'na': na,
        'nb': nb,
        'nf': nf,
        'train_time': train_time,
        'train_memory_MB': memory_usage_train,
        'predict_time': predict_time,
        'predict_memory_MB': memory_usage_predict
            }
        print(f'-----Eavluating model----')
        # Compute metrics
        # Only the training predictions are checked for NaN/inf here; the
        # result is a warning, not an abort.
        if np.any(np.isnan(y_pred_train_df)) or np.any(np.isinf(y_pred_train_df)):
            print("[WARN] Train predictions have NaNs or infs.")

        # One MSE / R² pair per output column, for both data sets.
        for col in output_cols:
            mse_train = mean_squared_error(y_true_train_df[col], y_pred_train_df[col])
            r2_train = r2_score(y_true_train_df[col], y_pred_train_df[col])

            mse_val = mean_squared_error(y_true_val_df[col], y_pred_val_df[col])
            r2_val = r2_score(y_true_val_df[col], y_pred_val_df[col])

            # Add to metrics row
            metrics_row[f'r2_train_{col}'] = r2_train
            metrics_row[f'mse_train_{col}'] = mse_train
            metrics_row[f'r2_val_{col}'] = r2_val
            metrics_row[f'mse_val_{col}'] = mse_val

        print("metrics_row:", metrics_row)
        print("perf_row:", perf_row)
        # Append the row dictionaries as new rows in the DataFrames
        metrics_df_Xgboost_3dof = pd.concat([metrics_df_Xgboost_3dof, pd.DataFrame([metrics_row])], ignore_index=True)
        perf_df_Xgboost_3dof = pd.concat([perf_df_Xgboost_3dof, pd.DataFrame([perf_row])], ignore_index=True)
        print(f'-----Saving model----')
        # Save the trained model under <save_folder>/<model_name>.joblib
        model_save_name = f"{model_name}.joblib"  # You already have model_name variable!
        model_save_path = os.path.join(save_folder, model_save_name)

        # Save with joblib
        joblib.dump(model, model_save_path)   




In [None]:
# Write the 3-DOF lag-sweep result tables (metrics and performance) to CSV.
# Note that this rebinds `save_folder`, which the training cell also sets.
save_folder = "Xgboost/metrics_outpus/3dof"
os.makedirs(save_folder, exist_ok=True)  # create the folder on first use

# Each table is stored as <save_folder>/<table name>.csv without the row index.
for table_name, table in (
    ("metrics_df_Xgboost_3dof", metrics_df_Xgboost_3dof),
    ("perf_df_Xgboost_3dof", perf_df_Xgboost_3dof),
):
    file_path = os.path.join(save_folder, f"{table_name}.csv")
    table.to_csv(file_path, index=False)

    print(f"Saved {table_name} to: {file_path}")

### Plotting Data From Remote Server

- ver 1 = η only as input
- ver 2 = η + η velocity + η acceleration as inputs

In [None]:
# Reload the 3-DOF metrics tables stored under Xgboost/R_M and merge the three
# ver2 tables into one.
ver1_metrics_dir = 'Xgboost/R_M/metrics_outpus/3dof/ver1'
ver2_metrics_dir = 'Xgboost/R_M/metrics_outpus/3dof/ver2'

metrics_df_Xgboost_3dof_ver_1 = pd.read_csv(
    f'{ver1_metrics_dir}/metrics_df_Xgboost_3dof_ver1.csv'
)
metrics_df_Xgboost_3dof_ver_2 = pd.read_csv(
    f'{ver2_metrics_dir}/metrics_df_Xgboost_3dof_ver2.csv'
)
metrics_df_Xgboost_3dof_ver_2_extra = pd.read_csv(
    f'{ver2_metrics_dir}/metrics_df_Xgboost_3dof_ver2_extra.csv'
)
metrics_df_Xgboost_3dof_ver_2_extra_2 = pd.read_csv(
    f'{ver2_metrics_dir}/metrics_df_Xgboost_3dof_ver2_extra_2.csv'
)

# Stack the base ver2 table and its two "extra" tables; rows are renumbered.
metrics_df_Xgboost_3dof_ver_2_final = pd.concat(
    [
        metrics_df_Xgboost_3dof_ver_2,
        metrics_df_Xgboost_3dof_ver_2_extra,
        metrics_df_Xgboost_3dof_ver_2_extra_2,
    ],
    ignore_index=True,
)

In [None]:
# Train and validation R² versus nb for the shared 3-DOF model with η, η̇ and
# η̈ as inputs (the merged ver_2 metrics). One figure is produced per output,
# with one line per nf, and saved as a PNG.

target_values = ['heave', 'pitch', 'pendulum']

for target_var in target_values:
    # Only the na == 2 rows are plotted.
    na_mask = metrics_df_Xgboost_3dof_ver_2_final['na'] == 2
    df_plot = metrics_df_Xgboost_3dof_ver_2_final[na_mask].copy()

    na_values = sorted(df_plot['na'].unique())
    nf_values = sorted(df_plot['nf'].unique())  # legend entries
    nb_values = sorted(df_plot['nb'].unique())  # x-axis values

    # One colour per nf, cycling through the Plotly palette. nf = 10 and 11
    # are then overridden with two D3 colours.
    plotly_palette = px.colors.qualitative.Plotly
    color_map = {
        nf: plotly_palette[i % len(plotly_palette)]
        for i, nf in enumerate(nf_values)
    }
    color_map[10] = px.colors.qualitative.D3[8]
    color_map[11] = px.colors.qualitative.D3[3]

    for na_val in na_values:
        df_na = df_plot[df_plot['na'] == na_val].copy()

        fig = make_subplots(
            rows=1,
            cols=2,
            subplot_titles=["R² Train", "R² Val "],
            shared_yaxes=False,
        )

        for nf_val in nf_values:
            df_nf = df_na[df_na['nf'] == nf_val].copy().sort_values(by='nb')
            color = color_map[nf_val]
            # The legend shows nf with its sign flipped, labelled "nd".
            trace_label = f'nd={-nf_val}'

            # Panel 1: train R², panel 2: validation R².
            for panel_idx, split in enumerate(('train', 'val'), start=1):
                scatter_kwargs = dict(
                    x=df_nf['nb'],
                    y=df_nf[f'r2_{split}_{target_var}'],
                    mode='lines+markers',
                    name=trace_label,
                    legendgroup=trace_label,
                    line=dict(color=color),
                )
                # Only the train panel adds legend entries; the validation
                # panel shares them through the legend group.
                if panel_idx > 1:
                    scatter_kwargs['showlegend'] = False
                fig.add_trace(go.Scatter(**scatter_kwargs), row=1, col=panel_idx)

        fig.update_layout(
            title_text=f"R² Performance of XGBoost Shared Model for All DOFs <br> η , η̇ and η̈ as Input Features, na = {na_val} ({target_var}) <br>",
            xaxis_title="nb ",
            xaxis2_title="nb",
            yaxis_title="R² Score",
            template="plotly_white",
            height=600,
            width=1350,
            legend_title="nd",
            #yaxis=dict(range=[0, 1.05])
        )

        # Output location.
        # NOTE(review): the filename contains "eta_only" although this cell
        # plots the ver_2 (η, η̇, η̈) metrics — confirm before renaming.
        output_dir = "Xgboost/final_plots/3dof_ver2"
        os.makedirs(output_dir, exist_ok=True)

        filename = f"Validation_R2_eta_only_na{na_val}_{target_var}_ver2.png"
        save_path = os.path.join(output_dir, filename)

        # scale=2 exports at a higher resolution.
        fig.write_image(save_path, scale=2)

        fig.show()


In [None]:
# Train and validation R² versus nb for the shared 3-DOF model with η only as
# input (the ver_1 metrics). One figure is shown per output, with one line per
# nf. The PNG export is currently disabled (see the note near the end).

target_values = ['heave', 'pitch', 'pendulum']

for target_var in target_values:
    # Only the na == 2 rows are plotted.
    na_mask = metrics_df_Xgboost_3dof_ver_1['na'] == 2
    df_plot = metrics_df_Xgboost_3dof_ver_1[na_mask].copy()

    na_values = sorted(df_plot['na'].unique())
    nf_values = sorted(df_plot['nf'].unique())  # legend entries
    nb_values = sorted(df_plot['nb'].unique())  # x-axis values

    # One colour per nf, cycling through the Plotly palette. nf = 10 and 11
    # are then overridden with two D3 colours.
    plotly_palette = px.colors.qualitative.Plotly
    color_map = {
        nf: plotly_palette[i % len(plotly_palette)]
        for i, nf in enumerate(nf_values)
    }
    color_map[10] = px.colors.qualitative.D3[8]
    color_map[11] = px.colors.qualitative.D3[3]

    for na_val in na_values:
        df_na = df_plot[df_plot['na'] == na_val].copy()

        fig = make_subplots(
            rows=1,
            cols=2,
            subplot_titles=["R² Train", "R² Val "],
            shared_yaxes=False,
        )

        for nf_val in nf_values:
            df_nf = df_na[df_na['nf'] == nf_val].copy().sort_values(by='nb')
            color = color_map[nf_val]
            # The legend shows nf with its sign flipped, labelled "nd".
            trace_label = f'nd={-nf_val}'

            # Panel 1: train R², panel 2: validation R².
            for panel_idx, split in enumerate(('train', 'val'), start=1):
                scatter_kwargs = dict(
                    x=df_nf['nb'],
                    y=df_nf[f'r2_{split}_{target_var}'],
                    mode='lines+markers',
                    name=trace_label,
                    legendgroup=trace_label,
                    line=dict(color=color),
                )
                # Only the train panel adds legend entries; the validation
                # panel shares them through the legend group.
                if panel_idx > 1:
                    scatter_kwargs['showlegend'] = False
                fig.add_trace(go.Scatter(**scatter_kwargs), row=1, col=panel_idx)

        # Title typo fixed: "nput Feature" -> "Input Feature".
        fig.update_layout(
            title_text=f"R² Performance of XGBoost Shared Model for All DOFs <br> η only as an Input Feature, na = {na_val} ({target_var}) <br>",
            xaxis_title="nb ",
            xaxis2_title="nb",
            yaxis_title="R² Score",
            template="plotly_white",
            height=600,
            width=1350,
            legend_title="nd",
            #yaxis=dict(range=[0, 1.05])
        )

        # Output location (`os` is already imported at the top of the notebook,
        # so it is no longer re-imported inside the loop).
        # NOTE(review): this directory and filename are identical to the ones
        # written by the η + η̇ + η̈ plotting cell, so enabling the export below
        # as-is would overwrite those images — pick a ver1 path first.
        output_dir = "Xgboost/final_plots/3dof_ver2"
        os.makedirs(output_dir, exist_ok=True)

        filename = f"Validation_R2_eta_only_na{na_val}_{target_var}_ver2.png"
        save_path = os.path.join(output_dir, filename)

        # Export is disabled; scale=2 would export at a higher resolution.
        #fig.write_image(save_path, scale=2)

        fig.show()


## Loading Models to Calculate the Full R² Score

In [None]:
# Load every saved model (*.joblib) found in the "Best_combination" folder
# into a dict keyed by file name.
folder = 'Xgboost/R_M/saved_models/3dof/ver2/Best_combination'

# Keep only the joblib files, in the order os.listdir returns them.
model_files = [entry for entry in os.listdir(folder) if entry.endswith('.joblib')]

models = {}
for fname in model_files:
    model_path = os.path.join(folder, fname)
    model = joblib.load(model_path)
    models.update({fname: model})


In [None]:
# Start with an empty training-metrics table; the loop over the loaded models
# below appends one row per model. Re-running this cell discards any rows
# collected so far.
train_metrics_df_Xgboost_3dof= pd.DataFrame()

In [None]:
# Compute training-set R² / MSE for every loaded model, using recursive
# prediction on the training data, and collect one row per model in
# `train_metrics_df_Xgboost_3dof`.

# define loop values
input_cols = ['eta', 'eta_velocity', 'eta_acceleration']
output_cols = ['heave', 'pitch', 'pendulum']

# The lag orders are read from the file name: "...na<na>_nb<nb>_nf<nf>...".
# Compiled once instead of on every iteration.
pattern = r'na(\d+)_nb(\d+)_nf(\d+)'
lag_regex = re.compile(pattern)

for fname, model in models.items():
    print(f"Filename: {fname}")

    # extract na, nb, nf from filenames
    match = lag_regex.search(fname)
    if match is None:
        # A .joblib file without the lag pattern previously crashed the whole
        # loop with AttributeError; skip it and keep processing the others.
        print(f"[WARN] Skipping {fname}: filename has no na/nb/nf lag pattern.")
        continue
    na, nb, nf = map(int, match.groups())
    print(f"{fname} ➤ na: {na}, nb: {nb}, nf: {nf}")

    # prepare training data with the lags this model was saved under
    dfx_train, dfy_train, yi_train = build_arx_lagged_with_scalers(
        df=df_case_train,
        input_cols=input_cols,
        output_cols=output_cols,
        scaler_X_func=scaler_X_func_all,
        scaler_y_func=scaler_y_func,
        na=na,
        nb_past=nb,
        nf_future=nf,
        test_name_col='test_name',
        y_initial_mode='zero',  # 'original' ➔ skip initial rows, 'zero' ➔ pad lags with zeros
    )

    y_train_target = dfy_train[output_cols].reset_index(drop=True)

    X_train = dfx_train.drop(columns='test_name')

    X_feature_names = X_train.columns.tolist()

    # predict on train data
    y_pred_train_scaled, x_used_train = predict_recursive_series(
        model=model,
        X_df=X_train,
        output_cols=output_cols,
        X_feature_names=X_feature_names,
        na=na,
    )

    # Inverse transform predictions and targets to the original scale
    y_pred_train = scaler_y.inverse_transform(y_pred_train_scaled)
    y_pred_train_df = pd.DataFrame(y_pred_train, columns=output_cols)
    y_true_train = scaler_y.inverse_transform(dfy_train[output_cols].reset_index(drop=True))
    y_true_train_df = pd.DataFrame(y_true_train, columns=output_cols)

    # save model info to dfs; the disturbance lag is stored as nd = -nf
    metrics_row = {
        'na': na,
        'nb': nb,
        'nd': -1*nf,
    }

    for col in output_cols:
        mse_train = mean_squared_error(y_true_train_df[col], y_pred_train_df[col])
        r2_train = r2_score(y_true_train_df[col], y_pred_train_df[col])

        # Add to metrics row
        metrics_row[f'r2_train_{col}'] = r2_train
        metrics_row[f'mse_train_{col}'] = mse_train

    # Append after every model so finished results are kept in the table even
    # if a later model fails.
    train_metrics_df_Xgboost_3dof = pd.concat(
        [train_metrics_df_Xgboost_3dof, pd.DataFrame([metrics_row])],
        ignore_index=True,
    )


In [None]:
# Save the training metrics DataFrame. The target directory is created first
# (as the other save cells in this notebook do) so that to_csv does not fail
# when the folder does not exist yet.
train_metrics_dir = 'Xgboost/R_M/metrics_outpus/3dof/ver2'
os.makedirs(train_metrics_dir, exist_ok=True)
train_metrics_df_Xgboost_3dof.to_csv(
    os.path.join(train_metrics_dir, 'train_metrics_df_Xgboost_3dof_ver2.csv'),
    index=False,
)

In [None]:
# Display the collected training metrics (one row per evaluated model)
train_metrics_df_Xgboost_3dof

## Testing the selected models

In [None]:
# Name of the test case that was used for training (flagged in the test metrics)
training_case = 'Tp6p8s_Hs2m'

# Input and output columns shared by all selected models
input_cols = ['eta', 'eta_velocity', 'eta_acceleration']
output_cols = ['heave', 'pitch', 'pendulum']

In [None]:
# Base folder that receives the per-model test results (predictions, ground truth, metrics).
# NOTE(review): this path starts with 'XGboost' while the other paths in this
# notebook use 'Xgboost'; on a case-sensitive file system these are different
# directories — confirm which one is intended.
save_folder = 'XGboost/results/3dof_model/test_best_models'  # can be changed to any other directory

# Create the folder (and any missing parents); no error if it already exists
os.makedirs(save_folder, exist_ok=True)

In [None]:
# Evaluate every loaded model on every test case with the recursive predictor.
# Results are kept in memory (metrics table and true/predicted series per
# model) and the series are also written to CSV files in `save_folder`.
all_models_metrics = {}  # model file name -> metrics DataFrame (one row per test case)
y_true_pred_dict = {}    # model file name -> {case: {'y_true': df, 'y_pred': df}}

# Lag orders are read from the file name, e.g. "..._na2_nb0_nf10.joblib"
pattern = r'na(\d+)_nb(\d+)_nf(\d+)'

# The set of test cases does not depend on the model
cases = df_test_full['test_name'].unique()

for fname, model in models.items():
    print(f"Filename: {fname}")

    match = re.search(pattern, fname)
    if match is None:
        # Without the lag orders the feature matrix cannot be rebuilt; skip the
        # file instead of failing with AttributeError on `match.groups()`.
        print(f"[WARN] Skipping {fname}: no 'na<N>_nb<N>_nf<N>' tag in the file name")
        continue
    na, nb, nf = map(int, match.groups())
    print(f"{fname} ➤ na: {na}, nb: {nb}, nf: {nf}")

    metrics_rows = []        # one metrics dict per test case for this model
    model_y_true_pred = {}   # true/predicted series per test case for this model

    for case in cases:
        df_case_test = df_test_full[df_test_full['test_name'] == case].reset_index(drop=True)

        # Lagged, scaled test matrices for this case
        dfx_test, dfy_test, yi_test = build_arx_lagged_with_scalers(
            df=df_case_test,
            input_cols=input_cols,
            output_cols=output_cols,
            scaler_X_func=scaler_X_func_all,
            scaler_y_func=scaler_y_func,
            na=na,
            nb_past=nb,
            nf_future=nf,
            test_name_col='test_name',
            y_initial_mode='original',
        )

        # The model is fed every column except the test-case label
        X_test_selected_df = dfx_test.drop(columns='test_name')
        X_feature_names = X_test_selected_df.columns.tolist()

        # Recursive prediction over the test series
        y_pred_test_scaled, x_used_test = predict_recursive_series(
            model=model,
            X_df=X_test_selected_df,
            output_cols=output_cols,
            X_feature_names=X_feature_names,
            na=na,
        )

        # Inverse transform predictions and targets to the original scale.
        # NOTE(review): `scaler_y` is defined outside this cell; presumably the
        # fitted scaler behind `scaler_y_func` — confirm.
        y_pred_test = scaler_y.inverse_transform(y_pred_test_scaled)
        y_pred_test_df = pd.DataFrame(y_pred_test, columns=output_cols)

        y_true_test = scaler_y.inverse_transform(dfy_test[output_cols].reset_index(drop=True))
        y_true_test_df = pd.DataFrame(y_true_test, columns=output_cols)

        model_y_true_pred[case] = {
            'y_true': y_true_test_df,
            'y_pred': y_pred_test_df
        }

        metrics_test = {
            'model': fname,
            'na': na,
            'nb': nb,
            'nf': nf,
            'test case': case
        }

        # Mark the case the models were trained on so it can be told apart
        if case == training_case:
            metrics_test['Comments'] = 'case used for training'

        for col in output_cols:
            metrics_test[f'r2_test_{col}'] = r2_score(y_true_test_df[col], y_pred_test_df[col])
            metrics_test[f'mse_test_{col}'] = mean_squared_error(y_true_test_df[col], y_pred_test_df[col])

        metrics_rows.append(metrics_test)

    # Store per-model data
    metrics_df_test_all = pd.DataFrame(metrics_rows)
    all_models_metrics[fname] = metrics_df_test_all
    y_true_pred_dict[fname] = model_y_true_pred

    print(f"\nMetrics for model {fname}:")
    display(metrics_df_test_all)

    # Write the true and predicted series of every case to CSV
    for case, data in model_y_true_pred.items():
        y_true_path = os.path.join(save_folder, f'{fname}_case_{case}_y_true.csv')
        y_pred_path = os.path.join(save_folder, f'{fname}_case_{case}_y_pred.csv')

        data['y_true'].to_csv(y_true_path, index=False)
        data['y_pred'].to_csv(y_pred_path, index=False)

In [None]:
# Write each model's test metrics to "<model file name>_metrics.csv" in save_folder
for fname in all_models_metrics:
    all_models_metrics[fname].to_csv(
        os.path.join(save_folder, f"{fname}_metrics.csv"), index=False
    )


## dt Sensitivity

Series configuration

In [None]:
# Empty table that the series-configuration dt sweep fills row by row
metrics_df_XGBOOST_3dof_dt_test_nb0_nf10_new = pd.DataFrame()


In [None]:
# Read the stored hyperparameter sets from the JSON file
with open("Xgboost/hyperparameters/eta_Vel_acc/best_params_eta_Vel_acc.json", "r") as f:
    loaded_params = json.load(f)
print("Loaded params:", loaded_params)

# Gather, per hyperparameter name, the values found in every stored set
accumulator = defaultdict(list)
for params in loaded_params.values():
    for key, val in params.items():
        accumulator[key].append(val)

# Hyperparameters that must stay integers after averaging
int_keys = {"max_depth", "n_estimators"}

# Average every hyperparameter over the stored sets (rounded for the integer ones)
xgb_params = {}
for hp_name, hp_values in accumulator.items():
    hp_mean = np.mean(hp_values)
    if hp_name in int_keys:
        xgb_params[hp_name] = int(np.round(hp_mean))
    else:
        xgb_params[hp_name] = float(hp_mean)
print("Averaged parameters:", xgb_params)

# Single-output regressor with the averaged settings, plus its multi-output wrapper
base_model = XGBRegressor(**xgb_params)
multi_output_model = MultiOutputRegressor(base_model)

In [None]:
# dt-sensitivity study, series (recursive) configuration: for every factor in
# `steps` the lagged data are thinned to every `step`-th row, the shared
# XGBoost model is refitted and the recursive prediction is scored on the
# training and the testing case.
#
# NOTE(review): this cell does not assign `steps`, `na`, `nb`, `nf`, `case` or
# `model_name`; it uses whatever earlier cells left in the namespace (in the
# visible part of the notebook `steps` is only assigned in the parallel
# configuration further down) — confirm the intended values before running.

# Arguments shared by the training and the testing feature construction
lag_kwargs = dict(
    input_cols=input_cols,
    output_cols=output_cols,
    scaler_X_func=scaler_X_func_all,
    scaler_y_func=scaler_y_func,
    na=na,
    nb_past=nb,
    nf_future=nf,
    test_name_col='test_name',
)

# Training data: 'zero' ➔ pad the initial lags with zeros
dfx_train_full, dfy_train_full, yi_train = build_arx_lagged_with_scalers(
    df=df_case_train, y_initial_mode='zero', **lag_kwargs
)
y_train_target_full = dfy_train_full[output_cols].reset_index(drop=True)
X_train = dfx_train_full.drop(columns='test_name')

# Testing data: 'original' ➔ skip the initial rows
dfx_test_full, dfy_test_full, yi_test = build_arx_lagged_with_scalers(
    df=df_case_test, y_initial_mode='original', **lag_kwargs
)
y_test_target = dfy_test_full[output_cols].reset_index(drop=True)
X_test = dfx_test_full.drop(columns='test_name')

for step in steps:

    # Sampling interval after thinning (0.05 is presumably the base sample
    # time — TODO confirm). Kept under its own name because `dt` is the alias
    # of the datetime module imported at the top of the notebook.
    dt_step = 0.05 * step

    # Keep every `step`-th row.
    # NOTE(review): rows are thinned after the lag columns were built, so the
    # lag columns presumably still refer to the original sample spacing —
    # confirm this is intended.
    X_train_selected_df = X_train.iloc[::step].reset_index(drop=True)
    y_train_target = y_train_target_full.iloc[::step].reset_index(drop=True)
    X_test_selected_df = X_test.iloc[::step].reset_index(drop=True)
    dfy_train = dfy_train_full.iloc[::step].reset_index(drop=True)
    dfy_test = dfy_test_full.iloc[::step].reset_index(drop=True)

    # Fresh multi-output wrapper for every sampling interval
    model = MultiOutputRegressor(base_model)

    print('-----Training model----')
    model.fit(X_train_selected_df, y_train_target)

    X_feature_names = X_train_selected_df.columns.tolist()

    print('-----predicting on training data----')
    y_pred_train_scaled, x_used_train = predict_recursive_series(
        model=model,
        X_df=X_train_selected_df,
        output_cols=output_cols,
        X_feature_names=X_feature_names,
        na=na,
    )

    print('-----predicting on Testing data----')
    y_pred_test_scaled, x_used_test = predict_recursive_series(
        model=model,
        X_df=X_test_selected_df,
        output_cols=output_cols,
        X_feature_names=X_feature_names,
        na=na,
    )

    # Inverse transform predictions and targets to the original scale
    y_pred_train = scaler_y.inverse_transform(y_pred_train_scaled)
    y_pred_test = scaler_y.inverse_transform(y_pred_test_scaled)
    y_pred_train_df = pd.DataFrame(y_pred_train, columns=output_cols)
    y_pred_test_df = pd.DataFrame(y_pred_test, columns=output_cols)

    y_true_train = scaler_y.inverse_transform(dfy_train[output_cols].reset_index(drop=True))
    y_true_train_df = pd.DataFrame(y_true_train, columns=output_cols)
    y_true_test = scaler_y.inverse_transform(dfy_test[output_cols].reset_index(drop=True))
    y_true_test_df = pd.DataFrame(y_true_test, columns=output_cols)

    # Identification of this run in the metrics table
    metrics_row = {
        'model_name': model_name,
        'na': na,
        'nb': nb,
        'nf': nf,
        'xcase': 3,
        'test case': case,
        'dt': dt_step,
    }

    print('-----Eavluating model----')
    if np.any(np.isnan(y_pred_train_df)) or np.any(np.isinf(y_pred_train_df)):
        print("[WARN] Train predictions have NaNs or infs.")

    for col in output_cols:
        metrics_row[f'r2_train_{col}'] = r2_score(y_true_train_df[col], y_pred_train_df[col])
        metrics_row[f'mse_train_{col}'] = mean_squared_error(y_true_train_df[col], y_pred_train_df[col])
        metrics_row[f'r2_test_{col}'] = r2_score(y_true_test_df[col], y_pred_test_df[col])
        metrics_row[f'mse_test_{col}'] = mean_squared_error(y_true_test_df[col], y_pred_test_df[col])

    print("metrics_row:", metrics_row)

    # Appended per iteration so finished runs survive a failure further on
    row_df = pd.DataFrame([metrics_row])
    metrics_df_XGBOOST_3dof_dt_test_nb0_nf10_new = pd.concat(
        [metrics_df_XGBOOST_3dof_dt_test_nb0_nf10_new, row_df], ignore_index=True
    )



In [None]:
# Destination of the series-configuration dt-sweep metrics
save_folder = "Xgboost/metrics_outpus/3dof/dt_test-new"
file_path = os.path.join(save_folder, "metrics_df_XGBOOST_3dof_dt_test_nb0_nf10_new.csv")

# Make sure the folder exists, then write the table without its index
os.makedirs(save_folder, exist_ok=True)
metrics_df_XGBOOST_3dof_dt_test_nb0_nf10_new.to_csv(file_path, index=False)

print(f"Saved metrics_df_XGBOOST_3dof_dt_test_nb0_nf10_new to: {file_path}")



Parallel configuration

In [None]:
# Empty table that the parallel-configuration dt sweep fills row by row
metrics_df_XGBOOST_3dof_dt_test_nb0_nf10_new_parralel = pd.DataFrame()

In [None]:
# Down-sampling factors swept by the dt study
steps = list(range(1, 7))

# Input and output columns of the model
input_cols = ['eta', 'eta_velocity', 'eta_acceleration']
output_cols = ['heave', 'pitch', 'pendulum']

# Lag orders passed to the feature builder as na / nb_past / nf_future
na, nb, nf = 2, 0, 10

In [None]:
# Read the stored hyperparameter sets from the JSON file
with open("Xgboost/hyperparameters/eta_Vel_acc/best_params_eta_Vel_acc.json", "r") as f:
    loaded_params = json.load(f)
print("Loaded params:", loaded_params)

# Collect the values of each hyperparameter across all stored sets
accumulator = defaultdict(list)
for params in loaded_params.values():
    for key, val in params.items():
        accumulator[key].append(val)

# Hyperparameters that are cast back to int after averaging
int_keys = {"max_depth", "n_estimators"}


def _averaged(hp_name, hp_values):
    """Return the mean of hp_values, rounded to int when hp_name is in int_keys."""
    hp_mean = np.mean(hp_values)
    return int(np.round(hp_mean)) if hp_name in int_keys else float(hp_mean)


xgb_params = {hp_name: _averaged(hp_name, hp_values) for hp_name, hp_values in accumulator.items()}
print("Averaged parameters:", xgb_params)

# Single-output regressor with the averaged settings, plus its multi-output wrapper
base_model = XGBRegressor(**xgb_params)
multi_output_model = MultiOutputRegressor(base_model)

In [None]:
# dt-sensitivity study, parallel configuration: for every factor in `steps`
# the lagged data are thinned to every `step`-th row, the shared XGBoost model
# is refitted and scored with a direct `model.predict` on the lagged features
# (no recursive feedback of predictions).
#
# NOTE(review): `case` and `model_name` are not assigned in this cell; they
# come from whatever earlier cells left in the namespace — confirm.

# Arguments shared by the training and the testing feature construction
lag_kwargs = dict(
    input_cols=input_cols,
    output_cols=output_cols,
    scaler_X_func=scaler_X_func_all,
    scaler_y_func=scaler_y_func,
    na=na,
    nb_past=nb,
    nf_future=nf,
    test_name_col='test_name',
)

# Training data: 'zero' ➔ pad the initial lags with zeros
dfx_train_full, dfy_train_full, yi_train = build_arx_lagged_with_scalers(
    df=df_case_train, y_initial_mode='zero', **lag_kwargs
)
y_train_target_full = dfy_train_full[output_cols].reset_index(drop=True)
X_train = dfx_train_full.drop(columns='test_name')

# Testing data: 'original' ➔ skip the initial rows
dfx_test_full, dfy_test_full, yi_test = build_arx_lagged_with_scalers(
    df=df_case_test, y_initial_mode='original', **lag_kwargs
)
y_test_target = dfy_test_full[output_cols].reset_index(drop=True)
X_test = dfx_test_full.drop(columns='test_name')

for step in steps:

    # Sampling interval after thinning (0.05 is presumably the base sample
    # time — TODO confirm). Kept under its own name because `dt` is the alias
    # of the datetime module imported at the top of the notebook.
    dt_step = 0.05 * step

    # Keep every `step`-th row of features and targets
    X_train_selected_df = X_train.iloc[::step].reset_index(drop=True)
    y_train_target = y_train_target_full.iloc[::step].reset_index(drop=True)
    X_test_selected_df = X_test.iloc[::step].reset_index(drop=True)
    dfy_train = dfy_train_full.iloc[::step].reset_index(drop=True)
    dfy_test = dfy_test_full.iloc[::step].reset_index(drop=True)

    # Fresh multi-output wrapper for every sampling interval
    model = MultiOutputRegressor(base_model)

    print('-----Training model----')
    model.fit(X_train_selected_df, y_train_target)

    X_feature_names = X_train_selected_df.columns.tolist()

    print('-----predicting on training data----')
    y_pred_train_scaled = model.predict(X_train_selected_df)

    print('-----predicting on Testing data----')
    y_pred_test_scaled = model.predict(X_test_selected_df)

    # Inverse transform predictions and targets to the original scale
    y_pred_train = scaler_y.inverse_transform(y_pred_train_scaled)
    y_pred_test = scaler_y.inverse_transform(y_pred_test_scaled)
    y_pred_train_df = pd.DataFrame(y_pred_train, columns=output_cols)
    y_pred_test_df = pd.DataFrame(y_pred_test, columns=output_cols)

    y_true_train = scaler_y.inverse_transform(dfy_train[output_cols].reset_index(drop=True))
    y_true_train_df = pd.DataFrame(y_true_train, columns=output_cols)
    y_true_test = scaler_y.inverse_transform(dfy_test[output_cols].reset_index(drop=True))
    y_true_test_df = pd.DataFrame(y_true_test, columns=output_cols)

    # Identification of this run in the metrics table
    metrics_row = {
        'model_name': model_name,
        'na': na,
        'nb': nb,
        'nf': nf,
        'xcase': 3,
        'test case': case,
        'dt': dt_step,
    }

    print('-----Eavluating model----')
    if np.any(np.isnan(y_pred_train_df)) or np.any(np.isinf(y_pred_train_df)):
        print("[WARN] Train predictions have NaNs or infs.")

    for col in output_cols:
        metrics_row[f'r2_train_{col}'] = r2_score(y_true_train_df[col], y_pred_train_df[col])
        metrics_row[f'mse_train_{col}'] = mean_squared_error(y_true_train_df[col], y_pred_train_df[col])
        metrics_row[f'r2_test_{col}'] = r2_score(y_true_test_df[col], y_pred_test_df[col])
        metrics_row[f'mse_test_{col}'] = mean_squared_error(y_true_test_df[col], y_pred_test_df[col])

    print("metrics_row:", metrics_row)

    # Appended per iteration so finished runs survive a failure further on
    row_df = pd.DataFrame([metrics_row])
    metrics_df_XGBOOST_3dof_dt_test_nb0_nf10_new_parralel = pd.concat(
        [metrics_df_XGBOOST_3dof_dt_test_nb0_nf10_new_parralel, row_df], ignore_index=True
    )

    


In [None]:
    # Base save folder
save_folder = "Xgboost/metrics_outpus/3dof/dt_test-new"
file_path = os.path.join(save_folder, "metrics_df_XGBOOST_3dof_dt_test_nb0_nf10_new_parralel.csv")

# Make sure the folder exists, then write the table without its index
os.makedirs(save_folder, exist_ok=True)
metrics_df_XGBOOST_3dof_dt_test_nb0_nf10_new_parralel.to_csv(file_path, index=False)

print(f"Saved metrics_df_XGBOOST_3dof_dt_test_nb0_nf10_new_parralel to: {file_path}")


## Sensitivity to data size


Parallel

In [None]:
# Fractions of the training case used in the data-size study
train_fractions = np.array([0.002, 0.004, 0.008, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1])

# Corresponding number of training rows (cast to int)
lenght = (train_fractions * len(df_case_train)).astype(int)

# Input and output columns of the model
input_cols = ['eta', 'eta_velocity', 'eta_acceleration']
output_cols = ['heave', 'pitch', 'pendulum']

# Lag orders passed to the feature builder as na / nb_past / nf_future
na, nb, nf = 2, 0, 10



In [None]:
# Read the stored hyperparameter sets from the JSON file
with open("Xgboost/hyperparameters/eta_Vel_acc/best_params_eta_Vel_acc.json", "r") as f:
    loaded_params = json.load(f)
print("Loaded params:", loaded_params)

# Collect the values of each hyperparameter across all stored sets
accumulator = defaultdict(list)
for params in loaded_params.values():
    for key, val in params.items():
        accumulator[key].append(val)

# Hyperparameters that are cast back to int after averaging
int_keys = {"max_depth", "n_estimators"}

# Mean of every hyperparameter first, then the type each one needs
hp_means = {hp_name: np.mean(hp_values) for hp_name, hp_values in accumulator.items()}
xgb_params = {
    hp_name: (int(np.round(hp_mean)) if hp_name in int_keys else float(hp_mean))
    for hp_name, hp_mean in hp_means.items()
}

print("Averaged parameters:", xgb_params)


In [None]:
# Create the base (single-output) XGBoost regressor from the averaged
# hyperparameters in xgb_params
base_model = XGBRegressor(**xgb_params)

In [None]:
# Arguments shared by the training and the testing feature construction
lag_build_kwargs = dict(
    input_cols=input_cols,
    output_cols=output_cols,
    scaler_X_func=scaler_X_func_all,
    scaler_y_func=scaler_y_func,
    na=na,
    nb_past=nb,
    nf_future=nf,
    test_name_col='test_name',
)

# Training set: 'zero' ➔ pad the initial lags with zeros
dfx_train, dfy_train, yi_train = build_arx_lagged_with_scalers(
    df=df_case_train, y_initial_mode='zero', **lag_build_kwargs
)
y_train_target = dfy_train[output_cols].reset_index(drop=True)
X_train_selected_df = dfx_train.drop(columns='test_name')

# Testing set: 'original' ➔ skip the initial rows
dfx_test, dfy_test, yi_test = build_arx_lagged_with_scalers(
    df=df_case_test, y_initial_mode='original', **lag_build_kwargs
)
y_test_target = dfy_test[output_cols].reset_index(drop=True)
X_test_selected_df = dfx_test.drop(columns='test_name')

# Column order of the training features, reused by the recursive predictor
X_feature_names = X_train_selected_df.columns.tolist()


In [None]:
# Empty table that the parallel-configuration data-size sweep fills row by row
metrics_df_Xgboost_3dof_data_test_new_parralel = pd.DataFrame()

In [None]:

# Data-size sensitivity study, parallel configuration: for every entry of
# `lenght` the model is refitted on the first `l` training rows and scored with
# a direct `model.predict` on the lagged features (no recursive feedback) for
# both that training subset and the full testing case.
#
# NOTE(review): `case` and `model_name` are not assigned in this cell; they
# come from whatever earlier cells left in the namespace — confirm.
for l in lenght:

    # Fresh multi-output wrapper for every training length
    model = MultiOutputRegressor(base_model)

    print(f'-----Preprocessing case of lenght {str(l)} ----')

    # First `l` rows of the lagged training set
    X_train_subset = X_train_selected_df[0:l]
    y_train_subset = y_train_target[0:l]

    print('-----Training model----')
    model.fit(X_train_subset, y_train_subset)

    print('-----predicting on training data----')
    y_pred_train_scaled = model.predict(X_train_subset)

    print('-----predicting on Testing data----')
    y_pred_test_scaled = model.predict(X_test_selected_df)

    # Inverse transform predictions and targets to the original scale
    y_pred_train = scaler_y.inverse_transform(y_pred_train_scaled)
    y_pred_test = scaler_y.inverse_transform(y_pred_test_scaled)
    y_pred_train_df = pd.DataFrame(y_pred_train, columns=output_cols)
    y_pred_test_df = pd.DataFrame(y_pred_test, columns=output_cols)

    # True values: same `l` training rows, full testing case
    y_true_train = scaler_y.inverse_transform(dfy_train[output_cols][0:l].reset_index(drop=True))
    y_true_train_df = pd.DataFrame(y_true_train, columns=output_cols)
    y_true_test = scaler_y.inverse_transform(dfy_test[output_cols].reset_index(drop=True))
    y_true_test_df = pd.DataFrame(y_true_test, columns=output_cols)

    # Identification of this run in the metrics table
    metrics_row = {
        'model_name': model_name,
        'na': na,
        'nb': nb,
        'nf': nf,
        'xcase': 3,
        'test case': case,
        'lenght': l,
    }

    print('-----Eavluating model----')
    if np.any(np.isnan(y_pred_train_df)) or np.any(np.isinf(y_pred_train_df)):
        print("[WARN] Train predictions have NaNs or infs.")

    for col in output_cols:
        metrics_row[f'r2_train_{col}'] = r2_score(y_true_train_df[col], y_pred_train_df[col])
        metrics_row[f'mse_train_{col}'] = mean_squared_error(y_true_train_df[col], y_pred_train_df[col])
        metrics_row[f'r2_test_{col}'] = r2_score(y_true_test_df[col], y_pred_test_df[col])
        metrics_row[f'mse_test_{col}'] = mean_squared_error(y_true_test_df[col], y_pred_test_df[col])

    print("metrics_row:", metrics_row)

    # Appended per iteration so finished runs survive a failure further on
    row_df = pd.DataFrame([metrics_row])
    metrics_df_Xgboost_3dof_data_test_new_parralel = pd.concat(
        [metrics_df_Xgboost_3dof_data_test_new_parralel, row_df], ignore_index=True
    )

    

In [None]:
# Destination of the parallel-configuration data-size metrics
save_folder = "Xgboost/metrics_outpus/3dof/final"
file_path = os.path.join(save_folder, "metrics_df_Xgboost_3dof_data_test_new_parralel_ver2.csv")

# Make sure the folder exists, then write the table without its index
os.makedirs(save_folder, exist_ok=True)
metrics_df_Xgboost_3dof_data_test_new_parralel.to_csv(file_path, index=False)

print(f"Saved metrics_df_Xgboost_3dof_data_test_new_parralel to: {file_path}")

Series configuration

In [None]:
# Empty table that the series-configuration data-size sweep fills row by row
metrics_df_Xgboost_3dof_data_test_series = pd.DataFrame()

In [None]:

# Data-size sensitivity study, series (recursive) configuration: for every
# entry of `lenght` the model is refitted on the first `l` training rows and
# scored with the recursive predictor on that training subset and on the full
# testing case.
#
# NOTE(review): `case` and `model_name` are not assigned in this cell; they
# come from whatever earlier cells left in the namespace — confirm.
for l in lenght:

    # Fresh multi-output wrapper for every training length
    model = MultiOutputRegressor(base_model)

    print(f'-----Preprocessing case of lenght {str(l)} ----')

    # First `l` rows of the lagged training set
    X_train_subset = X_train_selected_df[0:l]
    y_train_subset = y_train_target[0:l]

    print('-----Training model----')
    model.fit(X_train_subset, y_train_subset)

    print('-----predicting on training data----')
    y_pred_train_scaled, x_used_train = predict_recursive_series(
        model=model,
        X_df=X_train_subset,
        output_cols=output_cols,
        X_feature_names=X_feature_names,
        na=na,
    )

    print('-----predicting on Testing data----')
    y_pred_test_scaled, x_used_test = predict_recursive_series(
        model=model,
        X_df=X_test_selected_df,
        output_cols=output_cols,
        X_feature_names=X_feature_names,
        na=na,
    )

    # Inverse transform predictions and targets to the original scale
    y_pred_train = scaler_y.inverse_transform(y_pred_train_scaled)
    y_pred_test = scaler_y.inverse_transform(y_pred_test_scaled)
    y_pred_train_df = pd.DataFrame(y_pred_train, columns=output_cols)
    y_pred_test_df = pd.DataFrame(y_pred_test, columns=output_cols)

    # True values: same `l` training rows, full testing case
    y_true_train = scaler_y.inverse_transform(dfy_train[output_cols][0:l].reset_index(drop=True))
    y_true_train_df = pd.DataFrame(y_true_train, columns=output_cols)
    y_true_test = scaler_y.inverse_transform(dfy_test[output_cols].reset_index(drop=True))
    y_true_test_df = pd.DataFrame(y_true_test, columns=output_cols)

    # Identification of this run in the metrics table
    metrics_row = {
        'model_name': model_name,
        'na': na,
        'nb': nb,
        'nf': nf,
        'xcase': 3,
        'test case': case,
        'lenght': l,
    }

    print('-----Eavluating model----')
    if np.any(np.isnan(y_pred_train_df)) or np.any(np.isinf(y_pred_train_df)):
        print("[WARN] Train predictions have NaNs or infs.")

    for col in output_cols:
        metrics_row[f'r2_train_{col}'] = r2_score(y_true_train_df[col], y_pred_train_df[col])
        metrics_row[f'mse_train_{col}'] = mean_squared_error(y_true_train_df[col], y_pred_train_df[col])
        metrics_row[f'r2_test_{col}'] = r2_score(y_true_test_df[col], y_pred_test_df[col])
        metrics_row[f'mse_test_{col}'] = mean_squared_error(y_true_test_df[col], y_pred_test_df[col])

    print("metrics_row:", metrics_row)

    # Appended per iteration so finished runs survive a failure further on
    row_df = pd.DataFrame([metrics_row])
    metrics_df_Xgboost_3dof_data_test_series = pd.concat(
        [metrics_df_Xgboost_3dof_data_test_series, row_df], ignore_index=True
    )

    

In [None]:
# Destination of the series-configuration data-size metrics
save_folder = "Xgboost/metrics_outpus/3dof/final"
file_path = os.path.join(save_folder, "metrics_df_Xgboost_3dof_data_test_series_ver2.csv")

# Make sure the folder exists, then write the table without its index
os.makedirs(save_folder, exist_ok=True)
metrics_df_Xgboost_3dof_data_test_series.to_csv(file_path, index=False)

print(f"Saved metrics_df_Xgboost_3dof_data_test_series to: {file_path}")

## Testing The best model

In [None]:
# Load the previously saved best model from its joblib file.
# The file name carries the lag configuration of that model (na2, nb0, nf10).

model = joblib.load('Xgboost_3dof_ver2_extra_na2_nb0_nf10.joblib')
    

In [None]:
# Lag configuration of the selected best model
best_na, best_nb, best_nf = 2, 0, 10

# Input feature columns and predicted output columns used with that model
best_input_cols = ['eta', 'eta_velocity', 'eta_acceleration']
best_output_cols = ['heave', 'pitch', 'pendulum']

# Show every distinct test case present in the test set
test_cases = df_test_full['test_name'].unique()
print("All test cases:")
print(test_cases)


In [None]:
training_case = 'Tp6p8s_Hs2m'  # case flagged below as 'case used for training'
metrics_df_test = []           # receives one metrics dict per evaluated test case

In [None]:
# Evaluate the loaded best model on every test case, one case at a time.
# For each case: build the lagged ARX frames, predict recursively, map the
# predictions and targets back through `scaler_y`, compute per-output MSE / R²,
# and write the true and predicted series to CSV files under 'testing/'.

# DataFrame.to_csv does not create missing folders, so make sure the output
# folder exists before the first write (the other save cells do the same).
os.makedirs('testing', exist_ok=True)

cases = df_test_full['test_name'].unique()
for case in cases:
    # Rows belonging to this test case only
    df_case_test = df_test_full[df_test_full['test_name'] == case].reset_index(drop=True)

    # Build the lagged feature / target frames for this case
    dfx_test, dfy_test, yi_test = build_arx_lagged_with_scalers(
        df=df_case_test,
        input_cols=best_input_cols,
        output_cols=best_output_cols,
        scaler_X_func=scaler_X_func_all,
        scaler_y_func=scaler_y_func,
        na=best_na,
        nb_past=best_nb,
        nf_future=best_nf,
        test_name_col='test_name',
        y_initial_mode='original'  # 'original' ➔ skip initial rows, 'zero' ➔ pad lags with zeros
    )

    # Only the feature columns are passed on; the case label is dropped
    X_test_selected_df = dfx_test.drop(columns='test_name')

    # Feature names are taken from the prepared test frame (not from the model)
    X_feature_names = X_test_selected_df.columns.tolist()

    # Recursive prediction over the whole series
    y_pred_test_scaled, x_used_test = predict_recursive_series(
        model=model,
        X_df=X_test_selected_df,
        output_cols=best_output_cols,
        X_feature_names=X_feature_names,
        na=best_na
    )

    # Undo the output scaling for the predictions
    y_pred_test = scaler_y.inverse_transform(y_pred_test_scaled)
    y_pred_test_df = pd.DataFrame(y_pred_test, columns=best_output_cols)

    # Undo the output scaling for the targets, with a fresh 0..n-1 index
    y_true_test = scaler_y.inverse_transform(dfy_test[best_output_cols].reset_index(drop=True))
    y_true_test_df = pd.DataFrame(y_true_test, columns=best_output_cols)

    # Per-case metrics record
    metrics_test = {'test case': case}
    if case == training_case:
        metrics_test['Comments'] = 'case used for training'

    for col in best_output_cols:
        mse_test = mean_squared_error(y_true_test_df[col], y_pred_test_df[col])
        r2_test = r2_score(y_true_test_df[col], y_pred_test_df[col])

        metrics_test[f'r2_test_{col}'] = r2_test
        metrics_test[f'mse_test_{col}'] = mse_test

    # Save predictions and true values for this case
    y_true_test_df.to_csv(f'testing/XGB_true_{case}.csv', index=False)
    y_pred_test_df.to_csv(f'testing/XGB_pred_{case}.csv', index=False)
    print("metrics:", metrics_test)
    metrics_df_test.append(metrics_test)

In [None]:
# Save the per-case test metrics as CSV.
# `metrics_df_test` is filled as a plain list of dicts (one per test case) and a
# list has no `to_csv`, so it is converted to a DataFrame for writing. The list
# itself is left untouched so the evaluation loop can still append to it.
os.makedirs('testing', exist_ok=True)  # to_csv does not create missing folders
pd.DataFrame(metrics_df_test).to_csv('testing/XGB_metrics_df_test.csv', index=False)

# Uni-variate Models

The `input_cols` list and the X scaler function are adjusted according to the desired input-feature case.

## 1-Heave

### Finding Optimal Lags

In [None]:
# Empty result tables that the heave lag-search loop extends row by row
perf_df_Xgboost_heave_only_ALL_ver2 = pd.DataFrame()     # timing / memory figures
metrics_df_Xgboost_heave_only_ALL_ver2 = pd.DataFrame()  # R² / MSE scores



In [None]:
# Read the tuned XGBoost hyperparameters from JSON; the file is keyed by output name
with open("Xgboost/hyperparameters/eta_Vel_acc/best_params_eta_Vel_acc.json", "r") as f:
    loaded_params = json.loads(f.read())

print("Loaded params:", loaded_params['heave'])


In [None]:
# Search space for the heave-only lag search
input_cols = ['eta', 'eta_velocity', 'eta_acceleration']
output_cols = ['heave']
na_max = [2]               # values tried for na
nb_max = list(range(8))    # values tried for nb_past: 0..7
nf_max = list(range(11))   # values tried for nf_future: 0..10
best_params = loaded_params['heave']


In [None]:
# Lag-order search for the heave-only XGBoost model.
# For every (na, nb, nf) combination this cell trains a fresh XGBRegressor,
# predicts recursively on the first 10000 training rows and on the validation
# series, records accuracy and timing/memory figures, and saves the model.

# Base save folder
save_folder = 'Xgboost/saved_models/heave_only/extra'

# Create the directory if it doesn't exist
os.makedirs(save_folder, exist_ok=True)

# Training rows restricted to a single test case.
# NOTE(review): `case` is not assigned in this cell; it holds whatever value an
# earlier cell left behind (e.g. the last item of a previous `for case in ...`
# loop). Confirm it is the intended training case before running.
df_case_train=df_train_full[df_train_full['test_name']==case].reset_index(drop=True)

for na in na_max :
    for nb in nb_max :
      for nf in nf_max:
      

        # Model name; the lag orders are encoded as "_na<A>_nb<B>_nf<C>"
        model_name = F'Xgboost_heave_eta_only'  + '_na' + str(na) + '_nb' + str(nb) + '_nf' + str(nf)


        # Create the XGBoost regressor from the loaded hyperparameters
        model = XGBRegressor(**best_params)
            
        # prepare training data 
        print(f'-----Preprocessing case of na={str(na)} ,nb={str(nb)} and nf={str(nf)} ----')
    
       
        dfx_train,dfy_train,yi_train = build_arx_lagged_with_scalers(
                df = df_case_train,
                input_cols  = input_cols,
                output_cols   = output_cols,
                scaler_X_func   = scaler_X_func_all ,
                scaler_y_func   = scaler_y_func_heave,
                na=na,
                nb_past=nb,
                nf_future=nf,
                test_name_col='test_name',
                y_initial_mode='zero'  # 'original' ➔ skip initial rows, 'zero' ➔ pad lags with zeros
        )
            
        y_train_target = dfy_train[output_cols].reset_index(drop=True)

        # Only feature columns are fed to the model; the case label is dropped
        X_train=dfx_train.drop(columns='test_name')

        # prepare validation data (same `case` as the training subset)
        df_case_val=df_val_full[df_val_full['test_name']==case].reset_index(drop=True)

        dfx_val,dfy_val,yi_val = build_arx_lagged_with_scalers(
                df = df_case_val,
                input_cols  = input_cols,
                output_cols   = output_cols,
                scaler_X_func   = scaler_X_func_all,
                scaler_y_func   = scaler_y_func_heave,
                na=na,
                nb_past=nb,
                nf_future=nf,
                test_name_col='test_name',
                y_initial_mode='original'  # 'original' ➔ skip initial rows, 'zero' ➔ pad lags with zeros
        )
        
        y_val_target = dfy_val[output_cols].reset_index(drop=True)

        X_val=dfx_val.drop(columns='test_name')
      
        # ============================
        # MEASURE CPU & MEMORY USAGE
        # ============================
        print(f'-----Training model----')
        process = psutil.Process()

        # Train the model; train_time covers the fit call only
        start_time = time.perf_counter()
        model.fit(X_train  ,y_train_target )
        train_time = time.perf_counter() - start_time
        # Resident set size of this process right after fitting, in MB
        memory_usage_train = process.memory_info().rss / (1024 * 1024)  # MB


        # get feature names
        X_feature_names = X_train.columns.tolist()


        print(f'-----predicting on training data----')

        # Predict on train and validation data.
        # The timer started here is stopped after both recursive predictions
        # AND both inverse transforms, so predict_time includes all four steps.
        start_time = time.perf_counter()

        # Only the first 10000 training rows are predicted recursively
        y_pred_train_scaled,x_used_train = predict_recursive_series(
        model=model,
        X_df=X_train[0:10000],
        output_cols=output_cols,
        X_feature_names=X_feature_names,
        na=na
            )

        print(f'-----predicting on validation data----')
        
        y_pred_val_scaled , x_used_val=predict_recursive_series(
        model=model,
        X_df=X_val,
        output_cols=output_cols,
        X_feature_names=X_feature_names,
        na=na
            )



        # Inverse transform to original scale
        y_pred_train = scaler_y_heave.inverse_transform(y_pred_train_scaled)
        y_pred_val = scaler_y_heave.inverse_transform(y_pred_val_scaled)


        predict_time = time.perf_counter() - start_time
        # Resident set size of this process after prediction, in MB
        memory_usage_predict = process.memory_info().rss / (1024 * 1024)  # MB

        # Convert predictions to DataFrames for easier handling
        y_pred_train_df = pd.DataFrame(y_pred_train, columns=output_cols)
        y_pred_val_df = pd.DataFrame(y_pred_val, columns=output_cols)


        # True values on the original scale; the train targets use the same
        # first-10000-rows slice as the train predictions above
        y_true_train =scaler_y_heave.inverse_transform(dfy_train[output_cols][0:10000].reset_index(drop=True))
        y_true_train_df = pd.DataFrame(y_true_train, columns=output_cols)

        y_true_val = scaler_y_heave.inverse_transform(dfy_val[output_cols].reset_index(drop=True))
        y_true_val_df = pd.DataFrame(y_true_val, columns=output_cols)



        # save model info to dfs
        metrics_row = {
        'model_name': model_name,
        'na': na,
        'nb': nb,
        'nf': nf,
        'test case': case,
        }

        perf_row = {
        'model_name': model_name,
        'na': na,
        'nb': nb,
        'nf': nf,
        'train_time': train_time,
        'train_memory_MB': memory_usage_train,
        'predict_time': predict_time,
        'predict_memory_MB': memory_usage_predict
            }
        print(f'-----Eavluating model----')
        # Warn (without stopping) if the train predictions contain NaN or inf
        if np.any(np.isnan(y_pred_train_df)) or np.any(np.isinf(y_pred_train_df)):
            print("[WARN] Train predictions have NaNs or infs.")

        # Per-output MSE and R² on the train slice and on the validation series
        for col in output_cols:
            mse_train = mean_squared_error(y_true_train_df[col], y_pred_train_df[col])
            r2_train = r2_score(y_true_train_df[col], y_pred_train_df[col])

            mse_val = mean_squared_error(y_true_val_df[col], y_pred_val_df[col])
            r2_val = r2_score(y_true_val_df[col], y_pred_val_df[col])

            # Add to metrics row
            metrics_row[f'r2_train_{col}'] = r2_train
            metrics_row[f'mse_train_{col}'] = mse_train
            metrics_row[f'r2_val_{col}'] = r2_val
            metrics_row[f'mse_val_{col}'] = mse_val

        print("metrics_row:", metrics_row)
        print("perf_row:", perf_row)
        # Append the row dictionaries as new rows in the DataFrames
        metrics_df_Xgboost_heave_only_ALL_ver2 = pd.concat([metrics_df_Xgboost_heave_only_ALL_ver2, pd.DataFrame([metrics_row])], ignore_index=True)
        perf_df_Xgboost_heave_only_ALL_ver2 = pd.concat([perf_df_Xgboost_heave_only_ALL_ver2, pd.DataFrame([perf_row])], ignore_index=True)
        print(f'-----Saving model----')
        # Save the trained model as "<model_name>.joblib" inside save_folder
        model_save_name = f"{model_name}.joblib"  # You already have model_name variable!
        model_save_path = os.path.join(save_folder, model_save_name)

        # Save with joblib
        joblib.dump(model, model_save_path) 




In [None]:
# Save the heave-only lag-search results (metrics table and performance table) as CSV.
save_folder = "Xgboost/metrics_outpus/heave_only"

# Create the folder if it doesn't exist
os.makedirs(save_folder, exist_ok=True)

# Metrics table
file_path = os.path.join(save_folder, "metrics_df_Xgboost_heave_only_ALL_ver2.csv")
metrics_df_Xgboost_heave_only_ALL_ver2.to_csv(file_path, index=False)

# The message names the table that is actually written; it previously referred
# to "..._ver2_extra_2", which did not match the saved object or file.
print(f"Saved metrics_df_Xgboost_heave_only_ALL_ver2 to: {file_path}")

# Performance (timing / memory) table
file_path = os.path.join(save_folder, "perf_df_Xgboost_heave_only_ALL_ver2.csv")
perf_df_Xgboost_heave_only_ALL_ver2.to_csv(file_path, index=False)

print(f"Saved perf_df_Xgboost_heave_only_ALL_ver2 to: {file_path}")



### Plotting Data From Remote Server

- ver 1 = eta only as input feature
- ver 2 = eta + eta velocity + eta acceleration as input features

In [None]:
# Load the heave-only metrics tables (ver1, ver2 and the two ver2 "extra" runs).
# Paths use forward slashes only. The previous mix of "/" with a single
# backslash before "heave_only" formed an invalid escape sequence in a normal
# string literal and resolved as a path separator on Windows only.
metrics_df_Xgboost_heave_only_ALL_ver1 = pd.read_csv('Xgboost/R_M/metrics_outpus/heave_only/metrics_df_Xgboost_heave_only_ALL_ver1.csv')
metrics_df_Xgboost_heave_only_ALL_ver2 = pd.read_csv('Xgboost/R_M/metrics_outpus/heave_only/metrics_df_Xgboost_heave_only_ALL_ver2.csv')
metrics_df_Xgboost_heave_only_ALL_ver2_extra = pd.read_csv('Xgboost/R_M/metrics_outpus/heave_only/metrics_df_Xgboost_heave_only_ALL_ver2_extra.csv')
metrics_df_Xgboost_heave_only_ALL_ver2_extra_2 = pd.read_csv('Xgboost/R_M/metrics_outpus/heave_only/metrics_df_Xgboost_heave_only_ALL_ver2_extra_2.csv')

# Stack the three ver2 tables into one frame with a fresh 0..n-1 index
metrics_df_Xgboost_heave_only_ALL_ver2_final = pd.concat(
    [
        metrics_df_Xgboost_heave_only_ALL_ver2,
        metrics_df_Xgboost_heave_only_ALL_ver2_extra,
        metrics_df_Xgboost_heave_only_ALL_ver2_extra_2,
    ],
    ignore_index=True,
)

In [None]:
# R² curves for the heave model with eta, eta velocity and eta acceleration as
# input features: one curve per nb value, plotted against nd = -nf, with the
# train scores in the left panel and the validation scores in the right panel.
target_var = 'heave'

# Work on a copy restricted to the na = 2 runs
df_plot = metrics_df_Xgboost_heave_only_ALL_ver2_final.copy()
df_plot = df_plot[df_plot['na'] == 2]

# Each nb value becomes a legend entry with its own colour (palette is cycled)
unique_nb = sorted(df_plot['nb'].unique())
color_map = {
    nb: px.colors.qualitative.Plotly[i % len(px.colors.qualitative.Plotly)]
    for i, nb in enumerate(unique_nb)
}

# Two side-by-side panels with independent y axes
fig = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=["R² Train", "R² Val"],
    shared_yaxes=False,
)

for nb_val in unique_nb:
    # Rows of this nb value, ordered by nf, with the derived column nd = -nf
    df_nb = (
        df_plot[df_plot['nb'] == nb_val]
        .copy()
        .sort_values(by='nf')
        .assign(nd=lambda frame: -frame['nf'])
    )
    color = color_map[nb_val]

    # The nb = 0 curve is emphasised with a thicker line and larger star markers
    line_width, marker_size, marker_symbol = (4, 12, 'star') if nb_val == 0 else (2, 8, 'circle')

    # Train R² trace (left panel); this one carries the legend entry
    train_trace = go.Scatter(
        x=df_nb['nd'],
        y=df_nb[f'r2_train_{target_var}'],
        mode='lines+markers',
        name=f'nb={nb_val}',
        legendgroup=f'nb={nb_val}',
        line=dict(color=color, width=line_width),
        marker=dict(symbol=marker_symbol, size=marker_size, color=color),
    )
    fig.add_trace(train_trace, row=1, col=1)

    # Validation R² trace (right panel); same legend group, hidden from the legend
    val_trace = go.Scatter(
        x=df_nb['nd'],
        y=df_nb[f'r2_val_{target_var}'],
        mode='lines+markers',
        name=f'nb={nb_val}',
        legendgroup=f'nb={nb_val}',
        showlegend=False,
        line=dict(color=color, width=line_width),
        marker=dict(symbol=marker_symbol, size=marker_size, color=color),
    )
    fig.add_trace(val_trace, row=1, col=2)

# Titles, axis labels and figure size
fig.update_layout(
    title_text=("R² Performance of One XGBoost Model for Each DOF<br>η,  η̇ , and η̈  as Input Features, na = 2 (Heave)"),
    xaxis_title="nd ",
    xaxis2_title="nd",
    yaxis_title="R² Score",
    template="plotly_white",
    height=460,
    width=1000,
    legend_title="nb"
)

# Reverse the x axis of the left panel, then of the right panel
for panel_col in (1, 2):
    fig.update_xaxes(autorange="reversed", row=1, col=panel_col)
fig.show()



In [None]:
# R² curves for the heave model with eta only as the input feature (ver 1):
# one curve per nb value, plotted against nd = -nf, with the train scores in
# the left panel and the validation scores in the right panel.

# Set target output variable
target_var = 'heave'

# Use the ver1 (eta-only) metrics table. This cell previously plotted the
# ver2_final table, i.e. the same data as the three-feature figure, which
# contradicted its own "η Only" title.
df_plot = metrics_df_Xgboost_heave_only_ALL_ver1.copy()

# Filter for only na = 2
df_plot = df_plot[df_plot['na'] == 2]

# Unique nb values (legend entries), each with its own colour (palette is cycled)
unique_nb = sorted(df_plot['nb'].unique())
color_map = {nb: px.colors.qualitative.Plotly[i % len(px.colors.qualitative.Plotly)] for i, nb in enumerate(unique_nb)}

# Create subplots (Train and Validation)
fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=["R² Train", "R² Val"],
    shared_yaxes=False
)

# Loop through nb values to plot curves vs nd
for nb_val in unique_nb:
    df_nb = df_plot[df_plot['nb'] == nb_val].copy().sort_values(by='nf')
    color = color_map[nb_val]

    # Compute nd = -nf
    df_nb['nd'] = -df_nb['nf']

    # Add train R² trace (left panel); carries the legend entry
    fig.add_trace(
        go.Scatter(
            x=df_nb['nd'],
            y=df_nb[f'r2_train_{target_var}'],
            mode='lines+markers',
            name=f'nb={nb_val}',
            legendgroup=f'nb={nb_val}',
            line=dict(color=color)
        ),
        row=1, col=1
    )

    # Add val R² trace (right panel); same legend group, hidden from the legend
    fig.add_trace(
        go.Scatter(
            x=df_nb['nd'],
            y=df_nb[f'r2_val_{target_var}'],
            mode='lines+markers',
            name=f'nb={nb_val}',
            legendgroup=f'nb={nb_val}',
            showlegend=False,
            line=dict(color=color)
        ),
        row=1, col=2
    )

# Layout tweaks
fig.update_layout(
    title_text=("R² Performance of One XGBoost Model for Each DOF<br>η Only as an Input Feature, na = 2 (Heave)"),
    xaxis_title="nd ",
    xaxis2_title="nd",
    yaxis_title="R² Score",
    template="plotly_white",
    height=460,
    width=1000,
    legend_title="nb"
)
# Reverse the x axis of both panels
fig.update_xaxes(autorange="reversed", row=1, col=1)
fig.update_xaxes(autorange="reversed", row=1, col=2)
fig.show()



### loading models to calculate full R^2 score

In [None]:
# Folder holding the selected heave models
folder = 'Xgboost/saved_models/heave_only/best_combination'

# Names of the saved model files (*.joblib) found in that folder
model_files = list(filter(lambda name: name.endswith('.joblib'), os.listdir(folder)))

# Map each file name to the model loaded from it
models = {}
for fname in model_files:
    model_path = os.path.join(folder, fname)
    model = joblib.load(model_path)
    models.update({fname: model})


In [None]:
# Empty table that collects one row of training metrics per loaded model
train_metrics_df_Xgboost_heave = pd.DataFrame()

In [None]:
# Re-score every loaded heave model on the full training series (train R² / MSE)
# and append one row per model to `train_metrics_df_Xgboost_heave`.
input_cols = ['eta', 'eta_velocity', 'eta_acceleration']
output_cols = ['heave']

# Lag orders are encoded in each model file name as "na<A>_nb<B>_nf<C>".
# The pattern does not change between iterations, so it is defined once.
pattern = r'na(\d+)_nb(\d+)_nf(\d+)'

for fname, model in models.items():
    print(f"Filename: {fname}")

    # Recover na, nb, nf from the file name
    match = re.search(pattern, fname)
    if match is None:
        # re.search returns None when the tag is absent; calling .groups() on
        # it would abort the whole loop, so report the file and move on.
        print(f"[WARN] Skipping {fname}: no 'na<A>_nb<B>_nf<C>' tag in the file name.")
        continue
    na, nb, nf = map(int, match.groups())
    print(f"{fname} ➤ na: {na}, nb: {nb}, nf: {nf}")

    # Build the lagged training frames with this model's lag orders
    dfx_train, dfy_train, yi_train = build_arx_lagged_with_scalers(
        df=df_case_train,
        input_cols=input_cols,
        output_cols=output_cols,
        scaler_X_func=scaler_X_func_all,
        scaler_y_func=scaler_y_func_heave,
        na=na,
        nb_past=nb,
        nf_future=nf,
        test_name_col='test_name',
        y_initial_mode='zero'  # 'original' ➔ skip initial rows, 'zero' ➔ pad lags with zeros
    )

    y_train_target = dfy_train[output_cols].reset_index(drop=True)

    # Only feature columns are passed to the model; the case label is dropped
    X_train = dfx_train.drop(columns='test_name')
    X_feature_names = X_train.columns.tolist()

    # Recursive prediction over the whole training series
    y_pred_train_scaled, x_used_train = predict_recursive_series(
        model=model,
        X_df=X_train,
        output_cols=output_cols,
        X_feature_names=X_feature_names,
        na=na
    )

    # Map predictions and targets back through the heave output scaler
    y_pred_train = scaler_y_heave.inverse_transform(y_pred_train_scaled)
    y_pred_train_df = pd.DataFrame(y_pred_train, columns=output_cols)
    y_true_train = scaler_y_heave.inverse_transform(dfy_train[output_cols].reset_index(drop=True))
    y_true_train_df = pd.DataFrame(y_true_train, columns=output_cols)

    # One result row per model; nd is stored as the negated nf
    metrics_row = {
        'na': na,
        'nb': nb,
        'nd': -1*nf,
    }

    for col in output_cols:
        mse_train = mean_squared_error(y_true_train_df[col], y_pred_train_df[col])
        r2_train = r2_score(y_true_train_df[col], y_pred_train_df[col])

        # Add to metrics row
        metrics_row[f'r2_train_{col}'] = r2_train
        metrics_row[f'mse_train_{col}'] = mse_train

    train_metrics_df_Xgboost_heave = pd.concat([train_metrics_df_Xgboost_heave, pd.DataFrame([metrics_row])], ignore_index=True)


In [None]:
# save metrics DataFrame
train_metrics_df_Xgboost_heave.to_csv('Xgboost/metrics_outpus/heave_only/train_metrics_df_Xgboost_heave_full.csv', index=False)

### Testing The selected Models

In [None]:
# Settings for testing the selected heave models
output_cols = ['heave']
input_cols = ['eta', 'eta_velocity', 'eta_acceleration']
training_case = 'Tp6p8s_Hs2m'  # case flagged in the results as 'case used for training'

In [None]:
# Folder that receives the test results of the selected heave models.
# NOTE(review): this path starts with 'XGboost' (capital G) while the other
# folders in this notebook start with 'Xgboost'; on a case-sensitive file system
# these are two different directories. Confirm which one is intended.
save_folder = 'XGboost/results/heave_model/test_best_models'  # you can change this to any directory you want
os.makedirs(save_folder, exist_ok=True)  # create it (and its parents) if missing

In [None]:
# Folder holding the selected heave models
folder = 'Xgboost/saved_models/heave_only/best_combination'

# Names of the saved model files (*.joblib) found in that folder
model_files = list(filter(lambda name: name.endswith('.joblib'), os.listdir(folder)))

# Map each file name to the model loaded from it
models = {}
for fname in model_files:
    model_path = os.path.join(folder, fname)
    model = joblib.load(model_path)
    models.update({fname: model})

In [None]:
# Evaluate every loaded heave model on every test case.
# Per model: one metrics DataFrame (a row per test case) is stored, displayed
# and written to CSV, together with the true / predicted series of each case.
all_models_metrics = {}  # file name -> metrics DataFrame of that model
y_true_pred_dict = {}    # file name -> {case: {'y_true': df, 'y_pred': df}}

# Loop invariants: the file-name pattern for the lag orders ("na<A>_nb<B>_nf<C>")
# and the list of test cases do not depend on the model.
pattern = r'na(\d+)_nb(\d+)_nf(\d+)'
cases = df_test_full['test_name'].unique()

for fname, model in models.items():
    print(f"Filename: {fname}")

    # Extract na, nb, nf from the file name
    match = re.search(pattern, fname)
    if match is None:
        # re.search returns None when the tag is absent; calling .groups() on
        # it would abort the whole evaluation, so report the file and move on.
        print(f"[WARN] Skipping {fname}: no 'na<A>_nb<B>_nf<C>' tag in the file name.")
        continue
    na, nb, nf = map(int, match.groups())
    print(f"{fname} ➤ na: {na}, nb: {nb}, nf: {nf}")

    metrics_df_test_all = []  # Per-model metrics list (turned into a DataFrame below)
    model_y_true_pred = {}    # Per-model true/predicted dict

    for case in cases:
        # Rows belonging to this test case only
        df_case_test = df_test_full[df_test_full['test_name'] == case].reset_index(drop=True)

        # Prepare the test data with this model's lag orders
        dfx_test, dfy_test, yi_test = build_arx_lagged_with_scalers(
            df=df_case_test,
            input_cols=input_cols,
            output_cols=output_cols,
            scaler_X_func=scaler_X_func_all,
            scaler_y_func=scaler_y_func_heave,
            na=na,
            nb_past=nb,
            nf_future=nf,
            test_name_col='test_name',
            y_initial_mode='original'
        )

        # Only feature columns are passed to the model; the case label is dropped
        X_test_selected_df = dfx_test.drop(columns='test_name')
        X_feature_names = X_test_selected_df.columns.tolist()

        # Recursive prediction over the whole series
        y_pred_test_scaled, x_used_test = predict_recursive_series(
            model=model,
            X_df=X_test_selected_df,
            output_cols=output_cols,
            X_feature_names=X_feature_names,
            na=na
        )

        # Map predictions and targets back through the heave output scaler
        y_pred_test = scaler_y_heave.inverse_transform(y_pred_test_scaled)
        y_pred_test_df = pd.DataFrame(y_pred_test, columns=output_cols)

        y_true_test = scaler_y_heave.inverse_transform(dfy_test[output_cols].reset_index(drop=True))
        y_true_test_df = pd.DataFrame(y_true_test, columns=output_cols)

        # Keep predictions and ground truth for this case
        model_y_true_pred[case] = {
            'y_true': y_true_test_df,
            'y_pred': y_pred_test_df
        }

        # Metrics record for this (model, case) pair
        metrics_test = {
            'model': fname,
            'na': na,
            'nb': nb,
            'nf': nf,
            'test case': case
        }

        if case == training_case:
            metrics_test['Comments'] = 'case used for training'

        for col in output_cols:
            mse_test = mean_squared_error(y_true_test_df[col], y_pred_test_df[col])
            r2_test = r2_score(y_true_test_df[col], y_pred_test_df[col])

            metrics_test[f'r2_test_{col}'] = r2_test
            metrics_test[f'mse_test_{col}'] = mse_test

        metrics_df_test_all.append(metrics_test)

    # Store per-model data
    metrics_df_test_all = pd.DataFrame(metrics_df_test_all)
    all_models_metrics[fname] = metrics_df_test_all
    y_true_pred_dict[fname] = model_y_true_pred

    # Show the metrics table in the notebook
    print(f"\nMetrics for model {fname}:")
    display(metrics_df_test_all)

    # Save the metrics DataFrame for this model
    model_save_path = os.path.join(save_folder, f'metrics_{fname}.csv')
    metrics_df_test_all.to_csv(model_save_path, index=False)

    # Save the true and predicted series of every case for this model
    for case, data in model_y_true_pred.items():
        y_true_path = os.path.join(save_folder, f'{fname}_case_{case}_y_true.csv')
        y_pred_path = os.path.join(save_folder, f'{fname}_case_{case}_y_pred.csv')

        data['y_true'].to_csv(y_true_path, index=False)
        data['y_pred'].to_csv(y_pred_path, index=False)

## 2-Pitch

### Finding Optimal Lags

In [None]:
# Folder that receives the trained pitch models; created on demand
save_folder = 'Xgboost/saved_models'
os.makedirs(save_folder, exist_ok=True)

In [None]:
# Empty result tables that the pitch lag-search loop extends row by row
perf_df_Xgboost_pitch_only = pd.DataFrame()     # timing / memory figures
metrics_df_Xgboost_pitch_only = pd.DataFrame()  # R² / MSE scores

In [None]:
# Read the tuned XGBoost hyperparameters from JSON; the file is keyed by output name
with open("Xgboost/hyperparameters/eta_Vel_acc/best_params_eta_Vel_acc.json", "r") as f:
    loaded_params = json.loads(f.read())

print("Loaded params:", loaded_params['pitch'])

In [None]:
# Search space for the pitch-only lag search
input_cols = ['eta', 'eta_velocity', 'eta_acceleration']
output_cols = ['pitch']
na_max = [2]                  # values tried for na
nb_max = list(range(9, 12))   # values tried for nb_past: 9, 10, 11
nf_max = list(range(12))      # values tried for nf_future: 0..11
best_params = loaded_params['pitch']

In [None]:

# Lag-order search for the pitch-only XGBoost model.
# For every (na, nb, nf) combination this loop trains a fresh XGBRegressor,
# predicts recursively on the first 10000 training rows and on the validation
# series, records accuracy and timing/memory figures, and saves the model.
# NOTE(review): `df_case_train` and `case` are not assigned in this cell; both
# come from earlier cells. Confirm they refer to the intended training case.
for na in na_max :
    for nb in nb_max :
      for nf in nf_max:
      

        # Model name; the lag orders are encoded as "_na<A>_nb<B>_nf<C>".
        # NOTE(review): the prefix ends with "_" and the next part starts with
        # "_", so the saved name contains "ver1__na"; it is also labelled
        # "ver1" although three input columns are configured above — verify.
        model_name = F'Xgboost_pitch_ver1_'  + '_na' + str(na) + '_nb' + str(nb) + '_nf' + str(nf)


        # Create the XGBoost regressor from the loaded hyperparameters
        model = XGBRegressor(**best_params)
            
        # prepare training data 
        print(f'-----Preprocessing case of na={str(na)} ,nb={str(nb)} and nf={str(nf)} ----')
    
       
        dfx_train,dfy_train,yi_train = build_arx_lagged_with_scalers(
                df = df_case_train,
                input_cols  = input_cols,
                output_cols   = output_cols,
                scaler_X_func   = scaler_X_func_all ,
                scaler_y_func   = scaler_y_func_pitch,
                na=na,
                nb_past=nb,
                nf_future=nf,
                test_name_col='test_name',
                y_initial_mode='zero'  # 'original' ➔ skip initial rows, 'zero' ➔ pad lags with zeros
        )
            
        y_train_target = dfy_train[output_cols].reset_index(drop=True)

        # Only feature columns are fed to the model; the case label is dropped
        X_train=dfx_train.drop(columns='test_name')

        # prepare validation data
        df_case_val=df_val_full[df_val_full['test_name']==case].reset_index(drop=True)

        dfx_val,dfy_val,yi_val = build_arx_lagged_with_scalers(
                df = df_case_val,
                input_cols  = input_cols,
                output_cols   = output_cols,
                scaler_X_func   = scaler_X_func_all,
                scaler_y_func   = scaler_y_func_pitch,
                na=na,
                nb_past=nb,
                nf_future=nf,
                test_name_col='test_name',
                y_initial_mode='original'  # 'original' ➔ skip initial rows, 'zero' ➔ pad lags with zeros
        )
        
        y_val_target = dfy_val[output_cols].reset_index(drop=True)

        X_val=dfx_val.drop(columns='test_name')
      
        # ============================
        # MEASURE CPU & MEMORY USAGE
        # ============================
        print(f'-----Training model----')
        process = psutil.Process()

        # Train the model; train_time covers the fit call only
        start_time = time.perf_counter()
        model.fit(X_train  ,y_train_target )
        train_time = time.perf_counter() - start_time
        # Resident set size of this process right after fitting, in MB
        memory_usage_train = process.memory_info().rss / (1024 * 1024)  # MB


        # get feature names
        X_feature_names = X_train.columns.tolist()


        print(f'-----predicting on training data----')

        # Predict on train and validation data.
        # The timer started here is stopped after both recursive predictions
        # AND both inverse transforms, so predict_time includes all four steps.
        start_time = time.perf_counter()

        # Only the first 10000 training rows are predicted recursively
        y_pred_train_scaled,x_used_train = predict_recursive_series(
        model=model,
        X_df=X_train[0:10000],
        output_cols=output_cols,
        X_feature_names=X_feature_names,
        na=na
            )

        print(f'-----predicting on validation data----')
        
        y_pred_val_scaled , x_used_val=predict_recursive_series(
        model=model,
        X_df=X_val,
        output_cols=output_cols,
        X_feature_names=X_feature_names,
        na=na
            )



        # Inverse transform to original scale
        y_pred_train = scaler_y_pitch.inverse_transform(y_pred_train_scaled)
        y_pred_val = scaler_y_pitch.inverse_transform(y_pred_val_scaled)


        predict_time = time.perf_counter() - start_time
        # Resident set size of this process after prediction, in MB
        memory_usage_predict = process.memory_info().rss / (1024 * 1024)  # MB

        # Convert predictions to DataFrames for easier handling
        y_pred_train_df = pd.DataFrame(y_pred_train, columns=output_cols)
        y_pred_val_df = pd.DataFrame(y_pred_val, columns=output_cols)


        # True values on the original scale; the train targets use the same
        # first-10000-rows slice as the train predictions above
        y_true_train =scaler_y_pitch.inverse_transform(dfy_train[output_cols][0:10000].reset_index(drop=True))
        y_true_train_df = pd.DataFrame(y_true_train, columns=output_cols)

        y_true_val = scaler_y_pitch.inverse_transform(dfy_val[output_cols].reset_index(drop=True))
        y_true_val_df = pd.DataFrame(y_true_val, columns=output_cols)



        # save model info to dfs
        metrics_row = {
        'model_name': model_name,
        'na': na,
        'nb': nb,
        'nf': nf,
        'test case': case,
        }

        perf_row = {
        'model_name': model_name,
        'na': na,
        'nb': nb,
        'nf': nf,
        'train_time': train_time,
        'train_memory_MB': memory_usage_train,
        'predict_time': predict_time,
        'predict_memory_MB': memory_usage_predict
            }
        print(f'-----Eavluating model----')
        # Warn (without stopping) if the train predictions contain NaN or inf
        if np.any(np.isnan(y_pred_train_df)) or np.any(np.isinf(y_pred_train_df)):
            print("[WARN] Train predictions have NaNs or infs.")

        # Per-output MSE and R² on the train slice and on the validation series
        for col in output_cols:
            mse_train = mean_squared_error(y_true_train_df[col], y_pred_train_df[col])
            r2_train = r2_score(y_true_train_df[col], y_pred_train_df[col])

            mse_val = mean_squared_error(y_true_val_df[col], y_pred_val_df[col])
            r2_val = r2_score(y_true_val_df[col], y_pred_val_df[col])

            # Add to metrics row
            metrics_row[f'r2_train_{col}'] = r2_train
            metrics_row[f'mse_train_{col}'] = mse_train
            metrics_row[f'r2_val_{col}'] = r2_val
            metrics_row[f'mse_val_{col}'] = mse_val

        print("metrics_row:", metrics_row)
        print("perf_row:", perf_row)
        # Append the row dictionaries as new rows in the DataFrames
        metrics_df_Xgboost_pitch_only = pd.concat([metrics_df_Xgboost_pitch_only, pd.DataFrame([metrics_row])], ignore_index=True)
        perf_df_Xgboost_pitch_only = pd.concat([perf_df_Xgboost_pitch_only, pd.DataFrame([perf_row])], ignore_index=True)
        print(f'-----Saving model----')
        # Save the trained model as "<model_name>.joblib" inside save_folder
        model_save_name = f"{model_name}.joblib"  # You already have model_name variable!
        model_save_path = os.path.join(save_folder, model_save_name)

        # Save with joblib
        joblib.dump(model, model_save_path) 




In [None]:
# Write the pitch-only lag-search results (metrics and performance tables) to CSV.
save_folder = "Xgboost/metrics_outpus/pitch_only"
os.makedirs(save_folder, exist_ok=True)  # no-op when the folder already exists

# Metrics table
file_path = os.path.join(save_folder, "metrics_df_Xgboost_pitch_only.csv")
metrics_df_Xgboost_pitch_only.to_csv(file_path, index=False)
print("Saved metrics_df_Xgboost_pitch_only to: " + file_path)

# Performance (timing / memory) table; file_path is reused for the second file
file_path = os.path.join(save_folder, "perf_df_Xgboost_pitch_only.csv")
perf_df_Xgboost_pitch_only.to_csv(file_path, index=False)
print("Saved perf_df_Xgboost_pitch_only to: " + file_path)



### Plotting Data From Remote Server

- ver 1 = eta only as input feature
- ver 2 = eta + eta velocity + eta acceleration as input features

In [None]:
# Read the pitch-only metrics tables: ver1, ver2 and the four ver2 "extra" runs
metrics_df_Xgboost_pitch_only_ver1 = pd.read_csv(
    'Xgboost/R_M/metrics_outpus/pitch_only/metrics_df_Xgboost_pitch_only_ver1.csv'
)
metrics_df_Xgboost_pitch_only_ver2 = pd.read_csv(
    'Xgboost/R_M/metrics_outpus/pitch_only/metrics_df_Xgboost_pitch_only_ver2.csv'
)
metrics_df_Xgboost_pitch_only_ver2_extra = pd.read_csv(
    'Xgboost/R_M/metrics_outpus/pitch_only/metrics_df_Xgboost_pitch_only_ver2_extra.csv'
)
metrics_df_Xgboost_pitch_only_ver2_extra_2 = pd.read_csv(
    'Xgboost/R_M/metrics_outpus/pitch_only/metrics_df_Xgboost_pitch_only_ver2_extra_2.csv'
)
metrics_df_Xgboost_pitch_only_ver2_extra_3 = pd.read_csv(
    'Xgboost/R_M/metrics_outpus/pitch_only/metrics_df_Xgboost_pitch_only_ver2_extra_3.csv'
)
metrics_df_Xgboost_pitch_only_ver2_extra_4 = pd.read_csv(
    'Xgboost/R_M/metrics_outpus/pitch_only/metrics_df_Xgboost_pitch_only_ver2_extra_4.csv'
)

# Stack the five ver2 tables (ver1 is kept separate) with a fresh 0..n-1 index
metrics_df_Xgboost_pitch_only_ver2_final = pd.concat(
    [
        metrics_df_Xgboost_pitch_only_ver2,
        metrics_df_Xgboost_pitch_only_ver2_extra,
        metrics_df_Xgboost_pitch_only_ver2_extra_2,
        metrics_df_Xgboost_pitch_only_ver2_extra_3,
        metrics_df_Xgboost_pitch_only_ver2_extra_4,
    ],
    ignore_index=True,
)

In [None]:
# R² vs. nd curves for the pitch models that use eta as the only input feature (ver 1).

target_var = 'pitch'

# Restrict the table to the runs with na = 2
df_plot = metrics_df_Xgboost_pitch_only_ver1.copy()
df_plot = df_plot[df_plot['na'] == 2]

# Each nb value becomes one legend entry with its own colour (the palette wraps around)
unique_nb = sorted(df_plot['nb'].unique())
palette = px.colors.qualitative.Plotly
color_map = {nb: palette[i % len(palette)] for i, nb in enumerate(unique_nb)}

# Left panel: train R², right panel: validation R²
fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=["R² Train", "R² Val"],
    shared_yaxes=False
)

for nb_val in unique_nb:
    df_nb = df_plot[df_plot['nb'] == nb_val].copy().sort_values(by='nf')
    df_nb['nd'] = -df_nb['nf']  # nd is the negated nf
    color = color_map[nb_val]

    # The curve with nb == -1 is drawn thick with star markers; all others use the regular style
    emphasise = nb_val == -1
    line_width = 4 if emphasise else 2
    marker_size = 12 if emphasise else 8
    marker_symbol = 'star' if emphasise else 'circle'

    # One trace per panel; only the train trace contributes a legend entry
    for panel, split in enumerate(('train', 'val'), start=1):
        legend_kwargs = {} if panel == 1 else {'showlegend': False}
        fig.add_trace(
            go.Scatter(
                x=df_nb['nd'],
                y=df_nb[f'r2_{split}_{target_var}'],
                mode='lines+markers',
                name=f'nb={nb_val}',
                legendgroup=f'nb={nb_val}',
                line=dict(color=color, width=line_width),
                marker=dict(symbol=marker_symbol, size=marker_size, color=color),
                **legend_kwargs
            ),
            row=1, col=panel
        )

# Titles, size and fixed [0, 1] range on both y-axes
fig.update_layout(
    title_text=("R² Performance of One XGBoost Model for Each DOF<br>η Only as an Input Feature, na = 2 (Pitch)"),
    xaxis_title="nd ",
    xaxis2_title="nd",
    yaxis_title="R² Score",
    template="plotly_white",
    height=460,
    width=1000,
    legend_title="nb",
    yaxis1=dict(range=[0, 1]),
    yaxis2=dict(range=[0, 1]),
)

# Reverse the x-axis of both panels
for panel in (1, 2):
    fig.update_xaxes(autorange="reversed", row=1, col=panel)
fig.show()



In [None]:
# R² vs. nd curves for the pitch models that use eta, eta velocity and eta acceleration (ver 2).

target_var = 'pitch'

# Restrict the combined ver 2 table to the runs with na = 2
df_plot = metrics_df_Xgboost_pitch_only_ver2_final.copy()
df_plot = df_plot[df_plot['na'] == 2]

# Each nb value becomes one legend entry with its own colour
unique_nb = sorted(df_plot['nb'].unique())
palette = px.colors.qualitative.Plotly
color_map = {nb: palette[i % len(palette)] for i, nb in enumerate(unique_nb)}
# nb = 10 and nb = 11 get explicit colours from the D3 palette instead of the wrapped ones
color_map[10] = px.colors.qualitative.D3[8]
color_map[11] = px.colors.qualitative.D3[3]

# Left panel: train R², right panel: validation R² (shared y-axis)
fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=["R² Train", "R² Val"],
    shared_yaxes=True
)

for nb_val in unique_nb:
    df_nb = df_plot[df_plot['nb'] == nb_val].copy().sort_values(by='nf')
    df_nb['nd'] = -df_nb['nf']  # nd is the negated nf
    color = color_map[nb_val]

    # The nb == 0 curve is emphasised with a thicker line and star markers
    emphasise = nb_val == 0
    line_width = 4 if emphasise else 2
    marker_size = 12 if emphasise else 8
    marker_symbol = 'star' if emphasise else 'circle'

    # One trace per panel; only the train trace contributes a legend entry
    for panel, split in enumerate(('train', 'val'), start=1):
        legend_kwargs = {} if panel == 1 else {'showlegend': False}
        fig.add_trace(
            go.Scatter(
                x=df_nb['nd'],
                y=df_nb[f'r2_{split}_{target_var}'],
                mode='lines+markers',
                name=f'nb={nb_val}',
                legendgroup=f'nb={nb_val}',
                line=dict(color=color, width=line_width),
                marker=dict(symbol=marker_symbol, size=marker_size, color=color),
                **legend_kwargs
            ),
            row=1, col=panel
        )

# Titles and figure size
fig.update_layout(
    title_text=("R² Performance of One XGBoost Model for Each DOF<br>η, η̇ , and η̈  as Input Features, na = 2 (Pitch)"),
    xaxis_title="nd ",
    xaxis2_title="nd",
    yaxis_title="R² Score",
    template="plotly_white",
    height=460,
    width=1000,
    legend_title="nb",
)

# Show tick labels on both y-axes and reverse both x-axes
for panel in (1, 2):
    fig.update_yaxes(showticklabels=True, row=1, col=panel)
for panel in (1, 2):
    fig.update_xaxes(autorange="reversed", row=1, col=panel)
fig.show()



### loading models to calculate full R^2 score

In [None]:
# Collect every saved .joblib model of the folder in a {file name: model} dictionary.
# NOTE: joblib.load unpickles the file, so only load model files from a trusted source.
folder = 'Xgboost/saved_models/pitch/check'

model_files = [entry for entry in os.listdir(folder) if entry.endswith('.joblib')]

models = {
    fname: joblib.load(os.path.join(folder, fname))
    for fname in model_files
}

In [None]:
# Empty table; the pitch train-metrics CSV is later written from this DataFrame
train_metrics_df_Xgboost_pitch_new = pd.DataFrame()

In [None]:
# Recompute the train-set R² / MSE of every loaded pitch model over the full training series
# and collect one row per model in train_metrics_df_Xgboost_pitch_new.
input_cols = ['eta', 'eta_velocity', 'eta_acceleration']
output_cols = ['pitch']

# Lag orders are encoded in the model file name as "na<k>_nb<k>_nf<k>"
pattern = r'na(\d+)_nb(\d+)_nf(\d+)'

for fname, model in models.items():
    print(f"Filename: {fname}")

    match = re.search(pattern, fname)
    if match is None:
        # Without the lag orders the features cannot be rebuilt; skip instead of crashing
        print(f"[WARN] Skipping {fname}: no na/nb/nf pattern found in the file name.")
        continue
    na, nb, nf = map(int, match.groups())
    print(f"{fname} ➤ na: {na}, nb: {nb}, nf: {nf}")

    # Build the lagged training features/targets with the same lag orders as the model
    dfx_train, dfy_train, yi_train = build_arx_lagged_with_scalers(
        df=df_case_train,
        input_cols=input_cols,
        output_cols=output_cols,
        scaler_X_func=scaler_X_func_all,
        scaler_y_func=scaler_y_func_pitch,
        na=na,
        nb_past=nb,
        nf_future=nf,
        test_name_col='test_name',
        y_initial_mode='zero'  # 'original' ➔ skip initial rows, 'zero' ➔ pad lags with zeros
    )

    y_train_target = dfy_train[output_cols].reset_index(drop=True)

    # The model was not trained on the test-name column
    X_train = dfx_train.drop(columns='test_name')
    X_feature_names = X_train.columns.tolist()

    # Recursive (one step at a time) prediction on the whole training set
    y_pred_train_scaled, x_used_train = predict_recursive_series(
        model=model,
        X_df=X_train,
        output_cols=output_cols,
        X_feature_names=X_feature_names,
        na=na
    )

    # Bring predictions and targets back to the original scale before scoring
    y_pred_train = scaler_y_pitch.inverse_transform(y_pred_train_scaled)
    y_pred_train_df = pd.DataFrame(y_pred_train, columns=output_cols)
    y_true_train = scaler_y_pitch.inverse_transform(dfy_train[output_cols].reset_index(drop=True))
    y_true_train_df = pd.DataFrame(y_true_train, columns=output_cols)

    # One metrics row per model; nd is stored as the negated nf
    metrics_row = {
        'na': na,
        'nb': nb,
        'nd': -1*nf,
    }

    for col in output_cols:
        mse_train = mean_squared_error(y_true_train_df[col], y_pred_train_df[col])
        r2_train = r2_score(y_true_train_df[col], y_pred_train_df[col])

        metrics_row[f'r2_train_{col}'] = r2_train
        metrics_row[f'mse_train_{col}'] = mse_train

    # Accumulate into the "_new" table: that is the DataFrame initialised before this loop
    # and written to CSV afterwards (the original appended to a differently named table).
    train_metrics_df_Xgboost_pitch_new = pd.concat(
        [train_metrics_df_Xgboost_pitch_new, pd.DataFrame([metrics_row])],
        ignore_index=True
    )


In [None]:
# Write the pitch train-metrics table to CSV (row index omitted)
train_metrics_df_Xgboost_pitch_new.to_csv(
    'Xgboost/metrics_outpus/pitch_only/train_metrics_df_Xgboost_pitch_new.csv',
    index=False,
)

### Testing The selected Models

In [None]:
# Collect every saved .joblib model of the folder in a {file name: model} dictionary.
# NOTE: joblib.load unpickles the file, so only load model files from a trusted source.
folder = 'Xgboost/saved_models/pitch/best_old'

model_files = [entry for entry in os.listdir(folder) if entry.endswith('.joblib')]

models = {
    fname: joblib.load(os.path.join(folder, fname))
    for fname in model_files
}


In [None]:
# Settings for testing the selected pitch models
training_case = 'Tp6p8s_Hs2m'  # test rows of this case get the comment 'case used for training'
input_cols = ['eta', 'eta_velocity', 'eta_acceleration']
output_cols = ['pitch']

In [None]:
# Output directory for the per-model test metrics and predictions (pitch).
# NOTE(review): this path starts with 'XGboost' while other cells use 'Xgboost'; on a
# case-sensitive filesystem these are two different directories. Confirm this is intended.
save_folder = 'XGboost/results/1dof_model/pitch/test_best_models'

# Create it up front; an already existing directory is fine
os.makedirs(save_folder, exist_ok=True)

In [None]:
# Score each selected pitch model on every test case, then write metrics and predictions to disk.
all_models_metrics = {}  # model file name -> metrics DataFrame (one row per test case)
y_true_pred_dict = {}    # model file name -> {test case -> {'y_true': ..., 'y_pred': ...}}

for fname, model in models.items():
    print(f"Filename: {fname}")

    # Lag orders are recovered from the "na<k>_nb<k>_nf<k>" part of the file name
    pattern = r'na(\d+)_nb(\d+)_nf(\d+)'
    match = re.search(pattern, fname)
    na, nb, nf = (int(group) for group in match.groups())
    print(f"{fname} ➤ na: {na}, nb: {nb}, nf: {nf}")

    metrics_df_test_all = []  # starts as a list of row dicts, becomes a DataFrame below
    model_y_true_pred = {}

    cases = df_test_full['test_name'].unique()

    for case in cases:
        case_mask = df_test_full['test_name'] == case
        df_case_test = df_test_full[case_mask].reset_index(drop=True)

        # Lagged features/targets for this test case, built with the model's lag orders
        dfx_test, dfy_test, yi_test = build_arx_lagged_with_scalers(
            df=df_case_test,
            input_cols=input_cols,
            output_cols=output_cols,
            scaler_X_func=scaler_X_func_all,
            scaler_y_func=scaler_y_func_pitch,
            na=na,
            nb_past=nb,
            nf_future=nf,
            test_name_col='test_name',
            y_initial_mode='original'
        )

        X_test_selected_df = dfx_test.drop(columns='test_name')
        X_feature_names = X_test_selected_df.columns.tolist()

        # Recursive one-step-at-a-time prediction
        y_pred_test_scaled, x_used_test = predict_recursive_series(
            model=model,
            X_df=X_test_selected_df,
            output_cols=output_cols,
            X_feature_names=X_feature_names,
            na=na
        )

        # Back to the original scale for both predictions and ground truth
        y_pred_test = scaler_y_pitch.inverse_transform(y_pred_test_scaled)
        y_pred_test_df = pd.DataFrame(y_pred_test, columns=output_cols)

        y_true_test = scaler_y_pitch.inverse_transform(dfy_test[output_cols].reset_index(drop=True))
        y_true_test_df = pd.DataFrame(y_true_test, columns=output_cols)

        model_y_true_pred[case] = {'y_true': y_true_test_df, 'y_pred': y_pred_test_df}

        # Identification columns first, then the optional comment, then the scores
        metrics_test = {'model': fname, 'na': na, 'nb': nb, 'nf': nf, 'test case': case}

        if case == training_case:
            metrics_test['Comments'] = 'case used for training'

        for col in output_cols:
            mse_test = mean_squared_error(y_true_test_df[col], y_pred_test_df[col])
            r2_test = r2_score(y_true_test_df[col], y_pred_test_df[col])
            metrics_test.update({f'r2_test_{col}': r2_test, f'mse_test_{col}': mse_test})

        metrics_df_test_all.append(metrics_test)

    # Keep the per-model results in memory
    metrics_df_test_all = pd.DataFrame(metrics_df_test_all)
    all_models_metrics[fname] = metrics_df_test_all
    y_true_pred_dict[fname] = model_y_true_pred

    print(f"\nMetrics for model {fname}:")
    display(metrics_df_test_all)

    # Metrics table of this model
    metric_save_path = os.path.join(save_folder, f'metrics_pitch_{fname}.csv')
    metrics_df_test_all.to_csv(metric_save_path, index=False)

    # Ground truth and predictions, one CSV each per test case
    for case, data in model_y_true_pred.items():
        for kind in ('y_true', 'y_pred'):
            out_path = os.path.join(save_folder, f'{fname}_case_pitch_{case}_{kind}.csv')
            data[kind].to_csv(out_path, index=False)

## 3- Pendulum 

### Finding Optimal Lags

In [None]:
# Directory that receives the saved model files
save_folder = 'Xgboost/saved_models'

# Create it up front; an already existing directory is fine
os.makedirs(save_folder, exist_ok=True)

In [None]:
# Empty result tables for the pendulum lag search (accuracy metrics and run-time/memory figures)
metrics_df_Xgboost_pendulum_only = pd.DataFrame()
perf_df_Xgboost_pendulum_only = pd.DataFrame()

In [None]:
# Search space and fixed settings for the pendulum lag-order search.
input_cols = ['eta', 'eta_velocity', 'eta_acceleration']
output_cols = ['pendulum']
case = 'Tp6p8s_Hs2m'
na_max = [2]
nb_max = list(range(11))  # 0..10
nf_max = list(range(11))  # 0..10

# Load the tuned hyperparameters here, so that this cell does not rely on `loaded_params`
# having been created by a cell that appears further down in the notebook.
with open("Xgboost/hyperparameters/eta_Vel_acc/best_params_eta_Vel_acc.json", "r") as f:
    loaded_params = json.load(f)

best_params_pendulum = loaded_params['pendulum']


In [None]:
# Read the tuned hyperparameters from the JSON file and show the 'pendulum' entry.
params_path = "Xgboost/hyperparameters/eta_Vel_acc/best_params_eta_Vel_acc.json"
with open(params_path, "r") as f:
    loaded_params = json.load(f)

print("Loaded params:", loaded_params['pendulum'])

In [None]:

# Lag-order search for the pendulum output: for every (na, nb, nf) combination fit one
# XGBoost model, score it with recursive prediction on train and validation data, record
# timing/memory figures, and save the fitted model.

# Number of leading training rows used for the recursive train-set score
# (presumably chosen to bound the run time of the recursive prediction).
n_train_eval = 10000

for na in na_max:
    for nb in nb_max:
        for nf in nf_max:

            # Lag orders are encoded in the model name (and therefore in the saved file name)
            model_name = f'Xgboost_pendulum_ver1__na{na}_nb{nb}_nf{nf}'

            # Use the pendulum-specific tuned hyperparameters prepared for this search
            # (the original passed `best_params`, which this section never defines).
            model = XGBRegressor(**best_params_pendulum)

            # ---------------- training data ----------------
            print(f'-----Preprocessing case of na={str(na)} ,nb={str(nb)} and nf={str(nf)} ----')

            # NOTE(review): the cells that later evaluate the saved pendulum models pass
            # `scaler_X_func_all` here; confirm `scaler_X_func` is the intended scaler.
            dfx_train, dfy_train, yi_train = build_arx_lagged_with_scalers(
                df=df_case_train,
                input_cols=input_cols,
                output_cols=output_cols,
                scaler_X_func=scaler_X_func,
                scaler_y_func=scaler_y_func_pend,
                na=na,
                nb_past=nb,
                nf_future=nf,
                test_name_col='test_name',
                y_initial_mode='zero'  # 'original' ➔ skip initial rows, 'zero' ➔ pad lags with zeros
            )

            y_train_target = dfy_train[output_cols].reset_index(drop=True)
            X_train = dfx_train.drop(columns='test_name')

            # ---------------- validation data ----------------
            df_case_val = df_val_full[df_val_full['test_name'] == case].reset_index(drop=True)

            dfx_val, dfy_val, yi_val = build_arx_lagged_with_scalers(
                df=df_case_val,
                input_cols=input_cols,
                output_cols=output_cols,
                scaler_X_func=scaler_X_func,
                scaler_y_func=scaler_y_func_pend,
                na=na,
                nb_past=nb,
                nf_future=nf,
                test_name_col='test_name',
                y_initial_mode='original'  # 'original' ➔ skip initial rows, 'zero' ➔ pad lags with zeros
            )

            y_val_target = dfy_val[output_cols].reset_index(drop=True)
            X_val = dfx_val.drop(columns='test_name')

            # ---------------- training (timed) ----------------
            print(f'-----Training model----')
            process = psutil.Process()

            start_time = time.perf_counter()
            model.fit(X_train, y_train_target)
            train_time = time.perf_counter() - start_time
            # Resident memory of the whole process right after fitting, in MB
            memory_usage_train = process.memory_info().rss / (1024 * 1024)

            X_feature_names = X_train.columns.tolist()

            # ---------------- recursive prediction (timed) ----------------
            print(f'-----predicting on training data----')

            start_time = time.perf_counter()

            y_pred_train_scaled, x_used_train = predict_recursive_series(
                model=model,
                X_df=X_train[0:n_train_eval],
                output_cols=output_cols,
                X_feature_names=X_feature_names,
                na=na
            )

            print(f'-----predicting on validation data----')

            y_pred_val_scaled, x_used_val = predict_recursive_series(
                model=model,
                X_df=X_val,
                output_cols=output_cols,
                X_feature_names=X_feature_names,
                na=na
            )

            # Back to the original scale (this is included in the measured prediction time)
            y_pred_train = scaler_y_pend.inverse_transform(y_pred_train_scaled)
            y_pred_val = scaler_y_pend.inverse_transform(y_pred_val_scaled)

            predict_time = time.perf_counter() - start_time
            memory_usage_predict = process.memory_info().rss / (1024 * 1024)  # MB

            y_pred_train_df = pd.DataFrame(y_pred_train, columns=output_cols)
            y_pred_val_df = pd.DataFrame(y_pred_val, columns=output_cols)

            # Ground truth on the original scale; the train part is cut to the same rows
            # that were predicted above
            y_true_train = scaler_y_pend.inverse_transform(
                dfy_train[output_cols][0:n_train_eval].reset_index(drop=True)
            )
            y_true_train_df = pd.DataFrame(y_true_train, columns=output_cols)

            y_true_val = scaler_y_pend.inverse_transform(dfy_val[output_cols].reset_index(drop=True))
            y_true_val_df = pd.DataFrame(y_true_val, columns=output_cols)

            # ---------------- result rows ----------------
            metrics_row = {
                'model_name': model_name,
                'na': na,
                'nb': nb,
                'nf': nf,
                'test case': case,
            }

            perf_row = {
                'model_name': model_name,
                'na': na,
                'nb': nb,
                'nf': nf,
                'train_time': train_time,
                'train_memory_MB': memory_usage_train,
                'predict_time': predict_time,
                'predict_memory_MB': memory_usage_predict
            }

            print(f'-----Eavluating model----')
            if np.any(np.isnan(y_pred_train_df)) or np.any(np.isinf(y_pred_train_df)):
                print("[WARN] Train predictions have NaNs or infs.")

            for col in output_cols:
                mse_train = mean_squared_error(y_true_train_df[col], y_pred_train_df[col])
                r2_train = r2_score(y_true_train_df[col], y_pred_train_df[col])

                mse_val = mean_squared_error(y_true_val_df[col], y_pred_val_df[col])
                r2_val = r2_score(y_true_val_df[col], y_pred_val_df[col])

                metrics_row[f'r2_train_{col}'] = r2_train
                metrics_row[f'mse_train_{col}'] = mse_train
                metrics_row[f'r2_val_{col}'] = r2_val
                metrics_row[f'mse_val_{col}'] = mse_val

            print("metrics_row:", metrics_row)
            print("perf_row:", perf_row)

            # Append after every model so partial results survive an interrupted run
            metrics_df_Xgboost_pendulum_only = pd.concat(
                [metrics_df_Xgboost_pendulum_only, pd.DataFrame([metrics_row])],
                ignore_index=True
            )
            perf_df_Xgboost_pendulum_only = pd.concat(
                [perf_df_Xgboost_pendulum_only, pd.DataFrame([perf_row])],
                ignore_index=True
            )

            # ---------------- persist the fitted model ----------------
            print(f'-----Saving model----')
            model_save_name = f"{model_name}.joblib"
            model_save_path = os.path.join(save_folder, model_save_name)
            joblib.dump(model, model_save_path)




In [None]:
# Persist the pendulum-only lag-search tables (metrics and performance) as CSV files.
save_folder = "Xgboost/metrics_outpus/pendulum_only"
os.makedirs(save_folder, exist_ok=True)  # no error when the folder is already there

# The label doubles as the CSV file stem and as the text of the log line
for label, table in (
    ("metrics_df_Xgboost_pendulum_only", metrics_df_Xgboost_pendulum_only),
    ("perf_df_Xgboost_pendulum_only", perf_df_Xgboost_pendulum_only),
):
    file_path = os.path.join(save_folder, f"{label}.csv")
    table.to_csv(file_path, index=False)
    print(f"Saved {label} to: {file_path}")



### Plotting Data From Remote Server

- ver 1 = η (eta) only as the input feature
- ver 2 = η + η velocity + η acceleration as input features

In [None]:
# Read the pendulum-only metric tables (ver 1, ver 2 and the ver 2 "extra" runs) from CSV.
_pendulum_metrics_dir = 'Xgboost/R_M/metrics_outpus/pendulum_only'

metrics_df_Xgboost_pendulum_only_ver1 = pd.read_csv(f'{_pendulum_metrics_dir}/metrics_df_Xgboost_pendulum_only_ver1.csv')
metrics_df_Xgboost_pendulum_only_ver2 = pd.read_csv(f'{_pendulum_metrics_dir}/metrics_df_Xgboost_pendulum_only_ver2.csv')
metrics_df_Xgboost_pendulum_only_ver2_extra = pd.read_csv(f'{_pendulum_metrics_dir}/metrics_df_Xgboost_pendulum_only_ver2_extra.csv')
metrics_df_Xgboost_pendulum_only_ver2_extra_2 = pd.read_csv(f'{_pendulum_metrics_dir}/metrics_df_Xgboost_pendulum_only_ver2_extra_2.csv')

# Stack all ver 2 tables into a single frame with a fresh 0..n-1 index
metrics_df_Xgboost_pendulum_only_ver2_final = pd.concat(
    [
        metrics_df_Xgboost_pendulum_only_ver2,
        metrics_df_Xgboost_pendulum_only_ver2_extra,
        metrics_df_Xgboost_pendulum_only_ver2_extra_2,
    ],
    ignore_index=True,
)

In [None]:
# R² vs. nd curves for the pendulum models that use eta as the only input feature (ver 1).

target_var = 'pendulum'

# Restrict the table to the runs with na = 2
df_plot = metrics_df_Xgboost_pendulum_only_ver1.copy()
df_plot = df_plot[df_plot['na'] == 2]

# Each nb value becomes one legend entry with its own colour (the palette wraps around)
unique_nb = sorted(df_plot['nb'].unique())
palette = px.colors.qualitative.Plotly
color_map = {nb: palette[i % len(palette)] for i, nb in enumerate(unique_nb)}

# Left panel: train R², right panel: validation R²
fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=["R² Train", "R² Val"],
    shared_yaxes=False
)

for nb_val in unique_nb:
    df_nb = df_plot[df_plot['nb'] == nb_val].copy().sort_values(by='nf')
    df_nb['nd'] = -df_nb['nf']  # nd is the negated nf
    color = color_map[nb_val]

    # The curve with nb == -1 is drawn thick with star markers; all others use the regular style
    emphasise = nb_val == -1
    line_width = 4 if emphasise else 2
    marker_size = 12 if emphasise else 8
    marker_symbol = 'star' if emphasise else 'circle'

    # One trace per panel; only the train trace contributes a legend entry
    for panel, split in enumerate(('train', 'val'), start=1):
        legend_kwargs = {} if panel == 1 else {'showlegend': False}
        fig.add_trace(
            go.Scatter(
                x=df_nb['nd'],
                y=df_nb[f'r2_{split}_{target_var}'],
                mode='lines+markers',
                name=f'nb={nb_val}',
                legendgroup=f'nb={nb_val}',
                line=dict(color=color, width=line_width),
                marker=dict(symbol=marker_symbol, size=marker_size, color=color),
                **legend_kwargs
            ),
            row=1, col=panel
        )

# Titles, size and fixed [0, 0.6] range on both y-axes
fig.update_layout(
    title_text=("R² Performance of One XGBoost Model for Each DOF<br>η Only as an Input Feature, na = 2 (Pendulum)"),
    xaxis_title="nd ",
    xaxis2_title="nd",
    yaxis_title="R² Score",
    template="plotly_white",
    height=460,
    width=1000,
    legend_title="nb",
    yaxis1=dict(range=[0, 0.6]),
    yaxis2=dict(range=[0, 0.6]),
)

# Reverse the x-axis of both panels
for panel in (1, 2):
    fig.update_xaxes(autorange="reversed", row=1, col=panel)
fig.show()



In [None]:
# R² vs. nd curves for the pendulum models that use eta, eta velocity and eta acceleration (ver 2).

target_var = 'pendulum'

# Restrict the combined ver 2 table to the runs with na = 2
df_plot = metrics_df_Xgboost_pendulum_only_ver2_final.copy()
df_plot = df_plot[df_plot['na'] == 2]

# Each nb value becomes one legend entry with its own colour
unique_nb = sorted(df_plot['nb'].unique())
palette = px.colors.qualitative.Plotly
color_map = {nb: palette[i % len(palette)] for i, nb in enumerate(unique_nb)}
# nb = 10 and nb = 11 get explicit colours from the D3 palette instead of the wrapped ones
color_map[10] = px.colors.qualitative.D3[8]
color_map[11] = px.colors.qualitative.D3[3]

# Left panel: train R², right panel: validation R² (shared y-axis)
fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=["R² Train", "R² Val"],
    shared_yaxes=True
)

for nb_val in unique_nb:
    df_nb = df_plot[df_plot['nb'] == nb_val].copy().sort_values(by='nf')
    df_nb['nd'] = -df_nb['nf']  # nd is the negated nf
    color = color_map[nb_val]

    # The curve with nb == 22 is drawn thick with star markers; all others use the regular style
    emphasise = nb_val == 22
    line_width = 4 if emphasise else 2
    marker_size = 12 if emphasise else 8
    marker_symbol = 'star' if emphasise else 'circle'

    # One trace per panel; only the train trace contributes a legend entry
    for panel, split in enumerate(('train', 'val'), start=1):
        legend_kwargs = {} if panel == 1 else {'showlegend': False}
        fig.add_trace(
            go.Scatter(
                x=df_nb['nd'],
                y=df_nb[f'r2_{split}_{target_var}'],
                mode='lines+markers',
                name=f'nb={nb_val}',
                legendgroup=f'nb={nb_val}',
                line=dict(color=color, width=line_width),
                marker=dict(symbol=marker_symbol, size=marker_size, color=color),
                **legend_kwargs
            ),
            row=1, col=panel
        )

# Titles and figure size
fig.update_layout(
    title_text="R² Performance of One XGBoost Model for Each DOF<br>η, η̇ , and η̈  as Input Features, na = 2 (Pendulum)",
    xaxis_title="nd ",
    xaxis2_title="nd",
    yaxis_title="R² Score",
    template="plotly_white",
    height=550,
    width=1000,
    legend_title="nb",
)

# Show tick labels on both y-axes and reverse both x-axes
for panel in (1, 2):
    fig.update_yaxes(showticklabels=True, row=1, col=panel)
for panel in (1, 2):
    fig.update_xaxes(autorange="reversed", row=1, col=panel)
fig.show()



### loading models to calculate full R^2 score

In [None]:
# Collect every saved .joblib model of the folder in a {file name: model} dictionary.
# NOTE: joblib.load unpickles the file, so only load model files from a trusted source.
folder = 'Xgboost/saved_models/pendulum/best_combination'

model_files = [entry for entry in os.listdir(folder) if entry.endswith('.joblib')]

models = {
    fname: joblib.load(os.path.join(folder, fname))
    for fname in model_files
}

In [None]:
# Empty table that collects one train-metrics row per evaluated pendulum model
train_metrics_df_Xgboost_pendulum = pd.DataFrame()

In [None]:
# Recompute the train-set R² / MSE of every loaded pendulum model over the full training series.
input_cols = ['eta', 'eta_velocity', 'eta_acceleration']
output_cols = ['pendulum']

for fname, model in models.items():
    print(f"Filename: {fname}")

    # Lag orders are recovered from the "na<k>_nb<k>_nf<k>" part of the file name
    pattern = r'na(\d+)_nb(\d+)_nf(\d+)'
    match = re.search(pattern, fname)
    na, nb, nf = (int(group) for group in match.groups())
    print(f"{fname} ➤ na: {na}, nb: {nb}, nf: {nf}")

    # Lagged training features/targets built with the model's lag orders
    dfx_train, dfy_train, yi_train = build_arx_lagged_with_scalers(
        df=df_case_train,
        input_cols=input_cols,
        output_cols=output_cols,
        scaler_X_func=scaler_X_func_all,
        scaler_y_func=scaler_y_func_pend,
        na=na,
        nb_past=nb,
        nf_future=nf,
        test_name_col='test_name',
        y_initial_mode='zero'  # 'original' ➔ skip initial rows, 'zero' ➔ pad lags with zeros
    )

    y_train_target = dfy_train[output_cols].reset_index(drop=True)

    X_train = dfx_train.drop(columns='test_name')
    X_feature_names = X_train.columns.tolist()

    # Recursive one-step-at-a-time prediction on the whole training set
    y_pred_train_scaled, x_used_train = predict_recursive_series(
        model=model,
        X_df=X_train,
        output_cols=output_cols,
        X_feature_names=X_feature_names,
        na=na
    )

    # Back to the original scale for both predictions and ground truth
    y_pred_train = scaler_y_pend.inverse_transform(y_pred_train_scaled)
    y_pred_train_df = pd.DataFrame(y_pred_train, columns=output_cols)
    y_true_train = scaler_y_pend.inverse_transform(dfy_train[output_cols].reset_index(drop=True))
    y_true_train_df = pd.DataFrame(y_true_train, columns=output_cols)

    # One row per model; nd is stored as the negated nf
    metrics_row = {'na': na, 'nb': nb, 'nd': -1*nf}

    for col in output_cols:
        mse_train = mean_squared_error(y_true_train_df[col], y_pred_train_df[col])
        r2_train = r2_score(y_true_train_df[col], y_pred_train_df[col])
        metrics_row.update({f'r2_train_{col}': r2_train, f'mse_train_{col}': mse_train})

    train_metrics_df_Xgboost_pendulum = pd.concat(
        [train_metrics_df_Xgboost_pendulum, pd.DataFrame([metrics_row])],
        ignore_index=True
    )


In [None]:
# Write the pendulum train-metrics table to CSV (row index omitted)
train_metrics_df_Xgboost_pendulum.to_csv(
    'Xgboost/metrics_outpus/pendulum_only/train_metrics_df_Xgboost_pendulum.csv',
    index=False,
)


### Testing The selected Models

In [None]:
# Collect every saved .joblib model of the folder in a {file name: model} dictionary.
# NOTE: joblib.load unpickles the file, so only load model files from a trusted source.
folder = 'Xgboost/R_M/saved_models/pendulum/best_combination'

model_files = [entry for entry in os.listdir(folder) if entry.endswith('.joblib')]

models = {
    fname: joblib.load(os.path.join(folder, fname))
    for fname in model_files
}


In [None]:
# Settings for testing the selected pendulum models
training_case = 'Tp6p8s_Hs2m'  # test rows of this case get the comment 'case used for training'
input_cols = ['eta', 'eta_velocity', 'eta_acceleration']
output_cols = ['pendulum']

In [None]:
# Output directory for the per-model test metrics and predictions (pendulum).
# NOTE(review): this path starts with 'XGboost' while other cells use 'Xgboost'; on a
# case-sensitive filesystem these are two different directories. Confirm this is intended.
save_folder = 'XGboost/results/1dof_model/pendulum/test_best_models'

# Create it up front; an already existing directory is fine
os.makedirs(save_folder, exist_ok=True)

In [None]:
# Score each selected pendulum model on every test case, then write metrics and predictions to disk.
all_models_metrics = {}  # model file name -> metrics DataFrame (one row per test case)
y_true_pred_dict = {}    # model file name -> {test case -> {'y_true': ..., 'y_pred': ...}}

for fname, model in models.items():
    print(f"Filename: {fname}")

    # Lag orders are recovered from the "na<k>_nb<k>_nf<k>" part of the file name
    pattern = r'na(\d+)_nb(\d+)_nf(\d+)'
    match = re.search(pattern, fname)
    na, nb, nf = (int(group) for group in match.groups())
    print(f"{fname} ➤ na: {na}, nb: {nb}, nf: {nf}")

    metrics_df_test_all = []  # starts as a list of row dicts, becomes a DataFrame below
    model_y_true_pred = {}

    cases = df_test_full['test_name'].unique()

    for case in cases:
        case_mask = df_test_full['test_name'] == case
        df_case_test = df_test_full[case_mask].reset_index(drop=True)

        # Lagged features/targets for this test case, built with the model's lag orders
        dfx_test, dfy_test, yi_test = build_arx_lagged_with_scalers(
            df=df_case_test,
            input_cols=input_cols,
            output_cols=output_cols,
            scaler_X_func=scaler_X_func_all,
            scaler_y_func=scaler_y_func_pend,
            na=na,
            nb_past=nb,
            nf_future=nf,
            test_name_col='test_name',
            y_initial_mode='original'
        )

        X_test_selected_df = dfx_test.drop(columns='test_name')
        X_feature_names = X_test_selected_df.columns.tolist()

        # Recursive one-step-at-a-time prediction
        y_pred_test_scaled, x_used_test = predict_recursive_series(
            model=model,
            X_df=X_test_selected_df,
            output_cols=output_cols,
            X_feature_names=X_feature_names,
            na=na
        )

        # Back to the original scale for both predictions and ground truth
        y_pred_test = scaler_y_pend.inverse_transform(y_pred_test_scaled)
        y_pred_test_df = pd.DataFrame(y_pred_test, columns=output_cols)

        y_true_test = scaler_y_pend.inverse_transform(dfy_test[output_cols].reset_index(drop=True))
        y_true_test_df = pd.DataFrame(y_true_test, columns=output_cols)

        model_y_true_pred[case] = {'y_true': y_true_test_df, 'y_pred': y_pred_test_df}

        # Identification columns first, then the optional comment, then the scores
        metrics_test = {'model': fname, 'na': na, 'nb': nb, 'nf': nf, 'test case': case}

        if case == training_case:
            metrics_test['Comments'] = 'case used for training'

        for col in output_cols:
            mse_test = mean_squared_error(y_true_test_df[col], y_pred_test_df[col])
            r2_test = r2_score(y_true_test_df[col], y_pred_test_df[col])
            metrics_test.update({f'r2_test_{col}': r2_test, f'mse_test_{col}': mse_test})

        metrics_df_test_all.append(metrics_test)

    # Keep the per-model results in memory
    metrics_df_test_all = pd.DataFrame(metrics_df_test_all)
    all_models_metrics[fname] = metrics_df_test_all
    y_true_pred_dict[fname] = model_y_true_pred

    print(f"\nMetrics for model {fname}:")
    display(metrics_df_test_all)

    # Metrics table of this model
    metric_save_path = os.path.join(save_folder, f'metrics_pendulum_{fname}.csv')
    metrics_df_test_all.to_csv(metric_save_path, index=False)

    # Ground truth and predictions, one CSV each per test case
    for case, data in model_y_true_pred.items():
        for kind in ('y_true', 'y_pred'):
            out_path = os.path.join(save_folder, f'{fname}_case_pendulum_{case}_{kind}.csv')
            data[kind].to_csv(out_path, index=False)

# Computing time and emissions

In [None]:
# codecarbon supplies the EmissionsTracker used by the tracking cells of this section
import codecarbon
# Print the installed version so it is recorded with the notebook output
print(codecarbon.__version__)

from codecarbon import EmissionsTracker




We adjust the data sizes to be divisible by 64 (as in the LSTM case) for a fair comparison.

In [None]:
batch_size = 64


def adjust_for_batch_df(df, y, size=None):
    """
    Trim features and targets to a whole number of batches.

    Parameters
    ----------
    df : pd.DataFrame
        Feature table; its first ``n`` rows are kept (positional selection).
    y : sliceable
        Targets aligned row-by-row with ``df``; sliced as ``y[:n]``.
    size : int, optional
        Batch size to align to. When omitted, the module-level
        ``batch_size`` is read at call time, so existing two-argument
        calls behave exactly as before.

    Returns
    -------
    df_adjusted, y_adjusted
        The inputs truncated to ``n = (len(df) // size) * size`` rows.
        Both are empty when ``df`` has fewer rows than one batch.
    """
    if size is None:
        size = batch_size

    # Largest row count that is an exact multiple of the batch size
    n = (df.shape[0] // size) * size

    return df.iloc[:n], y[:n]


In [None]:
# Read the stored best hyperparameters (one parameter dict per top-level entry)
with open("Xgboost/hyperparameters/eta_Vel_acc/best_params_eta_Vel_acc.json", "r") as f:
    loaded_params = json.load(f)

print("Loaded params:", loaded_params)

# Gather, for every hyperparameter name, the values found across all entries
accumulator = defaultdict(list)
for params in loaded_params.values():
    for key, val in params.items():
        accumulator[key].append(val)

# Hyperparameters that must be integers after averaging
int_keys = {"max_depth", "n_estimators"}


def _averaged(name, values):
    """Return the mean of *values*, rounded to int for names in int_keys."""
    mean_value = np.mean(values)
    if name in int_keys:
        return int(np.round(mean_value))
    return float(mean_value)


# One averaged value per hyperparameter, used to configure the regressor
xgb_params = {name: _averaged(name, values) for name, values in accumulator.items()}

print("Averaged parameters:", xgb_params)


In [None]:
# Test names used for fitting (case) and for evaluation (case_test)
case = 'Tp6p8s_Hs2m'
case_test = 'Tp6p8s_Hs1m'

# Both subsets are cut from df_train_full by matching the test_name column
df_case_train = df_train_full.loc[df_train_full['test_name'] == case].reset_index(drop=True)
df_case_test = df_train_full.loc[df_train_full['test_name'] == case_test].reset_index(drop=True)

In [None]:
# Inputs, targets and lag orders used for the timing benchmark
input_cols = ['eta', 'eta_velocity', 'eta_acceleration']
output_cols = ['heave', 'pitch', 'pendulum']
na, nb, nf = 2, 0, 10

# Build the lagged feature/target frames for the training case
dfx_train, dfy_train, yi_train = build_arx_lagged_with_scalers(
    df=df_case_train,
    input_cols=input_cols,
    output_cols=output_cols,
    scaler_X_func=scaler_X_func_all,
    scaler_y_func=scaler_y_func,
    na=na,
    nb_past=nb,
    nf_future=nf,
    test_name_col='test_name',
    y_initial_mode='zero',  # 'original' ➔ skip initial rows, 'zero' ➔ pad lags with zeros
)

# Features are every prepared column except the case label;
# their order is kept for the recursive predictor
X_train = dfx_train.drop(columns=['test_name'])
X_feature_names = list(X_train.columns)

y_train_target = dfy_train[output_cols].reset_index(drop=True)

# Trim features and targets to a whole number of batches
X_train, y_train_target = adjust_for_batch_df(X_train, y_train_target)

In [None]:
# Measure wall-clock time and resident-memory growth of one training run
process = psutil.Process()
# perf_counter is monotonic and high resolution, so the interval cannot be
# distorted by system clock adjustments the way time.time() can
start_time = time.perf_counter()  # Start time for measurement
initial_memory = process.memory_info().rss / (1024 ** 2)  # Memory in MB

# Create the base model
base_model = XGBRegressor(**xgb_params)

# Wrap it for multi-output regression
model = MultiOutputRegressor(base_model)

model.fit(X_train, y_train_target)

# Measure memory usage and time after training
end_time = time.perf_counter()  # End time for measurement
final_memory = process.memory_info().rss / (1024 ** 2)  # Memory in MB
# Print results
print(f"Time taken : {end_time - start_time} seconds")
print(f"Memory used: {final_memory - initial_memory} MB")

In [None]:
# The tracker writes its CSV into output_dir; make sure the folder exists
os.makedirs("Results/co2", exist_ok=True)

tracker = EmissionsTracker(
    output_dir="Results/co2",       # Custom folder
    output_file="xgb_train.csv"    # Custom filename
)

tracker.start()
try:
    # Create the base model
    base_model = XGBRegressor(**xgb_params)

    # Wrap it for multi-output regression
    model = MultiOutputRegressor(base_model)

    model.fit(X_train, y_train_target)
finally:
    # Stop the tracker even if training raises, so it is never left running,
    # and retrieve the estimated emissions
    emissions = tracker.stop()

In [None]:
# Build the lagged feature/target frames for the evaluation case,
# using the same columns and lag orders as for training
dfx_test, dfy_test, yi_test = build_arx_lagged_with_scalers(
    df=df_case_test,
    input_cols=input_cols,
    output_cols=output_cols,
    scaler_X_func=scaler_X_func_all,
    scaler_y_func=scaler_y_func,
    na=na,
    nb_past=nb,
    nf_future=nf,
    test_name_col='test_name',
    y_initial_mode='zero',  # 'original' ➔ skip initial rows, 'zero' ➔ pad lags with zeros
)

# Features are every prepared column except the case label
x_test = dfx_test.drop(columns=['test_name'])
y_test_target = dfy_test[output_cols].reset_index(drop=True)

# Trim features and targets to a whole number of batches
x_test, y_test_target = adjust_for_batch_df(x_test, y_test_target)

In [None]:
# Measure wall-clock time and resident-memory growth of one parallel prediction
process = psutil.Process()
# perf_counter is monotonic and high resolution, so the interval cannot be
# distorted by system clock adjustments the way time.time() can
start_time = time.perf_counter()  # Start time for measurement
initial_memory = process.memory_info().rss / (1024 ** 2)  # Memory in MB

# Parallel prediction: the whole test frame is passed to the model in one call
y_pred_parralel_test = model.predict(x_test)

# Measure memory usage and time after prediction
end_time = time.perf_counter()  # End time for measurement
final_memory = process.memory_info().rss / (1024 ** 2)  # Memory in MB
# Print results
print(f"Time taken : {end_time - start_time} seconds")
print(f"Memory used: {final_memory - initial_memory} MB")

In [None]:
# The tracker writes its CSV into output_dir; make sure the folder exists
os.makedirs("Results/co2", exist_ok=True)

tracker = EmissionsTracker(
    output_dir="Results/co2",       # Custom folder
    output_file="xgb_parralel_pred.csv"    # Custom filename
)
tracker.start()
try:
    # Parallel prediction: the whole test frame is passed to the model in one call
    y_pred_parralel_test = model.predict(x_test)
finally:
    # Stop the tracker even if prediction raises, so it is never left running,
    # and retrieve the estimated emissions
    emissions = tracker.stop()


In [None]:
# Measure wall-clock time and resident-memory growth of one recursive (series) prediction
process = psutil.Process()
# perf_counter is monotonic and high resolution, so the interval cannot be
# distorted by system clock adjustments the way time.time() can
start_time = time.perf_counter()  # Start time for measurement
initial_memory = process.memory_info().rss / (1024 ** 2)  # Memory in MB

# Series prediction: one step at a time, feeding predictions back as lags
y_pred_test_scaled, x_used_test = predict_recursive_series(
    model=model,
    X_df=x_test,
    output_cols=output_cols,
    X_feature_names=X_feature_names,
    na=na,
)

# Measure memory usage and time after prediction
end_time = time.perf_counter()  # End time for measurement
final_memory = process.memory_info().rss / (1024 ** 2)  # Memory in MB
# Print results
print(f"Time taken : {end_time - start_time} seconds")
print(f"Memory used: {final_memory - initial_memory} MB")

In [None]:
# The tracker writes its CSV into output_dir; make sure the folder exists
os.makedirs("Results/co2", exist_ok=True)

tracker = EmissionsTracker(
    output_dir="Results/co2",       # Custom folder
    output_file="xgb_series.csv"    # Custom filename
)
tracker.start()
try:
    # Series prediction: one step at a time, feeding predictions back as lags
    y_pred_test_scaled, x_used_test = predict_recursive_series(
        model=model,
        X_df=x_test,
        output_cols=output_cols,
        X_feature_names=X_feature_names,
        na=na,
    )
finally:
    # Stop the tracker even if prediction raises, so it is never left running,
    # and retrieve the estimated emissions
    emissions = tracker.stop()

# Noisy data test

In [None]:
# Load the saved model used for the noisy-data test (this line loads a model, not data)
# NOTE(review): joblib.load unpickles the file, so only load artifacts from a trusted source
model = joblib.load('Xgboost/Xgboost_3dof_ver2_extra_na2_nb0_nf10.joblib')


In [None]:
# Settings of the best model, used below to prepare the noisy test data and predict on it
import os

# Lag orders of the best model: best_na is the number of output lags;
# best_nb / best_nf are passed as nb_past / nf_future when the data is prepared
best_na = 2
best_nb = 0
best_nf = 10


# Input and output columns used with the best model
best_input_cols = ['eta', 'eta_velocity', 'eta_acceleration']
best_output_cols = ['heave','pitch','pendulum']
# NOTE(review): nothing is loaded in this cell; `model` comes from the joblib.load cell above


In [None]:
# Load the noisy test case from CSV (noisy per the file name)
df_case_test_noisy=pd.read_csv('Results/df_case_test_noisy.csv')
# Bare expression: in a notebook this displays the object as the cell output; it has no other effect
scaler_X_func_all

In [None]:
# Prepare the test data
dfx_test, dfy_test, yi_test = build_arx_lagged_with_scalers(
    df=df_case_test_noisy.reset_index(drop=True),
    input_cols=best_input_cols,
    output_cols=best_output_cols,
    scaler_X_func   = scaler_X_func_all,
    scaler_y_func   = scaler_y_func,
    na=best_na,
    nb_past=best_nb,
    nf_future=best_nf,
    test_name_col='test_name',
    y_initial_mode='original'  # 'original' ➔ skip initial rows, 'zero' ➔ pad lags with zeros
)


In [None]:

# Model inputs: every prepared column except the case label
X_test_selected_df = dfx_test.drop(columns=['test_name'])

# Feature order, taken from the prepared test frame and handed to the recursive predictor
X_feature_names = list(X_test_selected_df.columns)

# Recursive one-step-at-a-time prediction on the noisy test data
y_pred_test_scaled, x_used_test = predict_recursive_series(
    model=model,
    X_df=X_test_selected_df,
    output_cols=best_output_cols,
    X_feature_names=X_feature_names,
    na=best_na,
)

# Bring predictions back to the original scale and wrap them in a DataFrame
y_pred_test = scaler_y.inverse_transform(y_pred_test_scaled)
y_pred_test_df = pd.DataFrame(y_pred_test, columns=best_output_cols)

# Same treatment for the true values, re-indexed from zero to line up with the predictions
y_true_test = scaler_y.inverse_transform(dfy_test[best_output_cols].reset_index(drop=True))
y_true_test_df = pd.DataFrame(y_true_test, columns=best_output_cols)

# Per-output R2 and MSE, stored under r2_test_<col> / mse_test_<col>
metrics_test = {}
for col in best_output_cols:
    mse_test = mean_squared_error(y_true_test_df[col], y_pred_test_df[col])
    r2_test = r2_score(y_true_test_df[col], y_pred_test_df[col])
    metrics_test.update({
        f'r2_test_{col}': r2_test,
        f'mse_test_{col}': mse_test,
    })

print("metrics:", metrics_test)


In [None]:
# save results
# Nothing in this cell creates the target folder, and to_csv does not create
# missing directories, so make sure it exists first
noisy_results_dir = 'Results/noisy test'
os.makedirs(noisy_results_dir, exist_ok=True)

# True values and predictions go to separate CSV files without the index column
y_true_test_df.to_csv(os.path.join(noisy_results_dir, 'XGB_true.csv'), index=False)
y_pred_test_df.to_csv(os.path.join(noisy_results_dir, 'XGB_pred.csv'), index=False)