# 1. Loading

In [26]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import os
from loguru import logger

# Seed handed to the stochastic estimators (random forest, gradient boosting)
# in the benchmark so that repeated runs produce the same models.
random_seed = 42

In [22]:
# Window-length label; it is interpolated into the name of the aggregated CSV
# that gets loaded (aggregated_<WINDOW_LENGTH>.csv).
WINDOW_LENGTH = "15min"

In [23]:
# Resolve the aggregated dataset relative to the current working directory:
#   <cwd>/../../data/aggregated/aggregated_<WINDOW_LENGTH>.csv
notebook_dir = os.getcwd()
notebook_parent_dir = os.path.dirname(notebook_dir)
notebook_parent_parent_dir = os.path.dirname(notebook_parent_dir)

aggregated_file_name = f'aggregated_{WINDOW_LENGTH}.csv'
aggregated_dir = os.path.join(notebook_parent_parent_dir, 'data', 'aggregated')
aggregated_data_path = os.path.join(aggregated_dir, aggregated_file_name)

In [24]:
# Read the aggregated CSV and parse `window_start` (ISO 8601 strings) into
# timezone-aware UTC timestamps in a single chained expression.
df = pd.read_csv(aggregated_data_path).assign(
    window_start=lambda frame: pd.to_datetime(
        frame["window_start"], format="ISO8601", utc=True
    )
)

In [25]:
# Print column names, non-null counts and dtypes to sanity-check the load.
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 18223 entries, 0 to 18222
Data columns (total 53 columns):
 #   Column                                                                 Non-Null Count  Dtype              
---  ------                                                                 --------------  -----              
 0   seg_id                                                                 18223 non-null  int64              
 1   window_start                                                           18223 non-null  datetime64[us, UTC]
 2   Vessel Hull Over Ground Speed (knots)                                  18223 non-null  float64            
 3   Vessel Hull Through Water Longitudinal Speed (knots)                   18223 non-null  float64            
 4   Vessel External Conditions Wind Relative Speed (knots)                 18223 non-null  float64            
 5   Vessel External Conditions Wind Relative Angle (degrees)               18223 non-null  float64            
 6   V

# 2. Modelling-specific preprocessing

# 3. Shaft power prediction benchmark

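This section selects every numeric column other than the target and the window timestamps as a predictor, then benchmarks three regression models (Ridge, random forest and gradient boosting) for predicting shaft power. Models are compared with time-series cross-validation on the first 80% of the chronologically ordered data and with a single hold-out evaluation on the final 20%.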

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import TimeSeriesSplit, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
TARGET_COLUMN = "Shaft power [kW]"

# The target itself and the window timestamps are never offered to the models.
excluded_columns = {TARGET_COLUMN, "window_start", "window_end"}

# Every remaining numeric column becomes a predictor, in DataFrame column order.
# NOTE(review): this also admits numeric identifier columns such as `seg_id` —
# confirm that using it as a feature is intended.
numeric_columns = [
    name for name in df.columns if pd.api.types.is_numeric_dtype(df[name])
]
predictive_variables = [
    name for name in numeric_columns if name not in excluded_columns
]

print(f"Target variable: {TARGET_COLUMN}")
print(f"Number of predictive variables: {len(predictive_variables)}")
print("Predictive variables:")
for variable in predictive_variables:
    print(f"- {variable}")

In [None]:
# Build the modelling table: timestamp, target and predictors only, ordered
# chronologically, with rows lacking a target value removed. Missing predictor
# values are kept here and imputed later inside the model pipeline.
modelling_columns = ["window_start", TARGET_COLUMN, *predictive_variables]
chronological_df = df[modelling_columns].sort_values("window_start")
model_df = chronological_df.dropna(subset=[TARGET_COLUMN]).copy()

# Feature matrix and target vector share model_df's (chronological) row order.
X = model_df[predictive_variables]
y = model_df[TARGET_COLUMN]

print(f"Rows available for modelling: {len(model_df):,}")
print(f"Feature matrix shape: {X.shape}")

In [None]:
# Chronological hold-out: the earliest 80% of rows train the models and the
# most recent 20% are reserved for the final test (rows are already time-sorted).
train_fraction = 0.8
split_index = int(len(model_df) * train_fraction)

X_train = X.iloc[:split_index]
X_test = X.iloc[split_index:]
y_train = y.iloc[:split_index]
y_test = y.iloc[split_index:]

# Numeric features: fill gaps with the column median, then standardise.
numeric_preprocessor = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# A single transformer branch that applies the numeric steps to every predictor.
preprocessor = ColumnTransformer(
    [("numeric", numeric_preprocessor, predictive_variables)]
)

# Candidate regressors; the stochastic ones share `random_seed`.
ridge_model = Ridge(alpha=1.0)
random_forest_model = RandomForestRegressor(
    n_estimators=300,
    random_state=random_seed,
    n_jobs=-1,
)
gradient_boosting_model = GradientBoostingRegressor(random_state=random_seed)

models = {
    "Ridge": ridge_model,
    "RandomForest": random_forest_model,
    "GradientBoosting": gradient_boosting_model,
}

print(f"Train rows: {len(X_train):,}")
print(f"Test rows:  {len(X_test):,}")

In [None]:
# Time-ordered cross-validation on the training period: every fold trains on
# earlier rows and validates on the rows that follow them.
tscv = TimeSeriesSplit(n_splits=5)

# scikit-learn scorers follow "greater is better", so the error metrics are
# exposed as negated values and flipped back to positive when recorded below.
scoring = {
    "rmse": "neg_root_mean_squared_error",
    "mae": "neg_mean_absolute_error",
    "r2": "r2",
}

results = []

for model_name, estimator in models.items():
    # Preprocessing lives inside the pipeline so it is re-fitted on each CV
    # training fold rather than on the full training set.
    pipeline = Pipeline(
        steps=[
            ("preprocessor", preprocessor),
            ("model", estimator),
        ]
    )

    cv_scores = cross_validate(
        pipeline,
        X_train,
        y_train,
        cv=tscv,
        scoring=scoring,
        n_jobs=-1,
        return_train_score=False,
    )

    # Refit on the whole training period and score once on the held-out tail.
    pipeline.fit(X_train, y_train)
    y_pred_test = pipeline.predict(X_test)

    # RMSE is computed as sqrt(MSE): the `squared=False` shortcut of
    # mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
    # so this form works on both old and current releases.
    test_rmse = float(np.sqrt(mean_squared_error(y_test, y_pred_test)))

    results.append(
        {
            "model": model_name,
            "cv_rmse_mean": -cv_scores["test_rmse"].mean(),
            "cv_mae_mean": -cv_scores["test_mae"].mean(),
            "cv_r2_mean": cv_scores["test_r2"].mean(),
            "test_rmse": test_rmse,
            "test_mae": mean_absolute_error(y_test, y_pred_test),
            "test_r2": r2_score(y_test, y_pred_test),
        }
    )

# Rank models by hold-out RMSE (best first) with a fresh 0..n-1 index.
results_df = pd.DataFrame(results).sort_values(
    "test_rmse", ascending=True, ignore_index=True
)
results_df