In [1]:
%load_ext autoreload
%autoreload 2

import dataclasses
import datetime
import itertools
import logging
import pathlib

import numpy as np
import pandas as pd
import sklearn.ensemble as ensemble
import sklearn.metrics
import sklearn.model_selection
import xarray as xr

import a6
import a6.datasets.coordinates as _coordinates
import a6.datasets.variables as _variables
import a6.utils as utils

# Index into `turbine_files` of the single turbine handled by this notebook
# run; the same value is passed to the logger as global and local rank.
WORKER_ID = 4

utils.logging.create_logger(global_rank=WORKER_ID, local_rank=WORKER_ID, verbose=True)
logger = logging.getLogger("notebook")

# Directory searched for the turbine files (`*.nc`).
turbine_data_dir = pathlib.Path("/p/home/jusers/emmerich1/juwels/data/production")
# Directory with one sub-directory of preprocessed data per turbine.
preprocessed_data_dir = pathlib.Path("/p/home/jusers/emmerich1/juwels/data/production-processed")
# Directory used for the forecast error output files.
results_dir = pathlib.Path("/p/project/deepacf/maelstrom/emmerich1/data/forecast-errors")

turbine_files = a6.utils.paths.list_files(
    turbine_data_dir,
    pattern="**/*.nc",
    recursive=True,
)

# LSWR labels produced by the different classification methods.
results = xr.open_dataset(
    "/p/project/deepacf/maelstrom/emmerich1/data/pca_kpca_kmeans_lswrs_29_40.nc"
)
# Only the k=40 entry along the `k` dimension is used below.
results_40 = results.sel(k=40)
gwl = xr.open_dataset(
    "/p/home/jusers/emmerich1/juwels/code/a6/src/tests/data/gwl.nc"
)
dcv2 = xr.open_dataset(
    "/p/project/deepacf/maelstrom/emmerich1/data/dcv2-lswrs.nc"
)

# `None` stands for the baseline run that uses no LSWR labels as an extra
# input feature.
lswrs = [None, gwl["GWL"], results_40["PCA"], results_40["kPCA"], dcv2["DCv2"]]

In [2]:
@dataclasses.dataclass
class Errors:
    """Pair of error metrics returned by `_calculate_nmae_and_nrmse`.

    Both fields are set to NaN by `_calculate_nmae_and_nrmse` when fewer than
    12 production time steps are available for the evaluated date.
    """

    # Result of `a6.training.metrics.turbine.calculate_nmae`.
    nmae: float
    # Result of `a6.training.metrics.turbine.calculate_nrmse`.
    nrmse: float


def _calculate_nmae_and_nrmse(
    date: pd.Timestamp,
    gs: sklearn.model_selection.GridSearchCV,
    weather_data: list[xr.DataArray],
    turbine: xr.Dataset,
    power_rating: float,
    turbine_variables: _variables.Turbine,
    coordinates: _coordinates.Coordinates,
) -> Errors:
    """Compute NMAE and NRMSE of the fitted model for a single date.

    Args:
        date: Date for which the weather and production data are selected.
        gs: Fitted grid search whose `predict` produces the forecast.
        weather_data: Input variables; each one is restricted to `date`
            before being reshaped into the model input.
        turbine: Turbine dataset holding the production variable.
        power_rating: Passed on to both metric functions.
        turbine_variables: Provides the name of the production variable.
        coordinates: Not used in the function body; kept in the signature.

    Returns:
        The errors for `date`, or `Errors(nan, nan)` if the production data
        of that date has fewer than 12 values.
    """
    logger.debug("Evaluating model error for %s", date)

    pick_date = a6.datasets.methods.select.select_for_date
    as_sklearn_input = a6.features.methods.reshape.sklearn.transpose

    inputs_at_date = []
    for variable in weather_data:
        inputs_at_date.append(pick_date(variable, date=date))
    X_forecast = as_sklearn_input(*inputs_at_date)  # noqa: N806

    production_at_date = pick_date(turbine, date=date)[
        turbine_variables.production
    ]
    y_true = as_sklearn_input(production_at_date)

    # Too little production data for this date: report NaN instead of a
    # metric.
    if y_true.size < 12:
        message = (
            "Less than 12 time steps for production data for date=%s, "
            "setting errors to NaN"
        )
        logger.warning(message, date)
        return Errors(np.nan, np.nan)

    metric_inputs = dict(
        y_true=y_true,
        y_pred=gs.predict(X_forecast),
        power_rating=power_rating,
    )
    return Errors(
        nmae=a6.training.metrics.turbine.calculate_nmae(**metric_inputs),
        nrmse=a6.training.metrics.turbine.calculate_nrmse(**metric_inputs),
    )


In [5]:
# Abort before any expensive work if this worker has no turbine file assigned.
# A negative ID is rejected as well: the processing loop compares the file
# index to WORKER_ID and would otherwise silently process nothing.
if WORKER_ID is not None and not 0 <= WORKER_ID < len(turbine_files):
    logger.warning("Exiting: no file to process")
    raise RuntimeError(
        f"No file to process for WORKER_ID={WORKER_ID} "
        f"({len(turbine_files)} turbine files available)"
    )

coordinates: _coordinates.Coordinates = _coordinates.Coordinates()
# `_variables` is the import alias of `a6.datasets.variables`.
turbine_variables: _variables.Turbine = _variables.Turbine()

# NOTE(review): `result` is not referenced by any of the visible cells; kept
# in case a cell outside this view relies on it.
result = {}

# Fixed seed for the random train split and the regressor so that rerunning
# this cell on a fresh kernel reproduces the same model.
RANDOM_SEED = 42

for i, turbine_file in enumerate(turbine_files):
    # Only the file whose index equals the worker ID is processed.
    if WORKER_ID is not None and i != WORKER_ID:
        continue

    logger.info(
        "Processing turbine %i/%i (path=%s)",
        i,
        len(turbine_files),
        turbine_file,
    )

    turbine_name = turbine_file.name.replace(".nc", "")

    # Preprocessed inputs live in one sub-directory per turbine.
    turbine_path: pathlib.Path = (
        preprocessed_data_dir / f"{turbine_name}/turbine.nc"
    )
    pl_path: pathlib.Path = (
        preprocessed_data_dir / f"{turbine_name}/pl.nc"
    )
    ml_path: pathlib.Path = (
        preprocessed_data_dir / f"{turbine_name}/ml.nc"
    )
    sfc_path: pathlib.Path = (
        preprocessed_data_dir / f"{turbine_name}/sfc.nc"
    )

    logger.info("Reading preprocessed data")
    turbine = xr.open_dataset(turbine_path)
    pl = xr.open_dataset(pl_path)
    ml = xr.open_dataset(ml_path)
    sfc = xr.open_dataset(sfc_path)

    power_rating = turbine_variables.read_power_rating(turbine)
    logger.info("Extracted power rating %i", power_rating)

    # Convert time stamps to dates and create date range
    times_as_dates = a6.utils.times.time_steps_as_dates(
        turbine, coordinates=coordinates
    )
    start, end = min(times_as_dates), max(times_as_dates)
    dates = pd.date_range(start, end, freq="1d")

    # Log the LSWR names only; passing the DataArrays themselves floods the
    # log with their full reprs.
    logger.info(
        "Simulating forecast errors for LSWRS %s for date range %s to %s",
        ["none" if lswr is None else lswr.name for lswr in lswrs],
        start,
        end,
    )

    # The weather variables are identical for every LSWR, so collect them
    # once per turbine instead of once per LSWR.
    weather_features = (
        [ml[var] for var in ml.data_vars]
        + [sfc[var] for var in sfc.data_vars]
        + [pl[var] for var in pl.data_vars]
    )

    forecast_errors = {}

    for lswr in lswrs:
        lswr_name = "none" if lswr is None else lswr.name

        logger.info("Handling LSWR %s", lswr_name)

        outfile: pathlib.Path = (
            results_dir / f"{turbine_name}-forecast-errors-lswr-{lswr_name}.nc"
        )

        if outfile.exists():
            logger.warning(
                "Skipping %s since outfile already exists at %s",
                turbine_path,
                outfile,
            )
            # Actually skip, as the message says, instead of refitting the
            # model for an LSWR that already has results.
            continue

        data = [*weather_features]
        categorical_features = [False] * len(data)

        if lswr is not None:
            # Attach the LSWR labels to the turbine time steps and mark them
            # as the only categorical input feature.
            lswr_labels = lswr.sel(time=turbine[coordinates.time], method="pad")
            data.append(lswr_labels)
            categorical_features.append(True)

        logger.info(
            "Preparing input data for variables %s", [d.name for d in data]
        )

        X = a6.features.methods.reshape.sklearn.transpose(*data)  # noqa: N806
        y = a6.features.methods.reshape.sklearn.transpose(
            turbine[turbine_variables.production]
        )

        # Only the training part of the split is used for fitting.
        (  # noqa: N806
            X_train,
            _,
            y_train,
            _,
        ) = sklearn.model_selection.train_test_split(
            X, y, train_size=1/3, random_state=RANDOM_SEED
        )

        logger.info(
            "Train dataset size is %i hours (~%i days)",
            y_train.size,
            y_train.size // 24,
        )

        logger.info("Fitting model with GridSearchCV")

        param_grid = {
            "learning_rate": [0.03, 0.05, 0.07, 0.1],
            "l2_regularization": [0.0, 1.0, 3.0, 5.0, 7.0],
            "max_iter": [200, 300, 500],
            "max_depth": [15, 37, 63, 81],
            "min_samples_leaf": [23, 48, 101, 199],
            "categorical_features": [categorical_features],
        }
        n_jobs = int(a6.utils.get_cpu_count())

        gs = sklearn.model_selection.GridSearchCV(
            estimator=ensemble.HistGradientBoostingRegressor(
                loss="squared_error",
                random_state=RANDOM_SEED,
            ),
            param_grid=param_grid,
            # NRMSE is an error, hence lower scores are better.
            scoring=sklearn.metrics.make_scorer(
                a6.training.metrics.turbine.calculate_nrmse,
                greater_is_better=False,
                power_rating=power_rating,
            ),
            # 10-fold CV
            cv=10,
            refit=True,
            n_jobs=n_jobs,
        )
        gs = gs.fit(X=X_train, y=y_train.ravel())

        # NOTE: only the first two dates are evaluated. The cell that builds
        # the error dataset uses the same `dates[:2]` slice for its time
        # coordinate, so both places must be changed together.
        # Named `errors_per_date` so that the `results` dataset loaded in the
        # first cell is not overwritten.
        errors_per_date: list[Errors] = (
            a6.utils.parallelize.parallelize_with_futures(
                _calculate_nmae_and_nrmse,
                kwargs=[
                    dict(
                        date=date,
                        gs=gs,
                        weather_data=data,
                        turbine=turbine,
                        power_rating=power_rating,
                        turbine_variables=turbine_variables,
                        coordinates=coordinates,
                    )
                    for date in dates[:2]
                ]
            )
        )
        forecast_errors[lswr_name] = errors_per_date

RANK 4 (LOCAL 4) - INFO - 2024-04-02 13:34:15 - 0:48:47 - Processing turbine 4/45 (path=/p/home/jusers/emmerich1/juwels/data/production/VE229457.nc)
RANK 4 (LOCAL 4) - INFO - 2024-04-02 13:34:15 - 0:48:47 - Reading preprocessed data
RANK 4 (LOCAL 4) - INFO - 2024-04-02 13:34:15 - 0:48:47 - Extracted power rating 3600
RANK 4 (LOCAL 4) - INFO - 2024-04-02 13:34:15 - 0:48:47 - Simulating forecast errors for LSWRS [None, <xarray.DataArray 'GWL' (time: 15777)>
                                                          [15777 values with dtype=int64]
                                                          Coordinates:
                                                            * time     (time) datetime64[ns] 1979-07-01 1979-07-02 ... 2022-10-13, <xarray.DataArray 'PCA' (time: 21826)>
                                                          [21826 values with dtype=int32]
                                                          Coordinates:
                                                

In [6]:
def unpack_errors_per_method(errors, attr: str):
    """Extract one attribute from the errors of all methods.

    Args:
        errors: Mapping whose values are equally ordered sequences of error
            objects, one sequence per method.
        attr: Name of the attribute to read from every error object.

    Returns:
        Nested list indexed as `[position][method]`: one inner list per
        position in the sequences, holding the attribute value of each
        method in the mapping's iteration order.
    """
    rows = []
    # zip(*...) regroups the per-method sequences into one tuple per position.
    for errors_at_position in zip(*errors.values()):
        row = []
        for entry in errors_at_position:
            row.append(getattr(entry, attr))
        rows.append(row)
    return rows

In [51]:
# Assemble the per-date errors of all LSWR methods into a single dataset.
# The time coordinate uses the same `dates[:2]` slice the errors were
# computed for.
dims = ["time", "lswr_method"]
coords = dict(time=dates[:2], lswr_method=[*forecast_errors])

# One DataArray per metric, both laid out as (time, lswr_method).
nmae_da, nrmse_da = (
    xr.DataArray(
        unpack_errors_per_method(forecast_errors, attr=metric),
        coords=coords,
        dims=dims,
    )
    for metric in ("nmae", "nrmse")
)
errors = xr.Dataset(
    data_vars=dict(nmae=nmae_da, nrmse=nrmse_da),
    coords=nmae_da.coords,
)
errors