# Relation of power production from renewables to LSWRs


In [11]:
%load_ext autoreload
%autoreload 2
import pathlib

import numpy as np
import matplotlib.pyplot as plt
import xarray as xr
import pandas as pd

import a6
import a6.plotting._colors as _colors

# Create the a6 logger with global/local rank 0 and verbose output disabled.
a6.utils.logging.create_logger(
    global_rank=0,
    local_rank=0,
    verbose=False,
)

# Input dataset that is opened with `xr.open_dataset` in the next cell.
path = pathlib.Path(
    "/p/project1/deepacf/emmerich1/data/ecmwf_era5/era5_pl_1964_2023_12.nc"
)
# Output directory for the figures saved by the plotting cells.
plots = pathlib.Path("/p/project1/deepacf/emmerich1/plots/paper-1")
# NOTE(review): `pca_dir` and `kmeans_dir` are not referenced by any of the
# visible cells -- confirm they are still needed.
pca_dir = pathlib.Path("/p/scratch/deepacf/emmerich1/pca")
kmeans_dir = pathlib.Path("/p/project1/deepacf/emmerich1/data/kmeans")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
%%time

# Open the dataset stored at `path`.
ds = xr.open_dataset(path)

# NOTE(review): `coordinates` and `variables` are not used by any of the
# visible cells -- confirm they are still needed.
coordinates = a6.datasets.coordinates.Coordinates()
variables = a6.datasets.variables.Model()

# Last expression of the cell: shows the dataset summary as cell output.
ds

CPU times: user 525 ms, sys: 114 ms, total: 639 ms
Wall time: 2.22 s


## Plotting

In [62]:
import matplotlib.ticker


# Clustering results; selected below by the number of clusters `k`.
results = xr.open_dataset(
    "/p/project1/deepacf/emmerich1/data/pca_kpca_kmeans_lswrs_30_40.nc"
)

# Number of LSWR categories used throughout the notebook.
n_lswr_categories = 30
results_pca_kmeans = results.sel(k=n_lswr_categories)
# Reference classification; its "GWL" variable is used as the first method.
gwl = xr.open_dataset(
    "/p/home/jusers/emmerich1/juwels/code/a6/src/tests/data/gwl.nc"
)

# NOTE: Uncomment the block below to rebuild the DCv2 assignments from the
# training checkpoint and save them to disk. Otherwise the previously saved
# file is read from disk further down.
# dcv2 = torch.load(
#     "/p/project1/deepacf/emmerich1/dcv2/multi-level-all-fields-1964-2023-1200-epochs-k-30/epoch-799-assignments.pt",
#     map_location=torch.device("cpu"),
# )
# dcv2 = xr.DataArray(
#     # Add +1 to conform with the GWL labels
#     dcv2.numpy()[0] + 1,
#     name="DCv2",
#     coords={"time": results["time"]},
#     dims=["time"],
# )
# dcv2.to_netcdf("/p/project1/deepacf/emmerich1/data/dcv2-lswrs.nc")

# Read the saved DCv2 assignments as a DataArray.
dcv2 = xr.open_dataset("/p/project1/deepacf/emmerich1/data/dcv2-lswrs.nc")[
    "DCv2"
]

# The LSWR label series that are compared against the power production
# (one DataArray per method).
lswrs = [gwl["GWL"], dcv2, results_pca_kmeans["PCA"]]


# Set ggplot styles and update Matplotlib with them.
def grey_background(ax):
    """Give ``ax`` a ggplot-like look: grey face colour and no frame.

    Args:
        ax: Axes-like object providing ``set_facecolor`` and a ``spines``
            mapping whose values provide ``set_visible``.
    """
    # Give plot a gray background like ggplot.
    ax.set_facecolor("#EBEBEB")
    # Remove border around plot. A plain loop is used because the calls are
    # made purely for their side effect.
    for side in ax.spines:
        ax.spines[side].set_visible(False)


def plot_grid_with_grey_background(
    ax, minor_ticks_x: int = 0, minor_ticks_y: int = 0
):
    """Style ``ax`` with a grey background and a white grid.

    Args:
        ax: Axes to style.
        minor_ticks_x: Number of minor grid lines between two major grid
            lines on the x-axis; values ``<= 0`` draw no minor x grid.
        minor_ticks_y: Same as ``minor_ticks_x`` for the y-axis.
    """
    grey_background(ax)

    ax.grid(which="major", color="white", linewidth=1)
    # Minor ticks have to be enabled for the minor grid lines to exist.
    ax.minorticks_on()

    per_axis = (
        ("x", ax.xaxis, minor_ticks_x),
        ("y", ax.yaxis, minor_ticks_y),
    )
    for axis_name, axis, n_minor in per_axis:
        if n_minor > 0:
            axis.grid(which="minor", color="white", linewidth=0.5)
            # Hide the minor tick marks themselves but keep their grid lines.
            ax.tick_params(
                axis=axis_name, which="minor", bottom=False, left=False
            )
            # `AutoMinorLocator(n)` splits a major interval into n parts,
            # i.e. it yields n - 1 minor grid lines.
            axis.set_minor_locator(
                matplotlib.ticker.AutoMinorLocator(n_minor + 1)
            )

## Clean production data for plotting

In [None]:
%%time

"""
NOTE: Preprocess turbine data. This takes ~22 min, so prefer loading (cell below)
if no changes to the preprocessing are required.
"""

# Recursively collect all netCDF files below the raw production directory.
paths = list(
    pathlib.Path("/p/home/jusers/emmerich1/juwels/data/production").glob(
        "**/*.nc"
    )
)
# Show which files were found.
print(paths)


def remove_outliers(data: xr.Dataset) -> xr.Dataset:
    """Clean the production data of a single turbine.

    The power rating is parsed from the first whitespace-separated token of
    the ``"power rating"`` attribute and printed.

    Args:
        data: Turbine dataset carrying a ``"power rating"`` attribute.

    Returns:
        The result of applying ``clean_production_data`` to ``data``.
    """
    rating_tokens = data.attrs["power rating"].split()
    power_rating = float(rating_tokens[0])
    print(power_rating)
    # Intended filter (per the original author): keep only data points whose
    # production is greater than 0 and lower than the power rating.
    # NOTE(review): the exact bounds live in a6 -- confirm there.
    cleaning_step = a6.datasets.methods.turbine.clean_production_data(
        power_rating=power_rating,
    )
    return cleaning_step.apply_to(data)


# Maps the file name of each raw production file to its cleaned dataset.
turbines: dict[str, xr.Dataset] = {
    path.name: remove_outliers(xr.open_dataset(path)) for path in paths
}

# NOTE: The loop below writes the preprocessed data to disk every time this
# cell runs; comment it out to skip saving.
# NOTE(review): `name` is `path.name` and therefore already ends in ".nc",
# so the files are written with a doubled ".nc.nc" suffix.

for name, data in turbines.items():
    data.to_netcdf(
        f"/p/home/jusers/emmerich1/juwels/data/production-cleaned-for-analysis/{name}.nc"
    )

In [None]:
# Open preprocessed turbine data if available
# Recursively collect the preprocessed netCDF files. This rebinds `paths`
# (and `turbines` below) from the preprocessing cell.
paths = list(
    pathlib.Path(
        "/p/home/jusers/emmerich1/juwels/data/production-cleaned-for-analysis"
    ).glob("**/*.nc")
)
# Maps the file name of each preprocessed file to its opened dataset.
turbines: dict[str, xr.Dataset] = {
    path.name: xr.open_dataset(path) for path in paths
}

## Relation of LSWRs to Power Production

In [None]:
%%time

import dataclasses
import datetime


@dataclasses.dataclass
class PowerPerMode:
    """Production statistics collected for a single LSWR (mode).

    All list fields start empty and are filled by ``get_power_per_lswr``.
    """

    # Label of the LSWR these statistics belong to.
    label: int
    # One entry per turbine: number of days with production data on which
    # the LSWR appeared.
    measurements: list[int] = dataclasses.field(default_factory=list)
    # Daily production sums of all turbines on days of the LSWR.
    sum: list[float] = dataclasses.field(default_factory=list)
    # Daily production means of all turbines on days of the LSWR.
    mean: list[float] = dataclasses.field(default_factory=list)
    # Daily production standard deviations on days of the LSWR.
    std: list[float] = dataclasses.field(default_factory=list)
    # Daily means divided by the power rating of the respective turbine.
    normalized_mean: list[float] = dataclasses.field(default_factory=list)
    # Daily standard deviations divided by the respective power rating.
    normalized_std: list[float] = dataclasses.field(default_factory=list)


def get_power_per_lswr(data: xr.DataArray) -> dict[int, PowerPerMode]:
    """Collect the daily production statistics of all turbines per LSWR.

    For every turbine in the module-level ``turbines`` mapping, the
    production is resampled to daily values (sum, mean, std, and mean/std
    normalized by the turbine's power rating). For every LSWR, the daily
    values of those days on which the LSWR appeared are appended to the
    respective ``PowerPerMode``.

    Args:
        data: Time series of LSWR labels of one method. Must be a named
            ``DataArray`` since ``data.name`` is used for progress output.

    Returns:
        Mapping of LSWR label to the statistics collected for that LSWR.
        An LSWR without any overlapping day for a turbine contributes a
        ``0`` to ``measurements`` and no values to the other fields.
    """
    modes = a6.modes.methods.determine_lifetimes_of_modes(data)
    # The days of an LSWR do not depend on the turbine, so the lookup sets
    # are built once here instead of once per turbine and mode.
    days_per_mode = [
        set(xr.DataArray(list(mode.get_dates())).values) for mode in modes
    ]

    power_per_mode = {
        mode.label: PowerPerMode(label=mode.label) for mode in modes
    }

    for i, turbine in enumerate(turbines.values()):
        print(f"{data.name}: {i}/{len(turbines)}", end="\r")
        power_rating = float(turbine.attrs["power rating"].split()[0])

        # Resample to daily production and calculate sum, relative mean and std
        resampled = turbine["production"].resample({"time": "1d"}, skipna=True)
        daily_sum = resampled.sum(skipna=True)
        daily_mean = resampled.mean(skipna=True)
        daily_std = resampled.std(skipna=True)
        # Keys are the names of the `PowerPerMode` list fields to extend.
        daily_statistics = {
            "sum": daily_sum,
            "mean": daily_mean,
            "normalized_mean": daily_mean / power_rating,
            "std": daily_std,
            "normalized_std": daily_std / power_rating,
        }
        production_days = set(daily_sum.time.values)

        for mode, mode_days in zip(modes, days_per_mode, strict=True):
            mode_power = power_per_mode[mode.label]

            # Get time steps of production where LSWR appeared
            intersection = sorted(production_days & mode_days)

            # Count number of days that contribute to the results
            mode_power.measurements.append(len(intersection))

            # Select the time steps of LSWR appearance for every statistic
            for field_name, daily in daily_statistics.items():
                selected = daily.sel(time=intersection)
                getattr(mode_power, field_name).extend(
                    selected.values.flatten().tolist()
                )
    return power_per_mode


# Per-LSWR production statistics for every method in `lswrs`, keyed by the
# name of the respective DataArray.
power_per_method = {lswr.name: get_power_per_lswr(lswr) for lswr in lswrs}

In [None]:
import pandas as pd

# NOTE(review): `latex_code` is not used by any of the visible cells --
# confirm it is still needed.
latex_code = []


def power_mean_with_std_as_string(stats: list[float]) -> str:
    r"""Format ``stats`` as the LaTeX math string ``$mean \pm std$`` in percent.

    NaN values are ignored. Mean and standard deviation are multiplied by
    100 and rendered with two decimals.

    Args:
        stats: Values given as fractions (e.g. ``0.25`` for 25 %).

    Returns:
        String of the form ``"$25.00 \pm 1.50$"``.
    """
    mean_percent = np.nanmean(stats) * 100
    std_percent = np.nanstd(stats) * 100
    # Raw f-string: `\p` is not a valid escape sequence and triggers a
    # warning in a regular string literal.
    return rf"${mean_percent:.2f} \pm {std_percent:.2f}$"


# Table columns: maps the LaTeX column header to a function that computes
# one value per LSWR from the statistics of a single method.
# The headers are raw strings because `\m` and `\%` are not valid escape
# sequences in regular string literals.
columns = {
    "$N$": lambda per_lswr: [
        np.nansum(stats.measurements) for stats in per_lswr.values()
    ],
    r"$P_{\mathrm{total}}$ [kW]": lambda per_lswr: [
        f"{int(np.nansum(stats.sum)):d}" for stats in per_lswr.values()
    ],
    r"$P^{\mathrm{mean}}_{\mathrm{normalized}}$ [\%]": lambda per_lswr: [
        power_mean_with_std_as_string(stats.normalized_mean)
        for stats in per_lswr.values()
    ],
    r"$P^{\mathrm{std}}_{\mathrm{normalized}}$ [\%]": lambda per_lswr: [
        power_mean_with_std_as_string(stats.normalized_std)
        for stats in per_lswr.values()
    ],
}

# One table column per (method, statistic) pair; the tuple keys become a
# two-level column index in the DataFrame.
reform = {
    (name, column): func(method)
    for name, method in power_per_method.items()
    for column, func in columns.items()
}

df = pd.DataFrame.from_dict(reform)

# Add 1 to start indexing at 1 to conform with the LSWR labels
df.index += 1

code = df.to_latex(
    float_format="%.2f",
    label="production-per-lswr-per-method",
    caption="Power production for the resulting LSWRs.",
)


with open(
    "/p/home/jusers/emmerich1/juwels/code/a6/notebooks/power-production-table.tex",
    "w",
) as f:
    f.write(code)

# Last expression of the cell: shows the table as cell output.
df

In [None]:
import matplotlib.pyplot as plt


def plot_power_per_mode(
    powers: dict[str, dict[int, PowerPerMode]],
) -> tuple[plt.Figure, np.ndarray]:
    """Plot the power production for each LSWR and method.

    One subplot row is drawn per method. Each row shows, per LSWR, a bar
    with the mean of the normalized daily mean production (in percent, with
    the standard deviation as error bar), overlaid with a violin of the
    underlying distribution. The figure is saved to
    ``plots / "lswrs-power-production-comparison.pdf"``.

    Relies on the module-level names ``n_lswr_categories``, ``plots`` and
    ``_colors``.

    Args:
        powers: Mapping of method name to the per-LSWR statistics of that
            method. Every method must provide ``n_lswr_categories`` LSWRs.

    Returns:
        The figure and a 1-D array with one axes per method.
    """
    n_rows = len(powers)
    n_cols = 1
    labels = np.arange(1, n_lswr_categories + 1, dtype=int)

    x_lims = labels.min() - 0.5, labels.max() + 0.5
    colors = _colors.create_colors_for_labels(labels)

    fig, axs = plt.subplots(
        figsize=(6 * n_cols, 2 * n_rows),
        nrows=n_rows,
        ncols=n_cols,
        sharex=True,
        sharey=True,
        # Always get a 2-D array so that indexing also works for one method.
        squeeze=False,
    )
    axes = axs[:, 0]

    # Figure-level title; `plt.title` would only set the title of the last
    # subplot, which is overwritten by the method name below.
    fig.suptitle("Power production per LSWRs")

    for i, (method, powers_per_lswr) in enumerate(powers.items()):
        ax = axes[i]
        powers_means = [
            np.nanmean(power.normalized_mean) * 100
            for power in powers_per_lswr.values()
        ]
        powers_stds = [
            np.nanstd(power.normalized_mean) * 100
            for power in powers_per_lswr.values()
        ]

        ax.bar(
            labels,
            powers_means,
            yerr=powers_stds,
            width=1.0,  # removes gaps between the bars
            color=colors,
            align="center",
            alpha=1,
            ecolor="black",
            capsize=3,
        )

        # NOTE(review): the violins previously used an undefined global
        # `durations`; they now show the distribution of the values the bars
        # summarize -- confirm this is the intended content.
        samples = []
        positions = []
        for label, power in zip(labels, powers_per_lswr.values(), strict=True):
            values = np.asarray(power.normalized_mean, dtype=float) * 100
            values = values[np.isfinite(values)]
            # The density estimate needs at least two distinct values.
            if values.size > 1 and values.max() > values.min():
                samples.append(values)
                positions.append(label)

        if samples:
            parts = ax.violinplot(
                samples,
                positions=positions,
                widths=1,
                showmeans=True,
                showmedians=True,
                showextrema=False,
                quantiles=[[0.95] for _ in samples],
            )

            parts["cmeans"].set_color("red")
            parts["cmedians"].set_color("orange")
            parts["cquantiles"].set_color("blue")

            for body in parts["bodies"]:
                body.set_facecolor("black")
                body.set_alpha(0.5)

        ax.set_title(method)

        # Only the bottom row gets an x-axis label since the x-axis is shared.
        if i == n_rows - 1:
            ax.set_xlabel("LSWR")

        ax.set_ylabel(r"$P^{\mathrm{mean}}_{\mathrm{normalized}}$ [%]")
        ax.set_xlim(*x_lims)
        ax.set_xticks(labels)
        ax.set_xticklabels(labels, rotation=90)
        ax.yaxis.grid(True)

    fig.tight_layout()
    fig.savefig(plots / "lswrs-power-production-comparison.pdf")
    return fig, axes


# Draw and save the comparison figure for all methods.
plot_power_per_mode(power_per_method)