In [None]:
from pathlib import Path

import dask
import numpy as np
import pandas as pd
import xarray as xr
from dask.distributed import Client
from seapopym.configuration.no_transport.parameter import ForcingParameters, ForcingUnit, KernelParameters

from seapopym_optimization import wrapper

User parameters

A batch of 1000 samples takes about 48 seconds to run on my machine.


In [None]:
# Number of input samples evaluated in one batch; results are written to disk after each batch.
nb_samples_by_batch = 1000

# Names of the quantities of interest computed for every station (used as output column labels).
quantity_of_interest = ["mean", "variance", "argmax"]

# The forcing is selected over [time_start, time_end]; the quantities of interest are computed
# only over [time_start_analysis, time_end].
# NOTE(review): the first year is presumably left out as model spin-up — confirm.
time_start = "2005-01-01"
time_start_analysis = "2006-01-01"
time_end = "2007-01-01"

In [None]:
# Load the station table and index it by station name in a single expression.
stations_locations = pd.read_json(
    "../1_data_processing/1_3_Sensibility/stations_locations.json"
).set_index("name")
stations_locations

In [None]:
# One output column per (station, quantity of interest) pair.
multi_index_columns = pd.MultiIndex.from_product(
    iterables=[stations_locations.index, quantity_of_interest],
    names=["station", "quantity_of_interest"],
)
# Flat "<station>_<quantity>" labels matching the MultiIndex columns, in the same order.
column_index_flatten = pd.Index(
    [f"{station_name}_{qoi_name}" for station_name, qoi_name in multi_index_columns],
    name="station",
)
multi_index_columns

In [None]:
# Create a dask.distributed client with default settings; the delayed cost-function
# evaluations below are computed while this client is active.
client = Client()
client

Samples (Sobol sequence)


In [None]:
# Load the pre-generated parameter samples: one row per sample.
# Each row is unpacked into five model parameters by `cost_function`.
input_parameters = pd.read_parquet("./input_samples.parquet")
input_parameters

Set up the output file. It is filled batch after batch with the quantity-of-interest (QoI) values.


In [None]:
output_sobol_index_filepath = Path("./output_sobol_index.parquet")
if not output_sobol_index_filepath.exists():
    # First run: start from an empty table with the expected columns and persist it right away.
    output_sobol_index = pd.DataFrame(columns=multi_index_columns)
    output_sobol_index.to_parquet(output_sobol_index_filepath)
else:
    # Later runs: reload the results stored by previous runs.
    output_sobol_index = pd.read_parquet(output_sobol_index_filepath)
output_sobol_index

---

# Cost function definition

Prepare forcing and parameters definition


In [None]:
# Open the station forcing store and keep only the simulated period.
input_forcing = xr.open_dataset(
    "../1_data_processing/1_3_Sensibility/all_stations.zarr",
    engine="zarr",
).sel(time=slice(time_start, time_end))
input_forcing

In [None]:
# Forcing definition shared by every model run: temperature is built from the "T" entry and
# primary production from the "npp" entry of `input_forcing`.
# NOTE(review): resolution=0.08333 and timestep=1 carry no units here — presumably degrees
# (1/12°) and days; confirm against the seapopym documentation.
FORCING_PARAMETERS = ForcingParameters(
    temperature=ForcingUnit.from_dataset(forcing=input_forcing, name="T", resolution=0.08333, timestep=1),
    primary_production=ForcingUnit.from_dataset(input_forcing, name="npp", resolution=0.08333, timestep=1),
)

In [None]:
def wrapper_model_generator_no_transport(fg_parameters):
    """Build a no-transport model from one flat sequence of functional-group parameters.

    Parameters
    ----------
    fg_parameters
        Flat sequence of parameter values. It is wrapped as the single row of a
        2-D array before being handed to `wrapper.FunctionalGroupGeneratorNoTransport`.

    Returns
    -------
    The object returned by `wrapper.model_generator_no_transport`, configured with
    the notebook-level `FORCING_PARAMETERS`.
    """
    single_row_parameters = np.array([fg_parameters])
    functional_groups = wrapper.FunctionalGroupGeneratorNoTransport(single_row_parameters)
    model = wrapper.model_generator_no_transport(
        fg_parameters=functional_groups,
        forcing_parameters=FORCING_PARAMETERS,
    )
    return model

Official scoring function


In [None]:
def compute_quantity_of_interest(biomass_forcing_station, station):
    """Reduce one station's biomass series to its quantities of interest.

    Parameters
    ----------
    biomass_forcing_station
        Object exposing `mean()`, `var()` and `argmax("time")`, each returning a
        result with a scalar `.data` attribute.
    station
        Station identifier; accepted for interface compatibility and not used.

    Returns
    -------
    tuple
        `(mean, variance, argmax)` as `(float, float, int)`; `argmax` is the
        position of the maximum along the "time" dimension.
    """
    mean_biomass = float(biomass_forcing_station.mean().data)
    biomass_variance = float(biomass_forcing_station.var().data)
    # TODO: Compute the DayOfYear of the argmax
    peak_time_position = int(biomass_forcing_station.argmax("time").data)
    return (mean_biomass, biomass_variance, peak_time_position)


@dask.delayed
def cost_function(x: np.ndarray):
    """Run the model for one parameter sample and collect the quantities of interest.

    Parameters
    ----------
    x : np.ndarray
        One sample holding exactly five values, in this order: energy transfer,
        tr_0, gamma_tr, inv_lambda_0, gamma_inv_lambda.

    Returns
    -------
    list
        Flat list with the values returned by `compute_quantity_of_interest`
        for each station, following the order of `stations_locations.index`.
    """
    energy_transfert, tr_0, gamma_tr, inv_lambda_0, gamma_inv_lambda = x.T
    # The two leading entries are held at 0; the remaining five come from the sample.
    model = wrapper_model_generator_no_transport(
        [0, 0, energy_transfert, tr_0, gamma_tr, inv_lambda_0, gamma_inv_lambda]
    )
    model.run()

    # Only the analysis window of the simulated biomass is used.
    biomass = model.export_biomass().sel(time=slice(time_start_analysis, time_end))

    qoi_values = []
    for station_name in stations_locations.index:
        station_biomass = biomass.sel(
            latitude=stations_locations.loc[station_name, "latitude"],
            longitude=stations_locations.loc[station_name, "longitude"],
            functional_group=0,
        )
        qoi_values.extend(compute_quantity_of_interest(station_biomass, station_name))

    return qoi_values

In [None]:
def batch_cost_function_execution(input_parameters: pd.DataFrame) -> np.ndarray:
    """Evaluate the cost function for every row of a batch of samples.

    Parameters
    ----------
    input_parameters : pd.DataFrame
        Batch of samples, one row per sample.

    Returns
    -------
    np.ndarray
        Array built from the per-sample results, in row order.
    """
    delayed_evaluations = []
    for sample_values in input_parameters.to_numpy():
        delayed_evaluations.append(cost_function(sample_values))
    # A single compute call evaluates all delayed tasks of the batch together.
    computed = dask.compute(*delayed_evaluations)
    return np.array(computed)

Test function


In [None]:
# TEST FUNCTION
# Stub kept for debugging: uncomment it to replace the real batch execution with a constant array.
# NOTE(review): the stub returns len(quantity_of_interest) columns, whereas the batch loop labels
# the results with multi_index_columns (stations x quantities of interest) — confirm the shape
# before enabling it with more than one station.
# def batch_cost_function_execution(input_parameters: pd.DataFrame) -> np.ndarray:
#     return np.full((input_parameters.shape[0], len(quantity_of_interest)), 1)

---


Run as many batches as you can


In [None]:
# Evaluate the samples batch by batch. The accumulated results are written to disk after
# every batch, so an interrupted run can be resumed by re-executing this cell.
nb_samples = len(input_parameters)
# Stepping through the start positions avoids an empty trailing batch when the number of
# samples is an exact multiple of the batch size.
for batch_number, min_batch in enumerate(range(0, nb_samples, nb_samples_by_batch)):
    max_batch = min(min_batch + nb_samples_by_batch, nb_samples)
    print(f"Batch {batch_number} = {min_batch} : {max_batch}")

    batch_samples = input_parameters.iloc[min_batch:max_batch]

    # Decide what is left to do from the sample labels themselves. `max_batch` is an
    # exclusive end position, so testing it against the stored index misses finished
    # batches and would append their rows a second time.
    already_computed = batch_samples.index.isin(output_sobol_index.index)
    pending_samples = batch_samples.loc[~already_computed]
    if len(pending_samples) == 0:
        continue

    results = batch_cost_function_execution(pending_samples)
    results = pd.DataFrame(data=results, columns=multi_index_columns, index=pending_samples.index)

    output_sobol_index = pd.concat([output_sobol_index, results])
    output_sobol_index.to_parquet(output_sobol_index_filepath)

Show output


In [None]:
# Display the accumulated table: one row per evaluated sample, one column per
# (station, quantity of interest) pair.
output_sobol_index