# Creating a monthly NEE dataset using the CarbonTracker model

This notebook outlines the workflow for creating a monthly net ecosystem exchange (NEE) dataset. Before running it, you must have run the CarbonTracker workflow, which trains and creates the model.

To create the dataset you need:

- CarbonTracker model
- ERA5 (monthly)
- ERA5-land (monthly)
- SPEI (monthly)
- MODIS (monthly)
- Biomass (yearly)
- Copernicus Landcover (yearly)


In [7]:
from pathlib import Path

import numpy as np
import pandas as pd
import xarray as xr
from dask.distributed import Client
from onnxruntime import InferenceSession

import excited_workflow
from excited_workflow.source_datasets import datasets


# Create a Dask distributed client with default arguments (no scheduler
# address is passed).
client = Client()

In [8]:
# Input files: CarbonTracker monthly fluxes and the regions file.
cb_file = Path("/data/volume_2/EXCITED_prepped_data/CT2022.flux1x1-monthly.nc")
regions_file = Path("/data/volume_2/EXCITED_prepped_data/regions.nc")
home_path = Path.home()

# The model directory is searched for recursively below the home directory and
# the last match in sorted path order is used.
# NOTE(review): presumably the directory names carry a sortable timestamp so
# that the last match is the most recent run - confirm.
model_dirs = sorted(home_path.rglob("carbon_tracker-*"))
if not model_dirs:
    # Fail with an actionable message instead of a bare IndexError.
    raise FileNotFoundError(
        f"No 'carbon_tracker-*' directory found under {home_path}; "
        "run the CarbonTracker workflow first to train and create the model."
    )
output_dir = model_dirs[-1]

# Names of the source datasets that are loaded and merged into the model input.
desired_data = [
    "biomass",
    "spei",
    "modis",
    "era5_monthly",
    "era5_land_monthly",
    "copernicus_landcover"
]

# Predictor variables; they are selected from the merged dataset and passed to
# the model in this column order.
# NOTE(review): the order presumably has to match the one used during
# training - confirm.
x_keys = ["d2m", "mslhf", "msshf", "ssr", "str", "t2m", "spei", "NIRv", "skt",
            "stl1", "swvl1", "lccs_class"]
# Name of the target variable in the CarbonTracker dataset.
y_key = "bio_flux_opt"

In [9]:
# Open the CarbonTracker dataset and pass it through the project's
# convert_timestamps helper; the result doubles as the target grid below.
ds_cb = excited_workflow.utils.convert_timestamps(xr.open_dataset(cb_file))
ds_regions = xr.open_dataset(regions_file)

# Load every requested source dataset at monthly frequency on the
# CarbonTracker grid, then combine them into a single dataset.
input_parts = [
    datasets[name].load(freq="monthly", target_grid=ds_cb)
    for name in desired_data
]
ds_input = xr.merge(input_parts)

In [10]:
def run_model(output_dir, df, x_keys):
    """Run the stored ONNX model on the selected columns of a DataFrame.

    The model file is read and a new inference session is created on every
    call.

    Args:
        output_dir: Directory containing ``lightgbm.onnx``; must support the
            ``/`` operator (e.g. a ``pathlib.Path``).
        df: DataFrame that holds at least the columns listed in ``x_keys``.
        x_keys: Column names to select from ``df``, in the order in which
            they are handed to the model.

    Returns:
        The first output returned by the inference session.
    """
    model_path = output_dir / "lightgbm.onnx"
    with open(model_path, "rb") as model_file:
        model_bytes = model_file.read()

    session = InferenceSession(model_bytes)
    # The array keeps whatever dtype pandas produces.
    # NOTE(review): confirm this matches the input type declared by the model.
    features = df[x_keys].to_numpy()
    outputs = session.run(None, {'X': features})

    return outputs[0]

In [11]:
# ds_regions and ds_cb are already opened (and ds_cb's timestamps converted)
# in the data-loading cell above. Repeating those steps here would apply
# convert_timestamps to its own output, so they are not redone.

# Keep only the predictor variables and attach the `transcom_regions`
# variable, which is used to select the region of interest.
dsx = ds_input[x_keys]
ds_merge = xr.merge([dsx, ds_regions["transcom_regions"]])

In [94]:
# For every variable, flag whether all grid cells (over both latitude and
# longitude) are missing. `.compute()` triggers the evaluation here, so the
# flags are available as concrete values afterwards.
allnan = ds_merge.isnull().all(dim=["latitude","longitude"]).compute()

In [95]:
predictions = []
times = []
for idx, dtime in enumerate(ds_merge["time"]):
    ds_sel = ds_merge.isel(time=idx)
    if not any([allnan.isel(time=idx)[var] for var in allnan.data_vars]):
        ds_sel = ds_sel.compute()
        ds_na = ds_sel.where(ds_merge["transcom_regions"] == 2)
        df_sel = ds_na.to_dataframe().dropna()
        predictions.append(run_model(output_dir, df_sel, x_keys))
        times.append(dtime.to_numpy())


In [92]:
dfs = [pd.DataFrame(data=pred, index=df_sel.index, columns=["NEE"]) for pred in predictions]
dss = [df.to_xarray().sortby(["latitude","longitude"]) for df in dfs]
ds_out = xr.concat(dss, dim="time")
ds_out["time"] =np.array(times)


In [93]:
# Replace the attributes of the NEE variable with an empty dict.
# NOTE(review): looks like a placeholder for real metadata (e.g. units,
# long_name) - confirm.
ds_out["NEE"].attrs = {}