## Testing the newly trained emulator

Now that we have our trained ML model, we can test it against some independent data. We open the ML training database and then apply the model over a complete three-year period to test how well it performs. The last two years (2021 and 2022) are completely independent, having not been used in the training.

In [None]:
import yaml
import xarray as xr
import xgboost as xgb
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

from dataset.EclandPointDataset import EcDataset

## Settings

In [None]:
# Input database, trained model file, and output store for the emulator run.
data_path = "/data/ecland_i6aj_o400_2010_2022_6h_euro.zarr"
model_path = "./euro_xgb_train_2019_val_2020_all_variables.json"
result_path = "./euro_xgb_train_2019_val_2020_all_variables.zarr"
# Encoding switches; `spatial_encoding` is forwarded to the dataset below,
# `temporal_encoding` is not referenced in the visible cells.
spatial_encoding = False
temporal_encoding = False

# Feature/target lists are read from the YAML config. A parse failure is
# raised immediately: every later cell needs CONFIG, so merely printing the
# error would only postpone the failure to a confusing NameError.
with open('configs/config.yaml', encoding="utf-8") as stream:
    try:
        CONFIG = yaml.safe_load(stream)
    except yaml.YAMLError as exc:
        raise RuntimeError("Failed to parse configs/config.yaml") from exc

## Dataset and model to run the inference for

In [None]:
# Dataset spanning the inference period, with the feature/target lists
# taken from the YAML config.
ds_inf = EcDataset(
    root=data_path,
    start_year=2020,
    end_year=2022,
    roll_out=1,
    clim_features=CONFIG["clim_feats"],
    dynamic_features=CONFIG["dynamic_feats"],
    target_prog_features=CONFIG["targets_prog"],
    target_diag_features=CONFIG["targets_diag"],
    is_add_lat_lon=spatial_encoding,
    is_norm=True,
    point_dropout=0.0,
)

# XGBoost regressor shell into which the trained model is loaded.
# Other options that were commented out in this cell (objective,
# multi_strategy, learning_rate, eval_metric, subsample, sampling_method)
# are not passed here.
xgb_params = {
    "n_estimators": 1000,
    "tree_method": "hist",
    "device": "cuda",
}
model = xgb.XGBRegressor(**xgb_params)
model.load_model(model_path)

## Run the inference

This is currently quite slow; it could likely be sped up by loading all data into memory first, as in the original implementation.

In [None]:
# Constraint function applied to the state after every model step
def apply_physical_constraints(x_state):
    """Clamp a state array to physically valid ranges, in place.

    Columns are matched by position against ``CONFIG["targets_prog"]``.

    :param x_state: 2-D array with one column per prognostic target
    :return: the same ``x_state`` object after clamping
    """
    # Every variable except "e" must be non-negative.
    non_negative = np.array(CONFIG["targets_prog"]) != "e"
    clipped_below = np.clip(x_state[:, non_negative], 0, None)
    x_state[:, non_negative] = clipped_below

    # The last column is capped at 100 (snow cover cannot exceed 100);
    # this assumes snow cover is the final prognostic target.
    capped_last = np.clip(x_state[:, -1], None, 100)
    x_state[:, -1] = capped_last
    return x_state

# Initial state
# Take the state and climatological fields from the first dataset sample and
# store the un-normalized initial state as the first prediction entry.
# NOTE(review): the meaning of each of the six returned elements is inferred
# from the variable names here — confirm against EcDataset.__getitem__.
preds = []
_, x_state, _, _, x_clim, _ = ds_inf[0]
x_state, x_clim = x_state.squeeze(), x_clim.squeeze()
preds.append(EcDataset.inv_transform(x_state, ds_inf.y_prog_means, ds_inf.y_prog_stdevs))

# Inference
# Autoregressive roll-out: only the first element of each sample is read from
# the dataset; the state is always the model's own previous output.
# After the loop `preds` holds len(ds_inf) + 1 entries (initial state plus one
# per step).
# NOTE(review): that length must match ds_inf.times when the results are
# stacked along "time" later — confirm.
for i in tqdm(range(len(ds_inf)), desc="Running ECLand emulator..."):
    x_met, _, _, _, _, _ = ds_inf[i]
    x_met = x_met.squeeze()
    # Model input is the column-wise concatenation [x_met | x_state | x_clim].
    X = np.concatenate((x_met, x_state, x_clim), axis=1)
    y_pred = model.predict(X)
    # Keep only the leading columns, one per prognostic target.
    y_state_inc_pred = y_pred[:,:len(CONFIG["targets_prog"])]
    y_state_inc_pred = EcDataset.inv_transform(y_state_inc_pred, ds_inf.y_prog_inc_mean, ds_inf.y_prog_inc_std) # Unnormalize the increment so that it can be added to the normalized state vector
    # In-place update of the (still normalized) state with the predicted increment.
    x_state += y_state_inc_pred
    x_state = apply_physical_constraints(EcDataset.inv_transform(x_state, ds_inf.y_prog_means, ds_inf.y_prog_stdevs)) # Unnormalize updated state vector and apply consistency constraints
    preds.append(x_state)
    x_state = EcDataset.transform(x_state, ds_inf.y_prog_means, ds_inf.y_prog_stdevs) # Re-normalize state vector for next iteration

## Save results

In [None]:
# Stack the per-step states into a (time, x, variable) array, attach the
# lon/lat coordinates along "x", and write the result as a zarr dataset.
preds = (
    xr.DataArray(
        name="data",
        data=np.stack(preds),
        dims=["time", "x", "variable"],
        coords={
            "x": ds_inf.ds_ecland["x"],
            "time": ds_inf.times,
            "variable": CONFIG["targets_prog"],
        },
    )
    .assign_coords(lon=("x", ds_inf.lon), lat=("x", ds_inf.lat))
    .to_dataset()
)
preds.to_zarr(result_path)

## Plotting example results

Now that we have three years of output from both the ec-land model and our ML emulator, "ai-land", we can plot both trajectories to see how the ML model performs. The first year (2020) was used as the validation year when building the model, so we expect a good fit there. The last two years (2021 and 2022, to the right of the dashed line) were left out of training and so are independent; we see the ML model still performs very well even in this independent period.

It is also worth noting that here we have performed a complete three-year run with the "ai-land" model (~4300 applications) and we find very little compounding of errors, allowing us to accurately capture the full ec-land climatology.

In [None]:
# Open the reference (ec-land) store and the emulator output store, both
# restricted to the same evaluation period, and pick the "data" variable.
eval_period = slice("2020", "2022")
true = xr.open_zarr(data_path).sel(time=eval_period)["data"]
pred = xr.open_zarr(result_path).sel(time=eval_period)["data"]

def find_nearest_idx(
    arr1: np.ndarray,
    arr2: np.ndarray,
    val1: float,
    val2: float,
) -> int:
    """Return the index whose (arr1, arr2) pair is closest to (val1, val2).

    Closeness is the sum of the absolute differences in both arrays; the
    result is whatever ``argmin`` yields on that sum (an integer-like value
    for NumPy input).

    :param arr1: first array
    :param arr2: second array
    :param val1: value to look for in the first array
    :param val2: value to look for in the second array
    :return: index of the smallest combined distance
    """
    dist1 = np.abs(arr1 - val1)
    dist2 = np.abs(arr2 - val2)
    combined = dist1 + dist2
    return combined.argmin()

# Location to inspect, and the index of the closest grid point along "x".
lat = 50.72
lon = 7.11
x_idx = find_nearest_idx(true.lat, true.lon, lat, lon).values

def ailand_plot(var_name, label=None, test_date="2021-01-01"):
    """Plot the ec-land and ai-land time series of one variable at the selected point.

    Both series are drawn on one figure for grid point ``x_idx``, a dashed
    vertical line marks ``test_date``, and the figure is shown.

    :param var_name: name of the variable to plot (a value of the ``variable`` coordinate)
    :param label: y-axis label; falls back to ``var_name`` when omitted
    :param test_date: date to plot vertical line (train/test split), defaults to "2021-01-01"
    :return: None; the figure is displayed via ``plt.show()``
    """
    _, ax = plt.subplots(figsize=(9, 4))
    true.isel(x=x_idx).sel(variable=var_name).plot(label="ec-land", ax=ax)
    pred.isel(x=x_idx).sel(variable=var_name).plot(label="ai-land", ax=ax)

    # First available time step on the split date.
    split_time = pred.sel(time=test_date).time.values[0]
    ax.axvline(split_time, color="k", linestyle="--")
    ax.set_xlim(pred.time.values[[0, -1]])
    # Passing None to set_ylabel would discard the axis label, so fall back
    # to the variable name when no explicit label is given.
    ax.set_ylabel(var_name if label is None else label)
    # The line labels are only visible once a legend is drawn.
    ax.legend()
    plt.show()

# Produce one comparison figure per prognostic target listed in the config.
for var in CONFIG["targets_prog"]:
    ailand_plot(var)

## Feature importances

In [None]:
# Name the booster's input features in the same order the inputs were
# concatenated for prediction (dynamic, prognostic state, climatological),
# then plot the importances.
booster = model.get_booster()
booster.feature_names = [
    *CONFIG["dynamic_feats"],
    *CONFIG["targets_prog"],
    *CONFIG["clim_feats"],
]
xgb.plot_importance(model)