# Training XGBoost to Emulate ec-land

In this notebook we take some example Zarr data (similar to that created by this project's other functionality) and train an ML emulator of the ec-land land surface model. Here we train on climatological and meteorological features, together with the previous model state values, to predict the 6-hourly model state update.

In [None]:
import random
import xarray as xr
import xgboost as xgb
from sklearn.ensemble import HistGradientBoostingRegressor
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
# from sklearn.metrics import r2_score, mean_absolute_percentage_error, mean_absolute_error
from sklearn.metrics import mean_squared_error


# def r2_score_multi(y_pred: np.ndarray, y_true: np.ndarray) -> float:
#     """Calculated the r-squared score between 2 arrays of values

#     :param y_pred: predicted array
#     :param y_true: "truth" array
#     :return: r-squared metric
#     """
#     return r2_score(y_pred.flatten(), y_true.flatten())

In [None]:
# Open the Zarr store once and split it by year into a training period
# (2010-2019) and a validation period (2020).
ZARR_PATH = "/data/ecland_i6aj_o400_2010_2022_6h_euro.zarr"
ds_all = xr.open_zarr(ZARR_PATH)
ds_train = ds_all.sel(time=slice("2010", "2019"))
ds_val = ds_all.sel(time=slice("2020", "2020"))
# Inspect the training dataset and see the available model variables
ds_train

Next we select the "features" and "targets" that we want to use in the construction of our ML model. Then we select this subset of variables from the dataset.

In [None]:
# Climatological fields (entries of the `clim_variable` coordinate) used as features.
clim_feat_lst = [
    "clim_clake",
    "clim_ctype",
    "clim_landsea",
    "clim_cu",
    "clim_cvh",
    "clim_cvl",
    "clim_geopot",
    "clim_sdfor",
    "clim_sdor",
    # "clim_sotype",
    # "clim_tvh",
    # "clim_tvl",
    "clim_theta_cap",
    "clim_theta_pwp",
    "clim_veg_covh",
    "clim_veg_covl",
    "clim_veg_z0mh",
    "clim_veg_z0ml",
    "clim_veg_rsminh",
    "clim_veg_rsminl",
]

# Model state variables whose 6-hourly update is predicted.
targ_lst = [
    "swvl1",
    "swvl2",
    "swvl3",
    "stl1",
    "stl2",
    "stl3",
    "snowc",
]

# Time-varying features (entries of the `variable` coordinate): the `lai_*` and
# `met_*` fields followed by the model state variables listed in `targ_lst`.
feat_lst = [
    "lai_hv",
    "lai_lv",
    "met_ctpf",
    "met_lwdown",
    "met_psurf",
    "met_qair",
    "met_rainf",
    "met_swdown",
    "met_snowf",
    "met_tair",
    "met_wind_e",
    "met_wind_n",
    *targ_lst,
]

def prepare_feats_and_targets(ds):
    """Build the stacked feature array and the increment targets from a dataset.

    Features are taken at every time step except the last one. Targets are the
    change in each target variable between consecutive time steps, so the model
    learns the per-step update rather than the next absolute value.

    :param ds: dataset providing `clim_data` (indexed by `clim_variable`) and
        `data` (indexed by `variable`), with `x` and `time` dimensions
    :return: tuple `(feats_ds, target_ds)` with `x` and `time` stacked into `z`
    """
    sample_dims = ("x", "time")
    prev_steps = slice(0, -1)    # every time step except the last
    next_steps = slice(1, None)  # every time step except the first

    # Climatological features: repeat along the time axis, drop the last step,
    # stack space/time into `z` and rename the variable dimension so it can be
    # concatenated with the time-varying features.
    clim_feats = ds.sel(clim_variable=clim_feat_lst).clim_data
    clim_feats = clim_feats.expand_dims(time=ds.time).isel(time=prev_steps)
    clim_feats = clim_feats.stack(z=sample_dims).transpose()
    clim_feats = clim_feats.rename({"clim_variable": "variable"})

    # Time-varying features at the "previous" steps.
    dyn_feats = ds.sel(variable=feat_lst).isel(time=prev_steps).data
    dyn_feats = dyn_feats.stack(z=sample_dims).transpose()

    # Join both feature groups, keeping all variables in a single chunk.
    feats_ds = xr.concat((clim_feats, dyn_feats), dim="variable").chunk({"variable": -1})

    # Targets: value at the next step minus value at the previous step. The
    # previous-step values are subtracted as a plain array (`.values`), so the
    # subtraction is positional and does not align on the `time` labels.
    states = ds.sel(variable=targ_lst).data
    following = states.isel(time=next_steps).stack(z=sample_dims).transpose()
    preceding = states.isel(time=prev_steps).stack(z=sample_dims).values.T
    target_ds = following - preceding

    return feats_ds, target_ds

In [None]:
# Build the feature/target arrays for the training and validation periods.
train_feats, train_targets = prepare_feats_and_targets(ds_train)
val_feats, val_targets = prepare_feats_and_targets(ds_val)

# Length of the first axis of each feature array
# (presumably the stacked sample axis `z` - TODO confirm).
n_train, n_val = train_feats.shape[0], val_feats.shape[0]

print(n_train, n_val)

## Model training with XGBoost

Now that we have our "features" and "targets", we can train XGBoost to predict our model increments.

In [None]:
def mse(y_pred: np.ndarray, y_true: np.ndarray) -> float:
    """Calculate the mean squared error between 2 arrays of values

    Both arrays are flattened first, so the error is taken over every element.

    :param y_pred: predicted array
    :param y_true: "truth" array
    :return: mean squared error metric
    """
    flat_pred = y_pred.flatten()
    flat_true = y_true.flatten()
    return mean_squared_error(flat_pred, flat_true)

# Gradient-boosted tree regressor using the histogram tree method on a CUDA device.
# The loss is selected with the `objective` keyword. The built-in squared-error
# objective is used because `mse` returns a single scalar score, whereas a custom
# objective callable must return per-sample gradient and hessian arrays.
model = xgb.XGBRegressor(
    n_estimators=50,
    tree_method="hist",
    device="cuda",
    objective="reg:squarederror",
    # multi_strategy="multi_output_tree",
    # learning_rate=0.3,
    # eval_metric=mse,
    # subsample=0.01,
    # sampling_method="gradient_based"
)
# fname = "./test.json"

# X_train = train_feats.values
# y_train = train_targets.values
# X_val = val_feats.values
# y_val = val_targets.values

# print("Fitting XGB model...")

# # At once
# model.fit(X_train, y_train, eval_set=[(X_val, y_val)])
# model.save_model(fname)
# y_val_pred = model.predict(X_val)
# val_mse = mse(y_val_pred, y_val)
# print(f"Validation MSE = {val_mse}")

# # Incremental
# batch_size = 20000
# i = 0
# val_mse_curr = float('inf')
# while True:
#     idxs = np.random.choice(n_train, batch_size, replace=False)
#     X_batch = X_train[idxs]
#     y_batch = y_train[idxs]
#     model.fit(X_batch, y_batch, eval_set=[(X_batch, y_batch)], xgb_model=fname if i>0 else None, verbose=False)
#     model.save_model(fname)
#     y_val_pred = model.predict(X_val)
#     if i%5 == 0:
#         val_mse = mse(y_val_pred, y_val)
#         print(f"Epoch {i}: Validation MSE = {val_mse}")
#         if val_mse < val_mse_curr:
#             val_mse_curr = val_mse
#             i+=1
#         else:
#             break

# print("Finished training")