## **Open Rituals**

Import the required packages and define the paths and helper functions used throughout the notebook.

In [1]:
#%% open rituals ##############################################################
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PowerTransformer
from pandas import MultiIndex, Int64Index
from optuna.samplers import TPESampler
from SALib.sample import saltelli
from SALib.analyze import sobol
import matplotlib.pyplot as plt
import xgboost as xgb
import pandas as pd
import numpy as np
import optuna
import spotpy
import shap
import math
import os

## define paths and functions ################################################
### replace this main directory with your own: either edit the default below, or set the
### DOC_MODEL_MAIN_DIR environment variable before starting the kernel (it takes precedence)
Path_Main = os.environ.get(
    'DOC_MODEL_MAIN_DIR',
    r'C:\Users\lli55\Desktop\Lingbo Li PhD\DOC project\Model_with_SoilGrid'
)
### sub-directories used by the notebook, all located directly under Path_Main
Path_Plot = os.path.join(Path_Main, 'plot')
Path_Output = os.path.join(Path_Main, 'output')
Path_Shape = os.path.join(Path_Main, 'shape')
Path_Input = os.path.join(Path_Main, 'input')

## metrics to use
def kge_2009(preds, Dtrain):
    """Score predictions against the labels of ``Dtrain`` with spotpy's KGE.

    Both the labels (``Dtrain.get_label()``) and ``preds`` are multiplied by
    1e-6, element by element, before being handed to
    ``spotpy.objectivefunctions.kge`` (labels first, predictions second).

    Returns
    -------
    tuple
        ``('kge', score)``; when the score is NaN it is replaced by -9999.
    """
    observed = list(map(lambda value: 1e-6 * value, Dtrain.get_label()))
    simulated = list(map(lambda value: 1e-6 * value, preds))
    score = spotpy.objectivefunctions.kge(observed, simulated)
    # NaN cannot be ranked, so it is swapped for a fixed placeholder
    return 'kge', (-9999 if math.isnan(score) else score)

def nrmse(preds, Dtrain):
    """Score predictions against the labels of ``Dtrain`` with spotpy's rrmse.

    Both the labels (``Dtrain.get_label()``) and ``preds`` are multiplied by
    1e-6, element by element, before being handed to
    ``spotpy.objectivefunctions.rrmse`` (labels first, predictions second).

    Returns
    -------
    tuple
        ``('nrmse', score)``; when the score is NaN it is replaced by -9999.
    """
    observed = list(map(lambda value: 1e-6 * value, Dtrain.get_label()))
    simulated = list(map(lambda value: 1e-6 * value, preds))
    score = spotpy.objectivefunctions.rrmse(observed, simulated)
    # NOTE(review): -9999 is smaller than any valid error, so if this function were ever
    # used as a *minimised* metric a NaN would rank as the best score -- confirm that
    # -9999 is only meant as a no-data flag for reporting.
    return 'nrmse', (-9999 if math.isnan(score) else score)


def mase(preds, Dtrain):
    """Custom evaluation metric: mean absolute error scaled by the geometric mean of the labels.

    This is the function passed to ``xgb.cv`` / ``xgb.train`` as ``feval`` in this
    notebook, so it is called once per boosting round and data set.

    Parameters
    ----------
    preds : array-like
        Model predictions, one per row of ``Dtrain``.
    Dtrain : object exposing ``get_label()``
        Data set whose labels are the observed values.

    Returns
    -------
    tuple
        ``('mase', value)`` where ``value = mean(|y - preds|) / exp(mean(log(y)))``.

    Notes
    -----
    * Labels and predictions are both multiplied by 1e-6 first; that factor cancels in
      the ratio, it is kept only to mirror the other metric functions.
    * Labels must be strictly positive: ``log`` of a zero or negative label yields
      ``-inf`` / NaN and the returned value is then not meaningful.
    """
    # Whole-array arithmetic in float64 instead of a Python loop over every element.
    observed = 1e-6 * np.asarray(Dtrain.get_label(), dtype=np.float64)
    predicted = 1e-6 * np.asarray(preds, dtype=np.float64)
    mean_abs_error = np.mean(np.abs(observed - predicted))
    geometric_mean = np.exp(np.mean(np.log(observed)))
    return 'mase', mean_abs_error / geometric_mean

  from pandas import MultiIndex, Int64Index
  from .autonotebook import tqdm as notebook_tqdm
  from pandas import MultiIndex, Int64Index


In [14]:
## load the training table and keep only the rows where the target (pr_soilgrid) is present
data = (
    pd.read_csv(os.path.join(Path_Input, 'train_set.txt'), delimiter='\t')
    .dropna(subset = 'pr_soilgrid')
    .reset_index(drop = True)
)
## 5th and 95th percentiles of the target
print(np.quantile(data.pr_soilgrid, [0.05, 0.95]))


[5.84368011e-05 1.28810757e-03]


In [21]:
## NOTE: performance_12.csv is written by the final-model evaluation cell further down,
## so that cell must have been run at least once before this one
data = pd.read_csv(os.path.join(Path_Output, 'performance_12.csv'))
## the saved columns have different lengths and are padded with NaN, so each split
## keeps only the rows where its own observed column is filled
data_train = data[data['Y_train'].notna()].reset_index(drop = True)
data_test = data[data['Y_test'].notna()].reset_index(drop = True)
## 10th and 90th percentiles of the observed target in each split
print(np.quantile(data_train['Y_train'], [0.1, 0.9]))
print(np.quantile(data_test['Y_test'], [0.1, 0.9]))
## number of training and testing rows
print(len(data_train), len(data_test))

[7.92744624e-05 8.42482632e-04]
[7.42472628e-05 9.05163255e-04]
1808 775


## **Model training, application, and analysis**

- The feature selection process identified the following key predictors: ['TOT_A', 'TOT_NLCD01_90', 'TOT_CONTACT', 'TOT_B', 'TOT_I', 'TOT_BFI', 'TOT_E', 'TOT_CLAYAVE', 'TOT_HGB', 'TOT_NLCD01_42', 'TOT_NLCD01_95', 'TOT_CNPY11_BUFF100', 'TOT_HGBD'].

- However, 'TOT_NLCD01_95' did not meet the representativeness criteria and will therefore be excluded from the final model training.

- To assess the impact of this exclusion, we will compare the model's performance before (Model with 13 features) and after (Model with 12 features) removing this predictor.

- Evaluate the model performance on the training, testing, and evaluation sets

- Apply the final model to make a prediction over 2.6 million NHDPlus local catchments

- Analyze the feature importance and the sensitivity

### **Model with 13 features**

In [None]:
## Hyperparameter tuning using optuna #########################################
selected_feature = [
    'TOT_A', 'TOT_NLCD01_90', 'TOT_CONTACT', 'TOT_B', 'TOT_I', 'TOT_BFI', 'TOT_E',
    'TOT_CLAYAVE', 'TOT_HGB', 'TOT_NLCD01_42', 'TOT_NLCD01_95', 'TOT_CNPY11_BUFF100',
    'TOT_HGBD',
]

## the power transformation has already been applied to the predictors; pr is in its original value
data = (
    pd.read_csv(os.path.join(Path_Input, 'train_set.txt'), delimiter='\t')
    .dropna(subset = 'pr_soilgrid')
    .reset_index(drop = True)
)
Xx = data[selected_feature]
## the target variable is scaled to 1e6 times its original value, as the mase metric favors large values
Y = 1e6*data.pr_soilgrid

## 70/30 train/test split with a fixed seed, then wrap both parts for xgboost
X_train, X_test, Y_train, Y_test = train_test_split(
    Xx, Y,
    test_size=0.3,
    random_state=1,
)
Dtrain = xgb.DMatrix(X_train, label = Y_train, missing = np.nan)
Dtest = xgb.DMatrix(X_test, label = Y_test, missing = np.nan)

### define the objective function for optuna ##################################
### those hyperparameter ranges could be adjusted, the following are chosen for model training using MASE.
def objective_xgb_mase(trial):
    """Optuna objective: 5-fold cross-validated 'mase' of an XGBoost model.

    Samples one hyperparameter set from ``trial``, runs ``xgb.cv`` on the global
    ``Dtrain`` with the custom ``mase`` metric and early stopping, and returns the
    mean test-fold mase of the last row of the CV history (to be minimised).

    Side effect: the number of rows in the CV history is stored on the trial as the
    user attribute ``'n_estimators'`` so the final model can reuse it.
    """
    param = {
        'booster':'gbtree',
        'lambda': trial.suggest_float('lambda', 1e-2, 1), # default value = 1
        'alpha': trial.suggest_float('alpha', 1e-3, 1e-1), # default value = 0
        'gamma': trial.suggest_float('gamma', 1e-3, 1e-1), # default value = 0
        'eta': trial.suggest_float('eta', 1e-1, 5e-1), # default value = 0.3
        'min_child_weight': trial.suggest_float('min_child_weight', 1e-2, 1), # default value = 1
        'colsample_bytree': trial.suggest_float('colsample_bytree', 5e-1, 1), # default value = 1
        'subsample': trial.suggest_float('subsample', 5e-1, 1), # default value = 1
        'max_depth': trial.suggest_int('max_depth', 3, 12), # default value = 6
        #'objective': 'reg:absoluteerror',
        'disable_default_eval_metric':1
    }
    ## Only early stopping is registered. A per-round EvaluationMonitor would print every
    ## boosting round of every fold for all trials; to monitor the progress, add
    ## 'xgb.callback.EvaluationMonitor(show_stdv=False)' into callbacks.
    xgb_cv_results = xgb.cv(param,
                    Dtrain,
                    num_boost_round=800,
                    seed=42,
                    nfold=5,
                    maximize = False,
                    feval = mase,
                    callbacks=[xgb.callback.EarlyStopping(rounds = 80,
                                                          metric_name = 'mase',
                                                          maximize = False)],
                    verbose_eval=False)

    mase_ = xgb_cv_results.iloc[-1]['test-mase-mean']
    trial.set_user_attr('n_estimators', len(xgb_cv_results))
    return mase_

## optuna learning process ####################################################
optuna.logging.set_verbosity(optuna.logging.WARNING)
# pay attention to maximize or minimize
study = optuna.create_study(direction='minimize',sampler=TPESampler(seed=0))
study.optimize(objective_xgb_mase, n_trials=500, show_progress_bar=True)
print('Number of finished trials:', len(study.trials))
## copy the tuned values into a new dict (the trial's own record is left untouched)
## and add the two fixed settings used during tuning
best_param = {
    **study.best_trial.params,
    'booster': 'gbtree',
    'disable_default_eval_metric': 1,
}
n_estimators = study.best_trial.user_attrs['n_estimators']
## early-stopping patience for the final fit: a tenth of the tuned number of rounds, but
## never below 1 -- with fewer than 10 rounds the plain division gives 0, and a patience
## of 0 makes the early-stopping callback halt training after the very first round
early_stop = max(1, n_estimators // 10)
## save the best model hyperparameter and n_estimator for later use
print(best_param)
print(n_estimators)

In [None]:
## Train the model using best model hyperparameters ###########################
## callbacks: print the metric every round, and stop early on the custom 'mase' metric
## NOTE(review): no data_name is given to EarlyStopping, so it presumably watches the last
## entry of `evals` (eval_test) -- confirm against the xgboost documentation
training_callbacks = [
    xgb.callback.EvaluationMonitor(show_stdv=False),
    xgb.callback.EarlyStopping(
        rounds = early_stop,
        metric_name = 'mase',  # be consistent with your defined function name
        maximize = False,      # turn it to True if doing maximizing
    ),
]
optimised_xgb = xgb.train(
    params=best_param,
    dtrain=Dtrain,
    num_boost_round=n_estimators,
    evals = [(Dtrain, 'eval_train'), (Dtest, 'eval_test')],
    feval=mase,        # be consistent with your defined function name
    maximize = False,  # turn it to True if doing maximizing
    callbacks=training_callbacks,
    verbose_eval=False,
)

## make prediction on training and testing data ##############################
## only the boosting rounds up to and including the best iteration are used
predicted_mean_train = optimised_xgb.predict(Dtrain, iteration_range=(0, optimised_xgb.best_iteration+1))
predicted_mean_test  = optimised_xgb.predict(Dtest, iteration_range=(0, optimised_xgb.best_iteration+1))

## evaluate the model performance on training and testing data
## each metric function returns a (name, value) tuple, which is printed after its label
print('trainning_kge:' + str(kge_2009(predicted_mean_train, Dtrain)))
print('testing_kge:' + str(kge_2009(predicted_mean_test, Dtest)))
print('trainning_nrmse:' + str(nrmse(predicted_mean_train, Dtrain)))
print('testing_nrmse:' + str(nrmse(predicted_mean_test, Dtest)))
## the ':' separator was missing on this label only
print('trainning_MASE:' + str(mase(predicted_mean_train, Dtrain)))
print('testing_MASE:' + str(mase(predicted_mean_test, Dtest)))

## evaluate the model performance over evaluation catchments ##################
## the power transformation has already been applied to the predictors; pr is in its original value
data_eval = (
    pd.read_csv(os.path.join(Path_Input, 'eval_set.txt'), delimiter='\t')
    .dropna(subset = 'pr_soilgrid')
    .reset_index(drop = True)
)
X_val = data_eval[selected_feature]
Y_val = 1e6*data_eval.pr_soilgrid
Deval = xgb.DMatrix(X_val,  missing=np.nan)
## use only the boosting rounds up to and including the best iteration
best_rounds = (0, optimised_xgb.best_iteration+1)
predicted_mean_eval = optimised_xgb.predict(Deval, iteration_range=best_rounds)

## save those model predictions for later plotting
## (observations and predictions are divided by 1e6 to undo the target scaling)
performance = {
    'Y_train': Y_train.values/1e6,
    'Y_test': Y_test.values/1e6,
    'Y_val': Y_val.values/1e6,
    'Predict_train_13': predicted_mean_train/1e6,
    'Predict_test_13': predicted_mean_test/1e6,
    'Predict_eval_13': predicted_mean_eval/1e6,
}

## the splits differ in length; wrapping every array in a Series lets pandas pad the
## shorter columns with NaN
df_performance = pd.DataFrame(
    {column: pd.Series(values) for column, values in performance.items()}
)
df_performance.to_csv(os.path.join(Path_Output, 'performance_13.csv'), index = None)

## Feature importance analysis using SHAP ####################################
explainer = shap.TreeExplainer(optimised_xgb)
## SHAP values are computed for every row of Xx (training and testing rows together)
shap_values = explainer.shap_values(Xx)
## mean absolute SHAP value along axis 0 (presumably the sample axis), one value per predictor
shap_sum = np.mean(np.abs(shap_values), axis=0)

## save feature importance for later plotting (divided by 1e6 to undo the target scaling)
importance = {
    'predictors': selected_feature,
    'SHAP_mean_13': shap_sum/1e6,
}
df_importance = pd.DataFrame(
    {column: pd.Series(values) for column, values in importance.items()}
)
df_importance.to_csv(os.path.join(Path_Output, 'importance_13.csv'), index = None)

### **Final Model with 12 features**
Dropping 'TOT_NLCD01_95' caused no obvious change in model performance; therefore, the remaining 12 features are selected for the final optimal model.

#### Hyperparameter tuning using optuna

In [None]:
## Hyperparameter tuning using optuna #########################################
selected_feature = [
    'TOT_A', 'TOT_NLCD01_90', 'TOT_CONTACT', 'TOT_B', 'TOT_I', 'TOT_BFI', 'TOT_E',
    'TOT_CLAYAVE', 'TOT_HGB', 'TOT_NLCD01_42', 'TOT_CNPY11_BUFF100', 'TOT_HGBD',
]

## the power transformation has already been applied to the predictors; pr is in its original value
data = (
    pd.read_csv(os.path.join(Path_Input, 'train_set.txt'), delimiter='\t')
    .dropna(subset = 'pr_soilgrid')
    .reset_index(drop = True)
)
Xx = data[selected_feature]
## the target variable is scaled to 1e6 times its original value, as the mase metric favors large values
Y = 1e6*data.pr_soilgrid

## 70/30 train/test split with a fixed seed, then wrap both parts for xgboost
X_train, X_test, Y_train, Y_test = train_test_split(
    Xx, Y,
    test_size=0.3,
    random_state=1,
)
Dtrain = xgb.DMatrix(X_train, label = Y_train, missing = np.nan)
Dtest = xgb.DMatrix(X_test, label = Y_test, missing = np.nan)

### define the objective function for optuna ##################################
### those hyperparameter ranges could be adjusted, the following are chosen for model training using MASE.
### NOTE: this re-defines (and replaces) the function of the same name from the 13-feature section.
def objective_xgb_mase(trial):
    """Optuna objective: 5-fold cross-validated 'mase' of an XGBoost model.

    Draws one hyperparameter set from ``trial``, cross-validates it on the global
    ``Dtrain`` with the custom ``mase`` metric and early stopping, and returns the
    mean test-fold mase found in the last row of the CV history (to be minimised).

    Side effect: the number of rows in the CV history is stored on the trial as the
    user attribute ``'n_estimators'``.
    """
    search_point = {
        'booster':'gbtree',
        'lambda': trial.suggest_float('lambda', 1e-2, 1), # default value = 1
        'alpha': trial.suggest_float('alpha', 1e-3, 1e-1), # default value = 0
        'gamma': trial.suggest_float('gamma', 1e-3, 1e-1), # default value = 0
        'eta': trial.suggest_float('eta', 1e-1, 5e-1), # default value = 0.3
        'min_child_weight': trial.suggest_float('min_child_weight', 1e-2, 1), # default value = 1
        'colsample_bytree': trial.suggest_float('colsample_bytree', 5e-1, 1), # default value = 1
        'subsample': trial.suggest_float('subsample', 5e-1, 1), # default value = 1
        'max_depth': trial.suggest_int('max_depth', 3, 12), # default value = 6
        #'objective': 'reg:absoluteerror',
        'disable_default_eval_metric':1
    }
    ## To monitor the progress, add 'xgb.callback.EvaluationMonitor(show_stdv=False)' into callbacks
    stop_when_flat = xgb.callback.EarlyStopping(
        rounds = 80,
        metric_name = 'mase',
        maximize = False,
    )
    cv_history = xgb.cv(
        search_point,
        Dtrain,
        num_boost_round=800,
        seed=42,
        nfold=5,
        maximize = False,
        feval = mase,
        callbacks=[stop_when_flat],
        verbose_eval=False,
    )

    ## remember how many boosting rounds were kept, then report the last mean test score
    trial.set_user_attr('n_estimators', len(cv_history))
    return cv_history['test-mase-mean'].iloc[-1]

## optuna learning process ####################################################
optuna.logging.set_verbosity(optuna.logging.WARNING)
# pay attention to maximize or minimize
study = optuna.create_study(direction='minimize',sampler=TPESampler(seed=0))
study.optimize(objective_xgb_mase, n_trials=500, show_progress_bar=True)
print('Number of finished trials:', len(study.trials))
## copy the tuned values into a new dict (the trial's own record is left untouched)
## and add the two fixed settings used during tuning
best_param = {
    **study.best_trial.params,
    'booster': 'gbtree',
    'disable_default_eval_metric': 1,
}
n_estimators = study.best_trial.user_attrs['n_estimators']
## early-stopping patience for the final fit: a tenth of the tuned number of rounds, but
## never below 1 -- with fewer than 10 rounds the plain division gives 0, and a patience
## of 0 makes the early-stopping callback halt training after the very first round
early_stop = max(1, n_estimators // 10)
## save the best model hyperparameter and n_estimator for later use
print(best_param)
print(n_estimators)

#### Final model training

In [None]:
## Train the model using best model hyperparameters ###########################
selected_feature = ['TOT_A', 'TOT_NLCD01_90', 'TOT_CONTACT', 'TOT_B', 'TOT_I', 'TOT_BFI', 'TOT_E', 'TOT_CLAYAVE', 'TOT_HGB', 'TOT_NLCD01_42', 'TOT_CNPY11_BUFF100', 'TOT_HGBD']

## best parameter returned from hyperparameter tuning
## (hard-coded here so this cell can be run without repeating the 500-trial search)
best_param = {'lambda': 0.8497244598535406, 'alpha': 0.0219789569818175, 'gamma': 0.09045149625652132, 'eta': 0.11455856438869257, 'min_child_weight': 0.31227906516564546, 'colsample_bytree': 0.5004486262490111, 'subsample': 0.9729520009435804, 'max_depth': 8, 'booster': 'gbtree', 'disable_default_eval_metric': 1}
n_estimators = 20
## early-stopping patience: a tenth of n_estimators, but never below 1 -- if n_estimators
## is ever replaced by a value under 10 the plain division gives 0, and a patience of 0
## makes the early-stopping callback halt training after the very first round
early_stop = max(1, n_estimators // 10)

## reload the training table and rebuild the same 70/30 split (same seed) used for tuning
data = pd.read_csv(os.path.join(Path_Input, 'train_set.txt'), delimiter='\t')
data = data.dropna(subset = 'pr_soilgrid').reset_index(drop = True)
Xx = data[selected_feature]
## the target is scaled to 1e6 times its original value
Y = 1e6*data.pr_soilgrid
X_train, X_test, Y_train, Y_test = train_test_split(Xx, Y, test_size=0.3, random_state=1)
Dtrain = xgb.DMatrix(X_train, label = Y_train, missing = np.nan)
Dtest = xgb.DMatrix(X_test, label = Y_test, missing = np.nan)

## callbacks: print the metric every round, and stop early on the custom 'mase' metric
## NOTE(review): no data_name is given to EarlyStopping, so it presumably watches the last
## entry of `evals` (eval_test) -- confirm against the xgboost documentation
training_callbacks = [
    xgb.callback.EvaluationMonitor(show_stdv=False),
    xgb.callback.EarlyStopping(
        rounds = early_stop,
        metric_name = 'mase',  # be consistent with your defined function name
        maximize = False,      # turn it to True if doing maximizing
    ),
]
optimised_xgb = xgb.train(
    params=best_param,
    dtrain=Dtrain,
    num_boost_round=n_estimators,
    evals = [(Dtrain, 'eval_train'), (Dtest, 'eval_test')],
    feval=mase,        # be consistent with your defined function name
    maximize = False,  # turn it to True if doing maximizing
    callbacks=training_callbacks,
    verbose_eval=False,
)

#### Performances over training/testing/eval

In [None]:
## make prediction on training and testing data ##############################
## only the boosting rounds up to and including the best iteration are used
predicted_mean_train = optimised_xgb.predict(Dtrain, iteration_range=(0, optimised_xgb.best_iteration+1))
predicted_mean_test  = optimised_xgb.predict(Dtest, iteration_range=(0, optimised_xgb.best_iteration+1))

## evaluate the model performance on training and testing data
## each metric function returns a (name, value) tuple, which is printed after its label
print('trainning_kge:' + str(kge_2009(predicted_mean_train, Dtrain)))
print('testing_kge:' + str(kge_2009(predicted_mean_test, Dtest)))
print('trainning_nrmse:' + str(nrmse(predicted_mean_train, Dtrain)))
print('testing_nrmse:' + str(nrmse(predicted_mean_test, Dtest)))
## the ':' separator was missing on this label only
print('trainning_MASE:' + str(mase(predicted_mean_train, Dtrain)))
print('testing_MASE:' + str(mase(predicted_mean_test, Dtest)))

## evaluate the model performance over evaluation catchments ##################
## the power transformation has already been applied to the predictors; pr is in its original value
data_eval = (
    pd.read_csv(os.path.join(Path_Input, 'eval_set.txt'), delimiter='\t')
    .dropna(subset = 'pr_soilgrid')
    .reset_index(drop = True)
)
X_val = data_eval[selected_feature]
Y_val = 1e6*data_eval.pr_soilgrid
Deval = xgb.DMatrix(X_val,  missing=np.nan)
## use only the boosting rounds up to and including the best iteration
best_rounds = (0, optimised_xgb.best_iteration+1)
predicted_mean_eval = optimised_xgb.predict(Deval, iteration_range=best_rounds)

## save those model predictions for later plotting
## (observations and predictions are divided by 1e6 to undo the target scaling)
performance = {
    'Y_train': Y_train.values/1e6,
    'Y_test': Y_test.values/1e6,
    'Y_val': Y_val.values/1e6,
    'Predict_train_12': predicted_mean_train/1e6,
    'Predict_test_12': predicted_mean_test/1e6,
    'Predict_eval_12': predicted_mean_eval/1e6,
}

## the splits differ in length; wrapping every array in a Series lets pandas pad the
## shorter columns with NaN
df_performance = pd.DataFrame(
    {column: pd.Series(values) for column, values in performance.items()}
)
df_performance.to_csv(os.path.join(Path_Output, 'performance_12.csv'), index = None)

#### Predict over 2.6 million NHDPlus local catchments

In [None]:
## load the predictor table for all catchments and keep the model's feature columns
attr_all = pd.read_csv(os.path.join(Path_Input, 'predict_set.txt'), delimiter= '\t')
X_predict = attr_all[selected_feature]
Dtrain_predict = xgb.DMatrix(X_predict,  missing=np.nan)
## predict with the boosting rounds up to and including the best iteration
best_rounds = (0, optimised_xgb.best_iteration+1)
pr_conus = optimised_xgb.predict(Dtrain_predict, iteration_range=best_rounds)
## one row per COMID; predictions are multiplied by 1e-6 to undo the target scaling
pr_conus_df = pd.DataFrame({
    'COMID': attr_all['COMID'],
    'pr': 1e-6*pr_conus,
})
pr_conus_df.to_csv(os.path.join(Path_Output, 'pr_at_2_6m.txt'), sep = '\t', index = None)

#### SHAP feature importance analysis

In [None]:
## Feature importance analysis using SHAP ####################################
explainer = shap.TreeExplainer(optimised_xgb)
## SHAP values are computed for every row of Xx (training and testing rows together)
shap_values = explainer.shap_values(Xx)
## mean absolute SHAP value along axis 0 (presumably the sample axis), one value per predictor
shap_sum = np.mean(np.abs(shap_values), axis=0)

## save feature importance for later plotting (divided by 1e6 to undo the target scaling)
importance = {
    'predictors': selected_feature,
    'SHAP_mean_12': shap_sum/1e6,
}
df_importance = pd.DataFrame(
    {column: pd.Series(values) for column, values in importance.items()}
)
df_importance.to_csv(os.path.join(Path_Output, 'importance_12.csv'), index = None)

#### Sobol sensitivity analysis

In [None]:
## Sobol problem definition: each predictor varies between its observed minimum and
## maximum in Xx
lower_bounds = Xx[selected_feature].min()
upper_bounds = Xx[selected_feature].max()
sobol_problem = {
    'num_vars': len(selected_feature),
    'names': selected_feature,
    'bounds': [[lower_bounds[name], upper_bounds[name]] for name in selected_feature],
}

## draw the Saltelli sample, run the trained model on it, and analyse the predictions
param_values = saltelli.sample(sobol_problem, 4096)
Param = pd.DataFrame(param_values, columns = selected_feature)
Dtrain_sobol = xgb.DMatrix(Param,  missing=np.nan)
best_rounds = (0, optimised_xgb.best_iteration+1)
Sobol_Y = optimised_xgb.predict(Dtrain_sobol, iteration_range=best_rounds)
Si_coeff = sobol.analyze(sobol_problem, Sobol_Y)

## save feature sobol sensitivity for later use ('ST' and 'S1' entries of the analysis result)
sobol_sensitivity = {
    'predictors': selected_feature,
    'ST': Si_coeff['ST'],
    'S1': Si_coeff['S1'],
}
sobol_sensitivity_df = pd.DataFrame(
    {column: pd.Series(values) for column, values in sobol_sensitivity.items()}
)
sobol_sensitivity_df.to_csv(os.path.join(Path_Output, 'sobol_sensitivity.csv'), index = None)