# Notes
<br>
<p style="font-size: 1.2em">
Introductory notes</p>
<hr style="height:0.3vw">

<hr><br> 
<p style="font-size: 1.1em; line-height: 2em">
    This notebook hosts the code that was (and can be) used to tune, train, and test the <br>
    Random Forest (RF) model to estimate daily evapotranspiration.
</p>

<p style="font-size: 1.1em; line-height: 2em">
    The code included in this notebook can perform the following tasks:
</p>
<ul style="font-size: 1.1em; line-height: 2em">
    <li>Tune the RF regression model using data from 20 FLUXNET sites.</li>
    <li>Test the RF model using data from 10 FLUXNET sites.</li>
    <li>Use the RF model to estimate daily evapotranspiration for the study location.</li>
</ul>

<br>
<p style="font-size: 1.1em; line-height: 2em">
Note that every code cell of this notebook is self-sufficient. That is, one only needs to run the cell to get the expected output.
</p>

In [None]:
# remaining needed modifications:
# - add the title and info of the manuscript, when finalized, to this notebook.

# Tune the RF model
<br>
<p style="font-size: 1.2em">
    Hyperparameter tuning for the RF model
</p>
<hr style="height:0.3vw">
<a id='another_cell'></a>

In [None]:
# <<< imports >>> -----------------------------------------------------------
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV


from helper_functions import nmae, calc_nmae, create_spatial_cv_folds
from helper_functions import cvresults_to_df, get_optimal_set
# ___________________________________________________________________ <<< >>>

# <<< variable definition >>> -----------------------------------------------
# This cell runs an exhaustive grid search over the RF hyperparameters using
# pre-built (spatially separated) cross-validation resamples, stores the
# search results as a CSV file, and reports the optimal hyperparameter set.

# 1) path to training data; 2) path to save the result of grid search
path_training_data = Path("./data/RF_trainingData_20FLUXNETsites.csv")
path_save_search = Path("./Gridsearch_results.csv")

# 1) total number of parallel workers used by the grid search; 2) random seed
ncore = 4
rseed = 1915

# the search space for the tuning process
dict_rf_search = {
    # there are 6 input features, so 6 is the upper bound for `max_features`
    "max_features": [2, 3, 4, 5, 6],
    "max_samples": [0.025, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6],
    "min_samples_leaf": [2, 3, 4, 5, 6, 7, 8, 9, 10, 11],
    "n_estimators": [100, 250, 500, 1000, 1500, 2000, 4000, 10000]
}
# The above dict holds the search space that we used in the manuscript.
# Note: large values for 'n_estimators' make the process very time-consuming.
# As a general note, for large datasets, 100 is good enough for 'n_estimators';
# refer to this article:
# >>>> "To Tune or Not to Tune the Number of Trees in Random Forest"

# metric to use in the grid search process
# NOTE(review): `nmae` presumably wraps `calc_nmae` into a scikit-learn
# scorer -- see `helper_functions` to confirm.
dict_metric = {"NMAE": nmae(calc_nmae)}

# column-labels, in the training data, corresponding to the input variables
list_inpvar_names = [
    'Air temperature (°C)', 'Shortwave radiation (W/m2)',
    'Longwave radiation (W/m2)', 'Atmospheric pressure(kPa)',
    '2-m wind speed (m/s)', 'Specific humidity (kg/kg)'
]

# column-label for the target variable: daily ET
target_var = "Evapotranspiration (mm)"

# info for the cross-validation
# 1) nr of resamples; 2) nr of analysis sites for each resample;
# 3) column-label for the variable used for spatially separating the training
#    data into analysis and assessment splits
nr_resamples = 10
nr_analysis = 15
location_column = "FLUXNET site ID"
# ___________________________________________________________________ <<< >>>

# <<< main >>> --------------------------------------------------------------

# read training data
df_train_data = pd.read_csv(path_training_data, index_col=False)

# create an instance of random forest regression.
# The forest itself is kept single-threaded on purpose: `GridSearchCV` below
# already runs `ncore` fits in parallel, and giving every fit `ncore` threads
# as well would use up to ncore * ncore threads (CPU oversubscription).
# With a fixed `random_state`, the fitted forests should not depend on
# `n_jobs`, so the search scores are expected to stay the same.
model_rf = RandomForestRegressor(n_jobs=1, random_state=rseed)

# create resamples for the sake of cross-validation
# here, we use a function that performs spatial separation on the training data
# note that one can simply pass an integer to the `cv` parameter of
# `GridSearchCV`. Read its documentation for extra details.
list_cv_resamples = create_spatial_cv_folds(
    file_path=path_training_data,
    nresamples=nr_resamples,
    loc_column=location_column,
    n_analysis=nr_analysis,
    rseed=rseed,
)

# create a GridSearchCV object; `refit=False` because only the CV scores are
# needed here (no final model is trained in this cell)
grid_search = GridSearchCV(
    estimator=model_rf,
    param_grid=dict_rf_search,
    scoring=dict_metric,
    cv=list_cv_resamples,
    refit=False,
    return_train_score=True,
    n_jobs=ncore,
    verbose=1,
)

grid_search.fit(
    X=df_train_data.loc[:, list_inpvar_names].to_numpy(),
    y=df_train_data.loc[:, target_var].to_numpy()
)

# label of the (single) metric used in the search
metric_label = next(iter(dict_metric))

# convert the raw CV results to a dataframe and store it on disk
results_search = cvresults_to_df(grid_search.cv_results_, metric_label)
results_search.to_csv(path_save_search, index=False)
# ___________________________________________________________________ <<< >>>

# <<< output >>> ------------------------------------------------------------

# print and get the optimal values for the tuned hyperparameters
dict_best_params = get_optimal_set(results_search)
# ___________________________________________________________________ <<< >>>

# Test the RF model
<br>
<p style="font-size: 1.2em">
    Train the model using training data and test it on the 10 test sites.
</p>
<hr style="height:0.3vw">

In [8]:
# <<< imports >>> -----------------------------------------------------------
import pandas as pd
from pathlib import Path
from sklearn.ensemble import RandomForestRegressor

from helper_functions import calc_nmae, calc_nmbe
# ___________________________________________________________________ <<< >>>

# <<< variable definition >>> -----------------------------------------------
# This cell trains the RF model on the training sites, estimates daily ET for
# the test sites, and summarizes accuracy (NMAE) and bias (NMBE) per site.

# 1) path to training data; 2) path to the test data
path_training_data = Path("./data/RF_trainingData_20FLUXNETsites.csv")
path_test_data = Path("./data/RF_testData_10FLUXNETsites.csv")

# 1) number of CPU core to use; 2) the random seed
ncore = 4
rseed = 1915

# values for the RF hyperparameters
dict_rf_params = {
    "max_features": 5,
    "max_samples": 0.1,
    "min_samples_leaf": 7,
    "n_estimators": 250
}
# the above dict holds the values that we used in the manuscript.

# column-labels, in the training data, corresponding to the input variables
list_inpvar_names = [
    'Air temperature (°C)', 'Shortwave radiation (W/m2)',
    'Longwave radiation (W/m2)', 'Atmospheric pressure(kPa)',
    '2-m wind speed (m/s)', 'Specific humidity (kg/kg)'
]

# column-label for the target variable: daily ET
target_var = "Evapotranspiration (mm)"

# name of the new column created in test dataframe, holding estimates from RF
estinated_var = "Estimated Evapotranspiration (mm)"

# column-label identifying the FLUXNET site each row belongs to
location_column = "FLUXNET site ID"
# ___________________________________________________________________ <<< >>>

# <<< main >>> --------------------------------------------------------------

# read training and test data
df_train_data = pd.read_csv(path_training_data, index_col=False)
df_test_data = pd.read_csv(path_test_data, index_col=False)

# create an instance of random forest regression with the chosen
# hyperparameters
model_rf = RandomForestRegressor(
    n_jobs=ncore, random_state=rseed, **dict_rf_params
)

# train the model using the training data
model_rf.fit(
    X=df_train_data.loc[:, list_inpvar_names].to_numpy(),
    y=df_train_data.loc[:, target_var].to_numpy()
)

# model inference on test data
estimated_ytest = model_rf.predict(
    X=df_test_data.loc[:, list_inpvar_names].to_numpy()
)

# add a column in test data for the estimated values
df_test_data[estinated_var] = estimated_ytest

# group the observed/estimated ET by FLUXNET site. Only the two columns the
# metrics need are selected, so `apply` never operates on the grouping column
# (that behaviour is deprecated in recent pandas versions).
et_by_site = df_test_data.groupby(location_column)[[target_var, estinated_var]]

# calculate normalized mean-bias-error (NMBE) with respect to each FLUXNET site
df_nmbe = et_by_site.apply(
    lambda df_site: calc_nmbe(df_site[target_var], df_site[estinated_var])
).to_frame(name="NMBE (%)")

# calculate normalized mean-absolute-error (NMAE) with respect to each FLUXNET
# site
df_nmae = et_by_site.apply(
    lambda df_site: calc_nmae(df_site[target_var], df_site[estinated_var])
).to_frame(name="NMAE (%)")

# merge them into one dataframe; both frames are indexed by the site ID, so
# they are aligned on that index
df_metrics = df_nmae.join(df_nmbe, how="inner")
# ___________________________________________________________________ <<< >>>

# <<< output >>> ------------------------------------------------------------

# you can save the results contained in `df_metrics` if you wish:
# uncomment the line below. The site IDs live in the index of `df_metrics`,
# so the index must be written to the file (do not pass `index=False`).
# df_metrics.to_csv("provide_a_path_here.csv")


print("The below dataframe shows the RF accuracy and bias on the test sites:\n")
df_metrics
# ___________________________________________________________________ <<< >>>

The below dataframe shows the RF accuracy and bias on the test sites:



Unnamed: 0_level_0,NMAE (%),NMBE (%)
FLUXNET site ID,Unnamed: 1_level_1,Unnamed: 2_level_1
AU-Rig,35.723203,2.080363
AU-TTE,135.56084,98.365628
CN-Cng,39.940495,-29.972796
CN-Du3,37.374167,-8.297025
CN-HaM,19.622291,0.170366
DK-Eng,30.048252,13.717465
IT-MBo,34.986294,20.797245
IT-Tor,30.109898,-7.215461
RU-Ha1,24.782166,-14.472003
US-Wkg,58.293299,34.326241


# Estimate daily ET for the study site
<br>
<p style="font-size: 1.2em; line-height: 2em">
    Train the RF model using data from all 30 FLUXNET sites (training + test) and use it to estimate the daily <br>
    evapotranspiration for the study site.
</p>
<hr style="height:0.3vw">

In [3]:
# <<< imports >>> -----------------------------------------------------------
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.ensemble import RandomForestRegressor
# ___________________________________________________________________ <<< >>>

# <<< variable definition >>> -----------------------------------------------
# This cell trains the RF model on all FLUXNET data (training + test sites)
# and uses it to estimate daily ET from the meteo data of the study site.

# 1) path to training data; 2) path to the test data  -- FLUXNET data
path_training_data = Path("./data/RF_trainingData_20FLUXNETsites.csv")
path_test_data = Path("./data/RF_testData_10FLUXNETsites.csv")

# 3) path to the meteo data of the study site
path_study_site_data = Path("./data/Daily_MeteoData.csv")

# 1) number of CPU core to use; 2) the random seed
ncore = 4
rseed = 1915

# values for the RF hyperparameters
dict_rf_params = {
    "max_features": 5,
    "max_samples": 0.1,
    "min_samples_leaf": 7,
    "n_estimators": 250
}
# the above dict holds the values that we used in the manuscript.

# column-labels, in the FLUXNET data, corresponding to the input variables
list_inpvar_names = [
    'Air temperature (°C)', 'Shortwave radiation (W/m2)',
    'Longwave radiation (W/m2)', 'Atmospheric pressure(kPa)',
    '2-m wind speed (m/s)', 'Specific humidity (kg/kg)'
]

# column-label for the target variable: daily ET for FLUXNET data
target_var = "Evapotranspiration (mm)"


# column-labels in the meteo data of the study site, corresponding to the
# input variables. !!! MUST be in the same order as `list_inpvar_names` !!!
list_inpvar_study_data = [
    'Air temperature (°C)', 'Shortwave radiation (W.m-2)',
    'Longwave radiation (W.m-2)', 'Atmospheric pressure (kPa)',
    '2m Wind speed (m.s-1)', 'Specific humidity (kg.kg-1)'
]

# The model receives plain arrays, so features are matched by position only.
# A length mismatch between the two lists is the one ordering mistake that can
# be detected automatically; the order itself must still be checked by eye.
if len(list_inpvar_study_data) != len(list_inpvar_names):
    raise ValueError(
        "`list_inpvar_study_data` and `list_inpvar_names` must list the same "
        "input variables, in the same order."
    )

# name of the new column created in study site dataframe
estinated_var = "Estimated Evapotranspiration (mm)"
# ___________________________________________________________________ <<< >>>

# <<< main >>> --------------------------------------------------------------

# read FLUXNET dataframes
df_train_data = pd.read_csv(path_training_data, index_col=False)
df_test_data = pd.read_csv(path_test_data, index_col=False)

# read daily meteo data for the study site
study_site_data = pd.read_csv(path_study_site_data, index_col=False)

# create a column for atm pressure in kPa (the file provides it in Pa)
study_site_data['Atmospheric pressure (kPa)'] = (
    study_site_data["Atmospheric pressure (Pa)"] * 0.001
)

# keep all FLUXNET data in one dataframe; `ignore_index=True` gives the
# combined frame a fresh, unique row index instead of repeated row labels
fluxnet_data = pd.concat(
    [df_train_data, df_test_data], axis=0, ignore_index=True
)

# create an instance of random forest regression with the chosen
# hyperparameters
model_rf = RandomForestRegressor(
    n_jobs=ncore, random_state=rseed, **dict_rf_params
)

# train the model using all FLUXNET data (training + test sites)
model_rf.fit(
    X=fluxnet_data.loc[:, list_inpvar_names].to_numpy(),
    y=fluxnet_data.loc[:, target_var].to_numpy()
);

# model inference on study site data
estimated_ytest = model_rf.predict(
    X=study_site_data.loc[:, list_inpvar_study_data].to_numpy()
)

# add a column in the study site data for the estimated values
study_site_data[estinated_var] = estimated_ytest
# ___________________________________________________________________ <<< >>>

# <<< output >>> ------------------------------------------------------------

# show the first rows of the study site data, including the new ET estimates
study_site_data.head()
# ___________________________________________________________________ <<< >>>


Unnamed: 0,Date,Air temperature (°C),Wind speed (m.s-1),Atmospheric pressure (Pa),Shortwave radiation (W.m-2),Longwave radiation (W.m-2),Specific humidity (kg.kg-1),Relative humidity (%),Precipitation (mm),2m Wind speed (m.s-1),Snow depth (cm),Field snow depth (cm),Snowfall probability,"Snowfall (liquid equivalent, mm)",Atmospheric pressure (kPa),Estimated Evapotranspiration (mm)
0,2017-07-01,21.241667,2.789352,100050.0,81.601146,394.85552,0.013957,88.541667,5.2,1.77433,,,7.799863e-13,0.0,100.05,1.293086
1,2017-07-02,20.35,5.497685,100028.333333,243.298714,382.532636,0.011331,76.166667,0.0,3.497123,,,8.352168e-12,0.0,100.028333,3.130661
2,2017-07-03,19.220833,4.74537,100183.333333,240.756058,361.198681,0.009742,71.375,0.0,3.018569,,,6.317421e-11,0.0,100.183333,2.962721
3,2017-07-04,18.629167,2.025463,100797.083333,282.855418,340.409554,0.008365,66.583333,0.0,1.288414,,,2.23944e-10,0.0,100.797083,2.892251
4,2017-07-05,19.15,1.921296,101141.25,344.433832,328.545793,0.008434,66.25,0.0,1.222153,,,1.107218e-10,0.0,101.14125,2.7537
