In [None]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
import lightgbm as lgb
import optuna
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

from IPython.display import clear_output
import warnings
warnings.filterwarnings('ignore')



In [2]:
Scenario = 'Scenario_B'
ML_model = 'LightGBM_001'

### Functions

In [6]:
def predict_LGBM(model, X_df, y_df, prediction_col='ETa_predicted'):

    
    preds = model.predict(X_df)
    
    # Create a copy of the DataFrame and add the predictions column.
    df_with_preds = pd.concat([X_df, y_df], axis=1).copy()
    df_with_preds[prediction_col] = preds
    
    return df_with_preds

## Model training

In [7]:
train_file_UC1 = os.path.join(os.path.dirname(os.getcwd()),'6_Data_aggregation',
                             'US_UC1_Hourly_ECT_PSNDVI_ETa_Scenario_B_ECT_train_AORC_test.csv')  # Change to your training file path

train_file_UC2 = os.path.join(os.path.dirname(os.getcwd()),'6_Data_aggregation',
                             'US_UC2_Hourly_ECT_PSNDVI_ETa_Scenario_B_ECT_train_AORC_test.csv')  # Change to your training file path

# Load the data; ensure that the timestamp column is parsed as dates.
UC1_train_df = pd.read_csv(train_file_UC1, index_col=0, parse_dates=[0])
UC2_train_df = pd.read_csv(train_file_UC2, index_col=0, parse_dates=[0])

train_df = pd.concat([UC1_train_df, UC2_train_df])

# Drop observations with nan NDVI values or flag==1 or ET_corr nan values
train_df_filtered = train_df[(train_df['NDVI']>0) & (train_df['ETa_corr']>0) & (train_df['ffp_intersection_flag']==0)]

train_df_filtered = train_df_filtered.rename(columns={'TA_1_1_1': 'Air Temperature', 'RH_1_1_1': 'relative_humidity',
                                                      'SW_IN_1_1_1':'Downward Short-Wave Radiation Flux',
                                                      'LW_IN_1_1_1':'Downward Long-Wave Radiation Flux',
                                                      'WS':'Wind Speed', 'PA':'Air Pressure',
                                                      'P_RAIN_1_1_1':'Total Precipitation'})

# Drop 'Downward Long-Wave Radiation Flux' and 'Total Precipitation' based on HXGB001 results
train_df_filtered = train_df_filtered.drop(columns=['Downward Long-Wave Radiation Flux', 'Total Precipitation', 'Wind Speed', 'Air Pressure'])

# train X and y
X_train_val = train_df_filtered.drop(columns=['ETa_corr', 'ffp_intersection_flag', 'ETa_NaN_flag'])  # Training features
y_train_val = train_df_filtered['ETa_corr']  # Training target

In [8]:
X_train_val

Unnamed: 0,NDVI,Air Temperature,relative_humidity,Downward Short-Wave Radiation Flux
2019-05-20 17:00:00,0.501102,23.43530,41.33385,305.196000
2019-05-20 18:00:00,0.495893,21.37005,49.39870,85.643600
2019-05-21 08:00:00,0.491184,10.86390,53.09805,595.519000
2019-05-21 10:00:00,0.490087,14.01855,44.29245,997.851000
2019-05-21 11:00:00,0.489702,14.89420,42.03250,674.758000
...,...,...,...,...
2022-12-07 13:00:00,0.499595,12.32465,84.76705,342.299500
2022-12-07 14:00:00,0.499336,13.13275,83.41765,186.696000
2022-12-07 16:00:00,0.493651,13.03555,83.54605,6.764885
2022-12-07 17:00:00,0.495269,13.59060,78.00540,-0.341286


In [39]:
def objective(trial):
    # Choose between two objectives: standard regression or huber loss for robustness.
    objective_choice = trial.suggest_categorical('objective', ['regression', 'huber'])
    
    param = {
        'objective': objective_choice,
        'metric': 'mae',
        'learning_rate': round(trial.suggest_float('learning_rate', 0.001, 0.1, log=True),3),
        'num_leaves': trial.suggest_int('num_leaves', 20, 150),
        'max_depth': trial.suggest_int('max_depth', 3, 25),
        'min_data_in_leaf': 25,
        'feature_fraction': round(trial.suggest_float('feature_fraction', 0.1, 1.0),3),
        'bagging_fraction': round(trial.suggest_float('bagging_fraction', 0.1, 1.0),3),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 10),
        'lambda_l1': round(trial.suggest_float('lambda_l1', 0.1, 10, log=True),3),
        'lambda_l2': round(trial.suggest_float('lambda_l2', 1e-3, 10.0, log=True),3),
        'feature_pre_filter': False,
        'verbose': -1,
    }

    # add constant params
    #param['feature_contri']= [1.0, 1.0, 1.0, 0.5],
    param['monotone_constraints']        = '1,0,0,0'
    param['monotone_constraints_method'] = 'intermediate'
    
    # Bin target into quantiles for stratification
    y_bins = pd.qcut(y_train_val, q=10, labels=False, duplicates='drop')

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    mae_scores = []
    
    for train_idx, valid_idx in skf.split(X_train_val, y_bins):
        X_train_fold, X_valid_fold = X_train_val.iloc[train_idx], X_train_val.iloc[valid_idx]
        y_train_fold, y_valid_fold = y_train_val.iloc[train_idx], y_train_val.iloc[valid_idx]

        lgb_train = lgb.Dataset(X_train_fold, y_train_fold)
        lgb_valid = lgb.Dataset(X_valid_fold, y_valid_fold, reference=lgb_train)

        gbm = lgb.train(
            param,
            lgb_train,
            num_boost_round=2500,
            valid_sets=[lgb_valid]
        )

        preds = gbm.predict(X_valid_fold, num_iteration=gbm.best_iteration)
        fold_mae = mean_absolute_error(y_valid_fold, preds)
        mae_scores.append(fold_mae)

    return np.mean(mae_scores)

# Create and run the study
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=100)

# Display the best trial results
print("Best trial:")
print("  MAE: ", study.best_trial.value)
print("  Best hyperparameters: ", study.best_trial.params)

[I 2025-05-22 16:52:13,705] A new study created in memory with name: no-name-de74fdc7-3938-4b3a-a03a-3a68aa81ba96
[I 2025-05-22 16:52:34,674] Trial 0 finished with value: 0.07336801882931257 and parameters: {'objective': 'huber', 'learning_rate': 0.010381454921318124, 'num_leaves': 143, 'max_depth': 15, 'feature_fraction': 0.34843913162274487, 'bagging_fraction': 0.8885037115733644, 'bagging_freq': 8, 'lambda_l1': 0.12425736158500596, 'lambda_l2': 0.11635460337295818}. Best is trial 0 with value: 0.07336801882931257.
[I 2025-05-22 16:52:38,985] Trial 1 finished with value: 0.06664901617623817 and parameters: {'objective': 'regression', 'learning_rate': 0.004733511210336748, 'num_leaves': 141, 'max_depth': 21, 'feature_fraction': 0.4004617434178839, 'bagging_fraction': 0.20352855827798022, 'bagging_freq': 8, 'lambda_l1': 1.3815636944679697, 'lambda_l2': 0.01274831644466275}. Best is trial 1 with value: 0.06664901617623817.
[I 2025-05-22 16:52:42,143] Trial 2 finished with value: 0.07099

Best trial:
  MAE:  0.060882611811213315
  Best hyperparameters:  {'objective': 'huber', 'learning_rate': 0.003149141542114454, 'num_leaves': 84, 'max_depth': 11, 'feature_fraction': 0.9662521014818152, 'bagging_fraction': 0.5658652922842053, 'bagging_freq': 3, 'lambda_l1': 0.14944703407673246, 'lambda_l2': 4.358730279865649}


In [44]:
# Retrieve the best hyperparameters from the study.
best_params = study.best_trial.params
best_params['feature_contri']= [2.0, 0.1, 0.1, 1.0]
best_params['monotone_constraints']= '1,0,0,0'
best_params['monotone_constraints_method']= 'intermediate'

lgb_train = lgb.Dataset(X_train_val, y_train_val)
lgb_val = lgb.Dataset(X_train_val, y_train_val, reference=lgb_train)

# Train the final model.
final_model = lgb.train(
    best_params,
    lgb_train,
    num_boost_round=5000,             
    valid_sets=[lgb_val]
)

clear_output(wait=False)

In [45]:
final_model.save_model('ScenarioB_lgbm_final_model_001.txt')

<lightgbm.basic.Booster at 0x25b2f216ef0>

In [32]:
print("Features in order:", final_model.feature_name())
print("Monotone constraints:", final_model.params.get('monotone_constraints'))

Features in order: ['NDVI', 'Air_Temperature', 'relative_humidity', 'Downward_Short-Wave_Radiation_Flux']
Monotone constraints: 1,0,0,0


In [37]:
final_model.params

{'objective': 'huber',
 'learning_rate': 0.003500853884222846,
 'num_leaves': 120,
 'max_depth': 3,
 'feature_fraction': 0.9603164213952968,
 'bagging_fraction': 0.8605661311481673,
 'bagging_freq': 9,
 'lambda_l1': 2.6818804850722935,
 'lambda_l2': 1.1771272080089938,
 'feature_contri': [1.0, 1.0, 1.0, 0.5],
 'monotone_constraints': '1,0,0,0',
 'monotone_constraints_method': 'intermediate',
 'num_iterations': 2500}

In [674]:
final_model.feature_name()

['NDVI',
 'Air_Temperature',
 'relative_humidity',
 'Downward_Short-Wave_Radiation_Flux']