In [66]:
import numpy as np 
import pandas as pd
from sklearn.model_selection import RepeatedKFold
import matplotlib.pyplot as plt
import copy
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, confusion_matrix
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_predict
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold
import geopandas as gpd
import os
import optuna
import joblib

In [11]:
import pandas as pd
import os
from sklearn.preprocessing import LabelEncoder

In [7]:
# Names of the 50 watershed-attribute columns used as model features.
# The order below is the column order of every feature matrix built from it.
watershed_attributes_50 = [
    'DA_SQKM', 'MAXDI_EROM', 'Dam_Index', 'TOT_ELEV_MEAN', 'TOT_ELEV_MAX',
    'TOT_STREAM_SLOPE', 'TOT_MAXP6190', 'TOT_MAXWD6190', 'TOT_MINWD6190', 'TOT_RH',
    'TOT_AET', 'TOT_CWD', 'TOT_BFI', 'TOT_CONTACT', 'TOT_IEOF',
    'TOT_RECHG', 'TOT_SATOF', 'TOT_TWI', 'TOT_EWT', 'TOT_RF7100',
    'TOT_MIRAD_2012', 'TOT_FRESHWATER_WD', 'TOT_STREAMRIVER', 'TOT_ARTIFICIAL', 'TOT_CONNECTOR',
    'TOT_STRM_DENS', 'TOT_TOTAL_ROAD_DENS', 'TOT_HGA', 'TOT_HGB', 'TOT_HGC',
    'TOT_HGD', 'TOT_SILTAVE', 'TOT_CLAYAVE', 'TOT_SANDAVE', 'TOT_KFACT',
    'TOT_KFACT_UP', 'TOT_NO10AVE', 'TOT_NO200AVE', 'TOT_OM', 'TOT_ROCKDEP',
    'TOT_BDAVE', 'TOT_WTDEP', 'TOT_SRL25AG', 'TOT_NLCD19_31', 'TOT_NLCD19_41',
    'TOT_NLCD19_43', 'TOT_NLCD19_71', 'TOT_NLCD19_81', 'TOT_NLCD19_FOREST', 'TOT_NLCD19_WETLAND',
]

In [8]:
# Data
# Load the table holding the watershed attributes, the Dam_Index column and the
# LSTM error-metric columns. The path is resolved against the current working
# directory, so the notebook must be started from the folder containing 'Data'.
all_data = pd.read_csv(os.path.join(os.getcwd(), 'Data','WA_50_Geospatial_PUB_Variable_0d_LSTM_Error_metrics.csv'))

# Split rows on Dam_Index at 0.1: rows at or above the threshold go to reg_data,
# the rest to unreg_data (presumably "regulated" / "unregulated" -- TODO confirm).
reg_data = all_data[all_data['Dam_Index']>=0.1]
unreg_data = all_data[all_data['Dam_Index']<0.1]

# Min-max normalise the watershed attributes column by column.
WA_raw = all_data[watershed_attributes_50]
WA_min = WA_raw.min()
WA_range = WA_raw.max() - WA_min
# A constant column has a zero range, and (x - min) / 0 would turn the whole
# column into NaN. Dividing by 1 instead maps such a column to 0, which is the
# same convention sklearn's MinMaxScaler uses for zero-range features.
WA_range = WA_range.replace(0, 1)
WA_df = (WA_raw - WA_min) / WA_range
WA_arr = WA_df.to_numpy()

## Hyperparameter optimization

In [31]:
# Feature matrix: the 50 watershed-attribute columns taken straight from
# all_data (the raw values, not the min-max normalised WA_df).
X = all_data[watershed_attributes_50]

# Target: the 'kge_kappa_categories' column converted to integer codes.
# LabelEncoder numbers the classes in sorted order of their labels.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(all_data['kge_kappa_categories'])

def objective(trial):
    """Optuna objective: mean macro-F1 of a random forest under 10-fold stratified CV.

    Reads the module-level feature matrix ``X`` and encoded target ``y``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample one hyperparameter configuration.

    Returns
    -------
    float
        Macro-averaged F1 score, averaged over the 10 folds (to be maximised).
    """
    # Define the hyperparameters to be tuned.
    # NOTE(review): scikit-learn documents 'entropy' and 'log_loss' as the same
    # criterion; both are kept so the search space (and stored trial params)
    # stay unchanged.
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 200, 500),
        'criterion': trial.suggest_categorical('criterion', ['gini', 'entropy', 'log_loss']),
        'min_samples_split': trial.suggest_int('min_samples_split', 5, 30),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 5, 20),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
        'max_leaf_nodes': trial.suggest_int('max_leaf_nodes', 2, 100, log=True),
        'ccp_alpha': trial.suggest_float('ccp_alpha', 0.00, 0.20),
    }

    # Create the RandomForestClassifier with the hyperparameters.
    # random_state is fixed (same seed as the final SPRF model) so that two
    # trials differ only through their hyperparameters, not through the
    # forest's internal randomness.
    clf = RandomForestClassifier(
        **params,
        class_weight={1: 1, 0: 2},  # 1 is Below median and 0 is Above median
        random_state=42,
    )

    # Evaluate the classifier using stratified cross-validation.
    # Without shuffling, StratifiedKFold is deterministic, so every trial is
    # scored on the same folds.
    # NOTE(review): the final-evaluation cell uses shuffle=True; confirm the
    # difference is intended.
    skf = StratifiedKFold(n_splits=10)

    # cross_val_score fits a fresh clone of clf on each fold; 'f1_macro' is
    # f1_score(..., average='macro'). error_score='raise' keeps a failing fit
    # an error instead of silently scoring it as NaN.
    fold_f1_scores = cross_val_score(
        clf, X, y, cv=skf, scoring='f1_macro', error_score='raise'
    )

    # Average F1 score
    return np.mean(fold_f1_scores)

# Create a study and optimize
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

[I 2025-03-27 15:44:43,119] A new study created in memory with name: no-name-362dc042-24aa-4248-bfce-7cc832017eb2
[I 2025-03-27 15:44:48,492] Trial 0 finished with value: 0.5372892621305893 and parameters: {'n_estimators': 348, 'criterion': 'gini', 'min_samples_split': 19, 'min_samples_leaf': 17, 'max_features': 'log2', 'max_leaf_nodes': 19, 'ccp_alpha': 0.011807465354867763}. Best is trial 0 with value: 0.5372892621305893.
[I 2025-03-27 15:44:59,344] Trial 1 finished with value: 0.2928136713042373 and parameters: {'n_estimators': 292, 'criterion': 'log_loss', 'min_samples_split': 18, 'min_samples_leaf': 6, 'max_features': None, 'max_leaf_nodes': 2, 'ccp_alpha': 0.1547631888554826}. Best is trial 0 with value: 0.5372892621305893.
[I 2025-03-27 15:45:32,912] Trial 2 finished with value: 0.5616910375544502 and parameters: {'n_estimators': 445, 'criterion': 'log_loss', 'min_samples_split': 20, 'min_samples_leaf': 5, 'max_features': None, 'max_leaf_nodes': 49, 'ccp_alpha': 0.04638580279006

Number of finished trials: 100
Best trial: {'n_estimators': 400, 'criterion': 'gini', 'min_samples_split': 9, 'min_samples_leaf': 7, 'max_features': None, 'max_leaf_nodes': 31, 'ccp_alpha': 0.01608947548092516}


## Model training

In [47]:
# Hyperparameters of the final model, typed in by hand from the best Optuna
# trial printed above (ccp_alpha rounded to 0.016). study.best_trial.params
# could be used as the starting point instead, but it holds no class_weight
# entry because class_weight is not part of the search space.
# NOTE(review): class_weight gives class 0 a weight of 1.5 here, whereas the
# tuning objective used 2 -- confirm which value is intended.
best_params = dict(
    n_estimators=400,
    criterion='gini',
    min_samples_split=9,
    min_samples_leaf=7,
    max_features=None,
    max_leaf_nodes=31,
    ccp_alpha=0.016,
    class_weight={1: 1, 0: 1.5},  # 1 is Below median and 0 is Above median
)

# Build the final model with the chosen hyperparameters and a fixed seed.
SPRF = RandomForestClassifier(random_state=42, **best_params)

In [48]:
# Feature matrix and integer-encoded target for the final evaluation.
X = all_data[watershed_attributes_50]
y = all_data['kge_kappa_categories']
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# StratifiedKFold CV.
# shuffle=True draws a new random fold assignment on every run unless
# random_state is fixed; seeding it makes the reported scores reproducible
# (the model itself is already seeded).
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
f1_scores = []
confusion_matrices = []  # one confusion matrix per fold, kept for later inspection

# Perform k-fold cross-validation: fit SPRF on the training part of each fold
# and score it on the held-out part.
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]

    SPRF.fit(X_train, y_train)
    y_pred = SPRF.predict(X_test)

    f1_scores.append(f1_score(y_test, y_pred, average='macro'))
    confusion_matrices.append(confusion_matrix(y_test, y_pred))


print(f'Cross-Validation F1 Scores: {f1_scores}')
print(f'Average F1 Score: {np.mean(f1_scores)}')

Cross-Validation F1 Scores: [0.6153846153846154, 0.6407894736842106, 0.6585858585858586, 0.7350543478260869, 0.7246376811594203, 0.8146341463414635, 0.7490829053558328, 0.680672268907563, 0.7368421052631579, 0.7490829053558328]
Average F1 Score: 0.7104766307864041


In [None]:
# Refit the model on the complete data set (all rows of X and y).
SPRF.fit(X, y)