In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Read the semicolon-separated CSV and parse column 'T' as day-first dates.
data = pd.read_csv('data/DATA.csv', sep=";")
data['T'] = pd.to_datetime(data['T'], dayfirst=True)

# Every WL_* column is rescaled by the same 0.01 factor
# (presumably a unit conversion such as cm -> m; TODO confirm with the data owner).
for wl_column in ('WL_Pank', 'WL_Sof', 'WL_Bou', 'WL_Dou', 'WL_Mop'):
    data[wl_column] = data[wl_column] * 0.01

# 'P' is rescaled by 0.001 (presumably mm -> m; TODO confirm).
data['P'] = data['P'] * 0.001

# Last expression of the cell: render the prepared frame.
data

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Fold 1 chronological split. `data` must already exist in the kernel
# (it is created by the loading cell of this notebook).

# Model inputs (X) and the target column (y)
feature_columns = ['Tot_evap', 'WL_Sof', 'WL_Bou', 'WL_Pank', 'P', 'WL_Dou', 'WTD']
X = data[feature_columns]
y = data['WL_Mop']

# Row layout for this fold:
#   [0, 60%) training | [60%, 80%) testing | [80%, end) validation
train_size = int(0.6 * len(X))
test_size = int(0.2 * len(X))
test_end = train_size + test_size

train_rows = slice(None, train_size)
test_rows = slice(train_size, test_end)
val_rows = slice(test_end, None)   # whatever is left after train + test

X_train, y_train = X.iloc[train_rows], y.iloc[train_rows]
X_test, y_test = X.iloc[test_rows], y.iloc[test_rows]
X_val, y_val = X.iloc[val_rows], y.iloc[val_rows]

# Standardise the inputs: the scaler is fitted on the training rows only and
# the same fitted transform is then applied to the other two splits.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Report the shape of each split (optional sanity check)
for split_label, split_X, split_y in (
    ("Training set:", X_train, y_train),
    ("Testing set:", X_test, y_test),
    ("Validation set:", X_val, y_val),
):
    print(split_label, split_X.shape, split_y.shape)


# Bayesian optimization Fold 1

In [None]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.metrics import make_scorer
import numpy as np
from skopt import BayesSearchCV

# Define the NSE function
def nse(y_true, y_pred):
    """Nash-Sutcliffe efficiency: 1 - SS_residual / SS_total.

    Both inputs are converted to flat float NumPy arrays before any arithmetic,
    so values are compared strictly by position. This makes the function accept
    plain lists, ignore pandas index labels (two Series with different indexes
    are no longer aligned label-by-label), and treat an (n, 1) prediction
    column the same as an (n,) vector instead of broadcasting to (n, n).

    Parameters
    ----------
    y_true : array-like
        Observed values.
    y_pred : array-like
        Predicted values, same number of elements as ``y_true``.

    Returns
    -------
    float
        1.0 for a perfect fit; 0.0 when the prediction is only as good as the
        mean of ``y_true``; negative when it is worse than that mean.

    Notes
    -----
    There is no guard for a constant ``y_true``: the denominator is then zero
    and NumPy's division-by-zero result (inf/nan) is returned.
    """
    observed = np.asarray(y_true, dtype=float).ravel()
    simulated = np.asarray(y_pred, dtype=float).ravel()

    residual_ss = np.sum((observed - simulated) ** 2)
    total_ss = np.sum((observed - np.mean(observed)) ** 2)
    return 1 - (residual_ss / total_ss)

# Wrap nse() as a scorer object so it can be handed to the search's `scoring=`
# argument. greater_is_better=True declares that larger NSE values are better.
nse_scorer = make_scorer(nse, greater_is_better=True)

# Hyper-parameter search space for each model, keyed by the same names used in
# `models`. Each entry is a (low, high) tuple, optionally followed by a prior name.
search_spaces = {
    'RandomForest': dict(
        n_estimators=(100, 500),       # number of trees in the forest
        max_depth=(10, 50),            # maximum depth of each tree
        min_samples_split=(2, 10),     # minimum samples needed to split an internal node
        min_samples_leaf=(1, 4),       # minimum samples required at a leaf node
    ),
    'GradientBoosting': dict(
        n_estimators=(100, 500),                    # number of boosting stages
        learning_rate=(0.01, 0.3, 'log-uniform'),   # step size shrinkage
        max_depth=(3, 10),                          # maximum depth of each tree
        subsample=(0.6, 1.0),                       # fraction of samples per base learner
    ),
    'SVR': dict(
        C=(0.1, 1000, 'log-uniform'),        # regularization parameter
        gamma=(0.0001, 1.0, 'log-uniform'),  # kernel coefficient
        epsilon=(0.01, 0.1),                 # epsilon of the epsilon-SVR model
    ),
    'XGB': dict(
        n_estimators=(100, 500),                    # number of boosting stages
        max_depth=(3, 10),                          # maximum depth of a tree
        learning_rate=(0.01, 0.3, 'log-uniform'),   # step size shrinkage
        subsample=(0.6, 1.0),                       # fraction of samples used for training
        colsample_bytree=(0.6, 1.0),                # fraction of features used per tree
    ),
}

# Function to run Bayesian Optimization
def bayesian_optimize(model, search_space, X_train, y_train, n_iter=30, cv=3, random_state=42):
    """Tune ``model`` with ``BayesSearchCV`` using the NSE scorer.

    Parameters
    ----------
    model : estimator
        Unfitted estimator instance to tune.
    search_space : dict
        Maps hyper-parameter names to their search dimensions.
    X_train, y_train : array-like
        Training inputs and target passed to ``BayesSearchCV.fit``.
    n_iter : int, default 30
        Number of parameter settings the search evaluates.
    cv : int or cross-validation splitter, default 3
        Forwarded unchanged to ``BayesSearchCV``. The default keeps the
        previous hard-coded value.
        NOTE(review): the notebook splits rows by position, which suggests the
        data is time-ordered; if so, consider passing
        ``sklearn.model_selection.TimeSeriesSplit(...)`` here - confirm.
    random_state : int or None, default 42
        Seed forwarded to ``BayesSearchCV`` so the sequence of sampled
        candidates is repeatable between runs. Pass ``None`` to restore the
        previous unseeded behaviour.

    Returns
    -------
    tuple
        ``(best_estimator_, best_params_)`` as exposed by the fitted search.
    """
    bayes_search = BayesSearchCV(
        estimator=model,
        search_spaces=search_space,
        scoring=nse_scorer,          # maximise NSE
        n_iter=n_iter,               # number of iterations for optimization
        cv=cv,                       # cross-validation strategy
        n_jobs=-1,                   # use all CPUs
        verbose=0,                   # silent, change to 1 for more logging
        random_state=random_state    # reproducible candidate sampling
    )
    bayes_search.fit(X_train, y_train)
    return bayes_search.best_estimator_, bayes_search.best_params_

# Define model instances.
# The estimators that accept `random_state` are seeded so that repeated runs of
# the notebook fit the same models for the same hyper-parameters; SVR() is left
# as is because it is constructed without a random_state argument.
models = {
    'RandomForest': RandomForestRegressor(random_state=42),
    'GradientBoosting': GradientBoostingRegressor(random_state=42),
    'SVR': SVR(),
    'XGB': XGBRegressor(random_state=42)
}

# Tune every model on the scaled training split and keep, per model name,
# the best estimator together with the parameters that produced it.
optimized_models = {}
for model_name in models:
    model = models[model_name]
    print(f"Optimizing {model_name}...")
    best_model, best_params = bayesian_optimize(
        model,
        search_spaces[model_name],
        X_train_scaled,
        y_train,
    )
    optimized_models[model_name] = {'Model': best_model, 'Best Params': best_params}

# Report the winning parameter set of each model
for model_name in optimized_models:
    model_info = optimized_models[model_name]
    print(f"\nBest parameters for {model_name}: {model_info['Best Params']}")

# Function to train and evaluate the models with NSE
def train_and_evaluate(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training data and score it on the test data.

    The estimator is fitted in place (the same object is returned), then its
    predictions for ``X_test`` are compared with ``y_test`` using ``nse``.

    Returns
    -------
    tuple
        ``(model, score)``: the fitted estimator and its NSE on the test data.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    return model, nse(y_test, predictions)

# Refit each tuned model on the scaled training split and record its NSE on
# the scaled test split.
evaluation_results = {}
for model_name in optimized_models:
    model_info = optimized_models[model_name]
    trained_model, nse_score = train_and_evaluate(
        model_info['Model'],
        X_train_scaled, y_train,
        X_test_scaled, y_test,
    )
    evaluation_results[model_name] = {'NSE Score': nse_score}

# Report one line per model, score rounded to four decimals
for model_name in evaluation_results:
    result = evaluation_results[model_name]
    print(f"{model_name} - NSE Score: {result['NSE Score']:.4f}")


# Bayesian optimization Fold 2

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.metrics import make_scorer
import numpy as np
from skopt import BayesSearchCV

# Define the NSE function
def nse(y_true, y_pred):
    """Nash-Sutcliffe efficiency: 1 - SS_residual / SS_total.

    Both inputs are converted to flat float NumPy arrays before any arithmetic,
    so values are compared strictly by position. This makes the function accept
    plain lists, ignore pandas index labels (two Series with different indexes
    are no longer aligned label-by-label), and treat an (n, 1) prediction
    column the same as an (n,) vector instead of broadcasting to (n, n).

    Parameters
    ----------
    y_true : array-like
        Observed values.
    y_pred : array-like
        Predicted values, same number of elements as ``y_true``.

    Returns
    -------
    float
        1.0 for a perfect fit; 0.0 when the prediction is only as good as the
        mean of ``y_true``; negative when it is worse than that mean.

    Notes
    -----
    There is no guard for a constant ``y_true``: the denominator is then zero
    and NumPy's division-by-zero result (inf/nan) is returned.
    """
    observed = np.asarray(y_true, dtype=float).ravel()
    simulated = np.asarray(y_pred, dtype=float).ravel()

    residual_ss = np.sum((observed - simulated) ** 2)
    total_ss = np.sum((observed - np.mean(observed)) ** 2)
    return 1 - (residual_ss / total_ss)

# Wrap nse() as a scorer object so it can be handed to the search's `scoring=`
# argument. greater_is_better=True declares that larger NSE values are better.
nse_scorer = make_scorer(nse, greater_is_better=True)

# Fold 2 row layout (X and y must already exist in the kernel):
#   [0, 20%) testing | [20%, 80%) training | [80%, end) validation
train_size = int(0.6 * len(X))
test_size = int(0.2 * len(X))
train_end = test_size + train_size

test_rows = slice(None, test_size)
train_rows = slice(test_size, train_end)
val_rows = slice(train_end, None)   # whatever is left after test + train

X_test, y_test = X.iloc[test_rows], y.iloc[test_rows]
X_train, y_train = X.iloc[train_rows], y.iloc[train_rows]
X_val, y_val = X.iloc[val_rows], y.iloc[val_rows]

# Standardise the inputs: fit on the training rows only, then apply the same
# fitted transform to the validation and test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)


In [None]:
# Hyper-parameter search space for each model, built one model at a time.
# Each entry is a (low, high) tuple, optionally followed by a prior name.
search_spaces = {}

# Random forest: forest size, tree depth and split/leaf size limits
search_spaces['RandomForest'] = {
    'n_estimators': (100, 500),
    'max_depth': (10, 50),
    'min_samples_split': (2, 10),
    'min_samples_leaf': (1, 4),
}

# Gradient boosting: stages, shrinkage, tree depth and row subsampling
search_spaces['GradientBoosting'] = {
    'n_estimators': (100, 500),
    'learning_rate': (0.01, 0.3, 'log-uniform'),
    'max_depth': (3, 10),
    'subsample': (0.6, 1.0),
}

# Support vector regression: regularization, kernel coefficient, epsilon tube
search_spaces['SVR'] = {
    'C': (0.1, 1000, 'log-uniform'),
    'gamma': (0.0001, 1.0, 'log-uniform'),
    'epsilon': (0.01, 0.1),
}

# XGBoost: stages, tree depth, shrinkage, row and column subsampling
search_spaces['XGB'] = {
    'n_estimators': (100, 500),
    'max_depth': (3, 10),
    'learning_rate': (0.01, 0.3, 'log-uniform'),
    'subsample': (0.6, 1.0),
    'colsample_bytree': (0.6, 1.0),
}

# Function to run Bayesian Optimization
def bayesian_optimize(model, search_space, X_train, y_train, n_iter=30, cv=3, random_state=42):
    """Tune ``model`` with ``BayesSearchCV`` using the NSE scorer.

    Parameters
    ----------
    model : estimator
        Unfitted estimator instance to tune.
    search_space : dict
        Maps hyper-parameter names to their search dimensions.
    X_train, y_train : array-like
        Training inputs and target passed to ``BayesSearchCV.fit``.
    n_iter : int, default 30
        Number of parameter settings the search evaluates.
    cv : int or cross-validation splitter, default 3
        Forwarded unchanged to ``BayesSearchCV``. The default keeps the
        previous hard-coded value.
        NOTE(review): the notebook splits rows by position, which suggests the
        data is time-ordered; if so, consider passing
        ``sklearn.model_selection.TimeSeriesSplit(...)`` here - confirm.
    random_state : int or None, default 42
        Seed forwarded to ``BayesSearchCV`` so the sequence of sampled
        candidates is repeatable between runs. Pass ``None`` to restore the
        previous unseeded behaviour.

    Returns
    -------
    tuple
        ``(best_estimator_, best_params_)`` as exposed by the fitted search.
    """
    bayes_search = BayesSearchCV(
        estimator=model,
        search_spaces=search_space,
        scoring=nse_scorer,          # maximise NSE
        n_iter=n_iter,
        cv=cv,  # Cross-validation strategy
        n_jobs=-1,  # Use all CPUs
        verbose=0,  # Silent, change to 1 for more logging
        random_state=random_state  # Reproducible candidate sampling
    )
    bayes_search.fit(X_train, y_train)
    return bayes_search.best_estimator_, bayes_search.best_params_

# Define model instances.
# The estimators that accept `random_state` are seeded so that repeated runs of
# the notebook fit the same models for the same hyper-parameters; SVR() is left
# as is because it is constructed without a random_state argument.
models = {
    'RandomForest': RandomForestRegressor(random_state=42),
    'GradientBoosting': GradientBoostingRegressor(random_state=42),
    'SVR': SVR(),
    'XGB': XGBRegressor(random_state=42)
}

# Tune every model on this fold's scaled training split; store the best
# estimator and its parameters under the model's name.
optimized_models = {}
for model_name in models:
    model = models[model_name]
    print(f"Optimizing {model_name}...")
    best_model, best_params = bayesian_optimize(
        model,
        search_spaces[model_name],
        X_train_scaled,
        y_train,
    )
    optimized_models[model_name] = {
        'Model': best_model,
        'Best Params': best_params,
    }

# Report the winning parameter set of each model
for model_name in optimized_models:
    model_info = optimized_models[model_name]
    print(f"\nBest parameters for {model_name}: {model_info['Best Params']}")
