In [33]:
import os
import time
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, KFold, cross_val_score, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from catboost import Pool
from sklearn.metrics import (
    root_mean_squared_error,
    mean_absolute_error,
    mean_absolute_percentage_error,
    median_absolute_error,
    r2_score,
    explained_variance_score)

# Models
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Input
from scikeras.wrappers import KerasRegressor

In [35]:
df = pd.read_csv("../Dataset/PVT_1225.csv")
df = df.sample(frac=1, random_state=42).reset_index(drop=True)  # 🔀 Shuffle rows

# Preview columns
print(df.columns.tolist())

# Define features and target
X = df.drop(columns=["Pb", "Bob"])
y = df["Pb"]

['Tf', 'Rs', 'gg', 'api ', 'Pb', 'Bob']


In [37]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

print(f"Training samples: {len(X_train)}")
print(f"Testing samples: {len(X_test)}")

Training samples: 857
Testing samples: 368


# XGBoost

In [43]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from tqdm import tqdm
import time
import numpy as np

# Tuning XGBoost
param_grid = [
    {"n_estimators": n, "learning_rate": lr, "max_depth": d}
    for n in [100, 200]
    for lr in [0.01, 0.05, 0.1]
    for d in [4, 6, 8]
]

best_xgb_params, best_xgb_score = None, -float("inf")
best_xgb_mse = best_xgb_rmse = best_xgb_mae = best_xgb_r2 = best_xgb_adj_r2 = best_xgb_time = None

with tqdm(total=len(param_grid), desc="Tuning XGBoost") as pbar:
    for params in param_grid:
        start_time = time.time()
        xgb_model = XGBRegressor(**params, random_state=42)
        xgb_model.fit(X_train, y_train)
        y_pred_xgb = xgb_model.predict(X_test)
        end_time = time.time()

        # Metrics
        mse = mean_squared_error(y_test, y_pred_xgb)
        rmse = np.sqrt(mse)
        mae = mean_absolute_error(y_test, y_pred_xgb)
        r2 = r2_score(y_test, y_pred_xgb)
        elapsed = end_time - start_time

        # Adjusted R²
        n, p = len(y_test), X_test.shape[1]
        adj_r2 = 1 - ((1 - r2) * (n - 1) / (n - p - 1)) if n > p + 1 else None

        # Weighted Score (penalize RMSE, reward R²)
        score = -rmse + (r2 * 100)

        if score > best_xgb_score:
            best_xgb_params = params
            best_xgb_score = score
            best_xgb_mse = mse
            best_xgb_rmse = rmse
            best_xgb_mae = mae
            best_xgb_r2 = r2
            best_xgb_adj_r2 = adj_r2
            best_xgb_time = elapsed

        pbar.update(1)

# Print
print(f"\nBest XGBoost Params: {best_xgb_params}, Best Score: {best_xgb_score:.4f}, "
      f"Best MSE: {best_xgb_mse:.4f}, Best RMSE: {best_xgb_rmse:.4f}, Best MAE: {best_xgb_mae:.4f}, "
      f"Best R²: {best_xgb_r2:.4f}, "
      f"{f'Best Adjusted R²: {best_xgb_adj_r2:.4f}' if best_xgb_adj_r2 is not None else 'Best Adjusted R²: N/A'}, "
      f"XGBoost Training Time: {best_xgb_time:.4f}s")


Tuning XGBoost: 100%|███████████████████████████| 18/18 [00:01<00:00, 17.35it/s]


Best XGBoost Params: {'n_estimators': 200, 'learning_rate': 0.1, 'max_depth': 4}, Best Score: -236.4660, Best MSE: 108293.4906, Best RMSE: 329.0798, Best MAE: 178.8929, Best R²: 0.9261, Best Adjusted R²: 0.9253, XGBoost Training Time: 0.0324s





# CatBoost

In [45]:
from catboost import CatBoostRegressor
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from tqdm import tqdm
import time
import numpy as np

# Tuning CatBoost
param_grid = [
    {"iterations": i, "learning_rate": lr, "depth": d, "l2_leaf_reg": reg}
    for i in [500, 1000]
    for lr in [0.01, 0.05, 0.1]
    for d in [4, 6]
    for reg in [3, 5, 7]
]

best_cat_params, best_cat_score = None, -float("inf")
best_cat_mse = best_cat_rmse = best_cat_mae = best_cat_r2 = best_cat_adj_r2 = best_cat_time = None

with tqdm(total=len(param_grid), desc="Tuning CatBoost") as pbar:
    for params in param_grid:
        start_time = time.time()
        cat_model = CatBoostRegressor(verbose=0, random_state=42, **params)
        cat_model.fit(X_train, y_train)
        y_pred_cat = cat_model.predict(X_test)
        end_time = time.time()

        # Metrics
        mse = mean_squared_error(y_test, y_pred_cat)
        rmse = np.sqrt(mse)
        mae = mean_absolute_error(y_test, y_pred_cat)
        r2 = r2_score(y_test, y_pred_cat)
        elapsed = end_time - start_time

        # Adjusted R²
        n, p = len(y_test), X_test.shape[1]
        adj_r2 = 1 - ((1 - r2) * (n - 1) / (n - p - 1)) if n > p + 1 else None

        # Weighted Score (penalize RMSE, reward R²)
        score = -rmse + (r2 * 100)

        if score > best_cat_score:
            best_cat_params = params
            best_cat_score = score
            best_cat_mse = mse
            best_cat_rmse = rmse
            best_cat_mae = mae
            best_cat_r2 = r2
            best_cat_adj_r2 = adj_r2
            best_cat_time = elapsed

        pbar.update(1)

# Print
print(f"\nBest CatBoost Params: {best_cat_params}, Best Score: {best_cat_score:.4f}, "
      f"Best MSE: {best_cat_mse:.4f}, Best RMSE: {best_cat_rmse:.4f}, Best MAE: {best_cat_mae:.4f}, "
      f"Best R²: {best_cat_r2:.4f}, "
      f"{f'Best Adjusted R²: {best_cat_adj_r2:.4f}' if best_cat_adj_r2 is not None else 'Best Adjusted R²: N/A'}, "
      f"CatBoost Training Time: {best_cat_time:.4f}s")


Tuning CatBoost: 100%|██████████████████████████| 36/36 [00:07<00:00,  5.09it/s]


Best CatBoost Params: {'iterations': 1000, 'learning_rate': 0.05, 'depth': 4, 'l2_leaf_reg': 7}, Best Score: -213.2137, Best MSE: 94122.5616, Best RMSE: 306.7940, Best MAE: 177.0382, Best R²: 0.9358, Best Adjusted R²: 0.9351, CatBoost Training Time: 0.1860s





# Neural Network

In [53]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from scikeras.wrappers import KerasRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from tqdm import tqdm
import time
import numpy as np

# Define grid
param_grid = [
    {"layers": [64, 32], "epochs": 100, "batch_size": 16},
    {"layers": [128, 64], "epochs": 100, "batch_size": 16},
    {"layers": [128, 64, 32], "epochs": 150, "batch_size": 32},
]

# Best tracking variables
best_nn_params, best_nn_score = None, -float("inf")
best_nn_mse = best_nn_rmse = best_nn_mae = best_nn_r2 = best_nn_adj_r2 = best_nn_time = None

# Tuning loop
with tqdm(total=len(param_grid), desc="Tuning Neural Network") as pbar:
    for params in param_grid:

        def create_nn():
            model = Sequential()
            model.add(Input(shape=(X_train.shape[1],)))
            for units in params["layers"]:
                model.add(Dense(units, activation='relu'))
            model.add(Dense(1))
            model.compile(optimizer=Adam(), loss='mse')
            return model

        start_time = time.time()
        nn_model = KerasRegressor(model=create_nn, epochs=params["epochs"], batch_size=params["batch_size"], verbose=0)
        nn_model.fit(X_train, y_train)
        y_pred_nn = nn_model.predict(X_test)
        end_time = time.time()

        nn_mse = mean_squared_error(y_test, y_pred_nn)
        nn_rmse = np.sqrt(nn_mse)
        nn_mae = mean_absolute_error(y_test, y_pred_nn)
        nn_r2 = r2_score(y_test, y_pred_nn)
        n = len(y_test)
        p = X_test.shape[1]
        nn_adj_r2 = 1 - ((1 - nn_r2) * (n - 1) / (n - p - 1)) if n > p + 1 else None
        nn_time = end_time - start_time

        # Weighted score: reward high R², penalize RMSE
        nn_score = -nn_rmse + (nn_r2 * 100)

        if nn_score > best_nn_score:
            best_nn_params = params
            best_nn_score = nn_score
            best_nn_mse = nn_mse
            best_nn_rmse = nn_rmse
            best_nn_mae = nn_mae
            best_nn_r2 = nn_r2
            best_nn_adj_r2 = nn_adj_r2
            best_nn_time = nn_time

        pbar.update(1)

# Output
print(f"\nBest Neural Network Params: {best_nn_params}, Best Score: {best_nn_score:.4f}, Best MSE: {best_nn_mse:.4f}, "
      f"Best RMSE: {best_nn_rmse:.4f}, Best MAE: {best_nn_mae:.4f}, Best R²: {best_nn_r2:.4f}, "
      f"{f'Best Adjusted R²: {best_nn_adj_r2:.4f}' if best_nn_adj_r2 is not None else 'Best Adjusted R²: N/A'}, "
      f"Neural Net Training Time: {best_nn_time:.4f}s")


Tuning Neural Network: 100%|██████████████████████| 3/3 [00:07<00:00,  2.63s/it]


Best Neural Network Params: {'layers': [128, 64, 32], 'epochs': 150, 'batch_size': 32}, Best Score: -428.7960, Best MSE: 261107.5314, Best RMSE: 510.9868, Best MAE: 346.9311, Best R²: 0.8219, Best Adjusted R²: 0.8199, Neural Net Training Time: 2.7847s





# Stacking Ensemble

In [57]:
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import Ridge, Lasso, LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.svm import SVR
import time
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Define base learners with default/approximate params
base_estimators = [
    ('lr', LinearRegression()),
    ('ridge', Ridge()),
    ('lasso', Lasso()),
    ('dt', DecisionTreeRegressor(max_depth=10)),
    ('knn', KNeighborsRegressor(n_neighbors=5)),
    ('svr', SVR()),
    ('xgb', XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=4, random_state=42))
]

# Define final estimator
final_estimator = CatBoostRegressor(verbose=0, random_state=42)

# Create the stacking model
stack = StackingRegressor(
    estimators=base_estimators,
    final_estimator=final_estimator,
    passthrough=True
)

# Fit and evaluate
start_time = time.time()
stack.fit(X_train, y_train)
stack_preds = stack.predict(X_test)
end_time = time.time()

# Calculate metrics
stack_mse = mean_squared_error(y_test, stack_preds)
stack_rmse = np.sqrt(stack_mse)
stack_mae = mean_absolute_error(y_test, stack_preds)
stack_r2 = r2_score(y_test, stack_preds)
n, p = len(y_test), X_test.shape[1]
stack_adj_r2 = 1 - ((1 - stack_r2) * (n - 1) / (n - p - 1))
stack_score = -stack_rmse + (stack_r2 * 100)
stack_time = end_time - start_time

# Print results
print(f"\nBest Stacked Ensemble Score: {stack_score:.4f}, Best MSE: {stack_mse:.4f}, "
      f"Best RMSE: {stack_rmse:.4f}, Best MAE: {stack_mae:.4f}, Best R²: {stack_r2:.4f}, "
      f"Best Adjusted R²: {stack_adj_r2:.4f}, Stacked Training Time: {stack_time:.4f}s")



Best Stacked Ensemble Score: -219.2226, Best MSE: 97693.3175, Best RMSE: 312.5593, Best MAE: 175.4777, Best R²: 0.9334, Best Adjusted R²: 0.9326, Stacked Training Time: 0.7827s
