In [3]:
import optuna
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

In [4]:
df = pd.read_csv('../data/final/final_dataset_encoded.csv')

In [5]:
feature_columns = ['Geolocation_Encoded', 'Commodity_Encoded', 'Commodity_Type_Encoded', "Year", 
                   "Month", "Quarter", "Price_Lag_1", "Price_MA_3", "Price_MA_6", "Price_Std_3", 
                   "CPI_Lag_1", "CPI", "Production_Volume_by_MetricTons", "Area_Harvested_in_Hectares"]

target_column = 'Price'

df = df.dropna()

X = df[feature_columns]
y = df[target_column]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False, random_state=42)

X_train.to_csv('../data/training/X_train_final_encoded.csv', index=False)
X_test.to_csv('../data/training/X_test_final_encoded.csv', index=False)
y_train.to_csv('../data/training/y_train_final_encoded.csv', index=False)
y_test.to_csv('../data/training/y_test_final_encoded.csv', index=False)

In [6]:
X_train = pd.read_csv("../data/training/X_train_final_encoded.csv")
X_test = pd.read_csv("../data/training/X_test_final_encoded.csv")
y_train = pd.read_csv("../data/training/y_train_final_encoded.csv")
y_test = pd.read_csv("../data/training/y_test_final_encoded.csv")

In [7]:
def objective(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 500),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        "objective": "reg:squarederror",
        "random_state": 42
    }

    model = xgb.XGBRegressor(**params)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    rmse = mean_squared_error(y_test, y_pred) ** 0.5  # Compute RMSE

    return rmse

In [8]:
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=20, timeout=600)

[I 2025-03-12 22:56:12,723] A new study created in memory with name: no-name-dab8f318-a00b-4bd6-8f2a-0edcbacbef87
[I 2025-03-12 22:56:19,207] Trial 0 finished with value: 1.233210098493384 and parameters: {'n_estimators': 360, 'learning_rate': 0.1553254816976758, 'max_depth': 4, 'subsample': 0.9976487815817167, 'colsample_bytree': 0.8717058363073897}. Best is trial 0 with value: 1.233210098493384.
[I 2025-03-12 22:56:21,797] Trial 1 finished with value: 0.8454260603519307 and parameters: {'n_estimators': 168, 'learning_rate': 0.27148840594973866, 'max_depth': 4, 'subsample': 0.745836211114847, 'colsample_bytree': 0.9722106394602048}. Best is trial 1 with value: 0.8454260603519307.
[I 2025-03-12 22:56:23,788] Trial 2 finished with value: 0.7075994747849969 and parameters: {'n_estimators': 110, 'learning_rate': 0.08206626564358774, 'max_depth': 5, 'subsample': 0.9575389871351057, 'colsample_bytree': 0.9646765569948352}. Best is trial 2 with value: 0.7075994747849969.
[I 2025-03-12 22:56:

In [9]:
best_params = study.best_params
print("✅ Best Hyperparameters Found:", best_params)

✅ Best Hyperparameters Found: {'n_estimators': 199, 'learning_rate': 0.04224433314343556, 'max_depth': 7, 'subsample': 0.8961187836852313, 'colsample_bytree': 0.9252608184037984}


In [10]:
best_xgb_model = xgb.XGBRegressor(**best_params, objective="reg:squarederror", random_state=42)
best_xgb_model.fit(X_train, y_train)

In [11]:
y_pred_best = best_xgb_model.predict(X_test)
rmse_best = mean_squared_error(y_test, y_pred_best) ** 0.5  # Compute RMSE

print(f"✅ Optimized XGBoost Model Trained with Bayesian Optimization - RMSE: {rmse_best:.4f}")

✅ Optimized XGBoost Model Trained with Bayesian Optimization - RMSE: 0.6668
