In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.model_selection import cross_val_score
import xgboost as xgb
import optuna
import joblib

In [2]:
# Read the exported trials, copy the `value` column into an explicit `r2`
# column, and order the rows from the highest to the lowest `r2`.
stdf = (
    pd.read_csv('datasets/trials.csv')
    .assign(r2=lambda trials: trials['value'])
    .sort_values(by='r2', ascending=False)
)
# Per-regressor summary of `r2`; as the last expression it is the cell output.
stdf.groupby('params_regressor')['r2'].agg(['mean', 'min', 'count'])

Unnamed: 0_level_0,mean,min,count
params_regressor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
GB,0.881395,0.837776,77
RF,0.829496,0.79955,10
XGB,0.865638,0.775072,13


In [3]:
# Keep, for every regressor type, the single complete row of its best trial.
#
# `groupby(...).first()` is deliberately avoided: it returns the first
# *non-null* value of each column independently, so one output row could mix
# values coming from several different trials. Sorting here (instead of relying
# on the order left behind by an earlier cell) also keeps this cell correct
# when it is re-run on its own.
best_trials_per_model = (
    stdf
    .dropna(subset=['params_regressor'])      # groupby ignored NaN keys as well
    .sort_values(by='r2', ascending=False)    # NaN scores are placed last
    .drop_duplicates(subset='params_regressor', keep='first')
    .set_index('params_regressor')
    .sort_index()                             # same row order as the groupby result
)
best_trials_per_model

Unnamed: 0_level_0,Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_gb_learning_rate,params_gb_max_depth,params_gb_min_samples_leaf,params_gb_min_samples_split,...,params_xgb_gamma,params_xgb_learning_rate,params_xgb_max_depth,params_xgb_min_child_weight,params_xgb_n_estimators,params_xgb_reg_alpha,params_xgb_reg_lambda,params_xgb_subsample,state,r2
params_regressor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GB,90,90,0.887333,2026-01-24 23:25:59.709373,2026-01-24 23:26:07.382499,0 days 00:00:07.673126,0.04,3.0,2.0,5.0,...,,,,,,,,,COMPLETE,0.887333
RF,2,2,0.857684,2026-01-24 23:20:52.603471,2026-01-24 23:20:54.999458,0 days 00:00:02.395987,,,,,...,,,,,,,,,COMPLETE,0.857684
XGB,5,5,0.880744,2026-01-24 23:20:59.677961,2026-01-24 23:21:00.850733,0 days 00:00:01.172772,,,,,...,0.0,0.08,4.0,3.0,500.0,0.6,5.0,0.6,COMPLETE,0.880744


In [4]:
def extract_params(row, prefix):
    """Collect the entries of `row` whose label starts with `prefix`.

    Parameters
    ----------
    row : pandas.Series
        A single record whose index labels are strings.
    prefix : str
        Leading text that selects the labels to keep (e.g. 'params_xgb_').

    Returns
    -------
    dict
        Maps each selected label, with the leading `prefix` removed, to its
        value. Entries whose value is missing (None / NaN) are left out so
        that placeholders for parameters that were never set are not
        forwarded as real values.
    """
    params = {}
    for key in row.index:
        if not key.startswith(prefix):
            continue
        value = row[key]
        if pd.isna(value):
            # Missing placeholder, not an actual parameter value.
            continue
        # Slice instead of str.replace: only the leading prefix must go, not
        # every later occurrence of the same text inside the label.
        params[key[len(prefix):]] = value
    return params

# One parameter dict per regressor; the column prefix is 'params_<key>_' with
# the regressor key in lower case.
best_xgb_params, best_rf_params, best_gb_params = (
    extract_params(best_trials_per_model.loc[model_key], f'params_{model_key.lower()}_')
    for model_key in ('XGB', 'RF', 'GB')
)

In [5]:
def fix_int_params(params, int_keys):
    """Cast the listed entries of `params` to `int`, in place.

    Parameters
    ----------
    params : dict
        Parameter mapping; it is modified in place.
    int_keys : iterable
        Keys whose values should become `int`. Keys that are absent from
        `params` are ignored.

    Returns
    -------
    dict
        The same `params` object, for convenient reassignment.

    Notes
    -----
    Missing values (None / NaN) are left untouched: `int()` cannot convert
    them and would raise TypeError / ValueError.
    """
    for key in int_keys:
        if key not in params:
            continue
        value = params[key]
        if pd.isna(value):
            # Nothing to cast; keep the missing marker as it is.
            continue
        params[key] = int(value)
    return params
# Parameters that must be integers; the trials table stores them as floats
# (e.g. 500.0, 4.0).
# 'gamma' is intentionally NOT listed: it is a continuous XGBoost parameter and
# an int cast would silently truncate any fractional value (0.3 -> 0).
# NOTE(review): 'min_child_weight' is also continuous in XGBoost; it is kept
# here on the assumption that the study searched it over integers — confirm.
xgb_int_keys = ['n_estimators', 'max_depth', 'min_child_weight']
best_xgb_params = fix_int_params(best_xgb_params, xgb_int_keys)
rf_int_keys = ['n_estimators', 'max_depth', 'min_samples_split', 'min_samples_leaf']
best_rf_params = fix_int_params(best_rf_params, rf_int_keys)
gb_int_keys = ['n_estimators', 'max_depth', 'min_samples_split', 'min_samples_leaf']
best_gb_params = fix_int_params(best_gb_params, gb_int_keys)

In [6]:
# Build one (still unfitted) estimator per regressor type from its extracted
# parameters, each with the same fixed random_state.
best_xgb = xgb.XGBRegressor(random_state=42, **best_xgb_params)

best_rf = RandomForestRegressor(random_state=42, **best_rf_params)

best_gb = GradientBoostingRegressor(random_state=42, **best_gb_params)

In [8]:
# Load the modelling table; 'MEDV' is the target column and every other
# column is used as a feature.
df = pd.read_csv('datasets/final_boston.csv')
y = df['MEDV']
X = df.drop(columns=['MEDV'])

# Hold out 20% of the rows for testing, with a fixed seed.
train_X, test_X, train_y, test_y = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [9]:
# Stack the three tuned regressors under a Ridge final estimator (cv=5) and
# fit on the training split; `fit` returns the estimator itself.
stacking_model = StackingRegressor(
    cv=5,
    final_estimator=Ridge(),
    estimators=[('xgb', best_xgb), ('rf', best_rf), ('gb', best_gb)],
).fit(train_X, train_y)

# Predictions of the stacked model on the held-out split.
stacking_pred = stacking_model.predict(test_X)

In [12]:
# Persist the *fitted* base models. StackingRegressor fits clones of the
# estimators it is given, so `best_xgb`, `best_rf` and `best_gb` themselves are
# never fitted in this notebook; dumping them would store unfitted models.
# The fitted clones are exposed by name through `named_estimators_`.
joblib.dump(stacking_model.named_estimators_['xgb'], 'models/best_xgb.pkl')
joblib.dump(stacking_model.named_estimators_['rf'], 'models/best_rf.pkl')
joblib.dump(stacking_model.named_estimators_['gb'], 'models/best_gb.pkl')
# Persist the full stacked model; the returned path list is the cell output.
joblib.dump(stacking_model, 'models/stacking_model.pkl')

['models/stacking_model.pkl']