In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge, LinearRegression, Lasso, BayesianRidge, ElasticNet
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV, KFold
from sklearn.ensemble import StackingRegressor, RandomForestRegressor, GradientBoostingRegressor
from preprocessing import make_preprocessor
import warnings
import optuna
from sklearn.model_selection import cross_val_score
from lightgbm import LGBMRegressor
from sklearn.svm import SVR
warnings.filterwarnings(action="ignore")

In [3]:
df = pd.read_csv("data/train.csv")
df['LogSalePrice'] = np.log1p(df['SalePrice'])

In [4]:
y = df['LogSalePrice'].to_numpy()
X = df.drop(columns=['SalePrice', 'LogSalePrice', 'Id'])

In [5]:
X_train, X_valid, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

preprocessor = make_preprocessor(X_train, drop_engineered_cols=True)
X_train = preprocessor.fit_transform(X_train)
feature_names = preprocessor.named_steps["encode"].get_feature_names_out()
X_valid = preprocessor.transform(X_valid)

print(X_train.shape)

(1168, 166)


## Modelling

In [6]:
def rmse(model):
    preds = model.predict(X_valid)
    rmse = np.sqrt(mean_squared_error(y_val, preds))
    
    print('-'*25)
    print(f'RMSE: {rmse:.4f}')
    print('-'*25)

### Linear Regression

In [9]:
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)

rmse(linear_model)

-------------------------
RMSE: 1325328724.1656
-------------------------


### Ridge Regression

In [8]:
ridge_model = Ridge(alpha=2)
ridge_model.fit(X_train, y_train)

rmse(ridge_model)

-------------------------
RMSE: 0.1392
-------------------------


### XGBoost

In [10]:
xgb_model = XGBRegressor(
    n_estimators=1000,
    learning_rate=0.01,
    max_depth=3,
    subsample=0.6,
    colsample_bytree=0.5,
    random_state=42
)

xgb_model.fit(
    X_train,
    y_train,
    eval_set=[(X_valid, y_val)],
    verbose=False
)

rmse(xgb_model)

-------------------------
RMSE: 0.1304
-------------------------


### Hyperparameter Tuning for XGBoost (Randomized Search CV)

In [15]:
xgb_params = {
    'n_estimators': [500, 1000, 1500],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 6, 12],
    'subsample': [0.6, 0.7, 0.8],
    'colsample_bytree': [0.5, 0.6, 0.7],
    'reg_alpha': [0, 0.01, 0.1, 1],
    'reg_lambda': [1, 1.5, 2]
}

xgb = XGBRegressor(random_state=42, n_jobs=-1)

search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=xgb_params,
    n_iter=40, 
    scoring='neg_root_mean_squared_error',
    cv=3,
    verbose=1,
    random_state=42,
    n_jobs=-1
)

search.fit(X_train, y_train)

best_xgb = search.best_estimator_
print(f"Best params: {search.best_params_}")
rmse(best_xgb)


Fitting 3 folds for each of 40 candidates, totalling 120 fits
Best params: {'subsample': 0.8, 'reg_lambda': 2, 'reg_alpha': 0, 'n_estimators': 1000, 'max_depth': 3, 'learning_rate': 0.05, 'colsample_bytree': 0.6}
-------------------------
RMSE: 0.1307
-------------------------


### Hyperparameter Tuning for XGBoost (Optuna)

In [19]:
def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 1000, 4000),
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.1, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 7),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 7),
        'gamma': trial.suggest_float('gamma', 0, 0.5),
        'subsample': trial.suggest_float('subsample', 0.5, 0.9),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 0.9),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0, log=True),
        'random_state': 42,
        'n_jobs': -1
    }
    
    model = XGBRegressor(**params)
    
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_root_mean_squared_error')
    return -scores.mean()

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50) 

print("Best RMSE:", study.best_value)
print("Best params:", study.best_params)

best_xgb_optuna = XGBRegressor(**study.best_params)
best_xgb_optuna.fit(X_train, y_train)

[I 2026-01-17 18:40:17,755] A new study created in memory with name: no-name-ca96bb4e-2ca7-4898-819f-a15db75e0178
[I 2026-01-17 18:40:19,255] Trial 0 finished with value: 0.1330836326432116 and parameters: {'n_estimators': 1328, 'learning_rate': 0.06115110742942363, 'max_depth': 4, 'min_child_weight': 4, 'gamma': 0.13609383705571892, 'subsample': 0.689785097364144, 'colsample_bytree': 0.7021784976132466, 'reg_alpha': 0.03951240086559421, 'reg_lambda': 0.16521377751914576}. Best is trial 0 with value: 0.1330836326432116.
[I 2026-01-17 18:40:22,682] Trial 1 finished with value: 0.13126038735927853 and parameters: {'n_estimators': 2380, 'learning_rate': 0.010590202282172207, 'max_depth': 3, 'min_child_weight': 5, 'gamma': 0.109056239994477, 'subsample': 0.6504863249156063, 'colsample_bytree': 0.7264850971988569, 'reg_alpha': 0.04407747860507449, 'reg_lambda': 8.345776667485923}. Best is trial 1 with value: 0.13126038735927853.
[I 2026-01-17 18:40:26,070] Trial 2 finished with value: 0.140

Best RMSE: 0.12400806677333871
Best params: {'n_estimators': 1704, 'learning_rate': 0.013307975513103329, 'max_depth': 5, 'min_child_weight': 1, 'gamma': 0.007106633317662602, 'subsample': 0.7061446062898165, 'colsample_bytree': 0.5760914933167709, 'reg_alpha': 0.03181356427048448, 'reg_lambda': 0.8460490483048156}


In [20]:
rmse(best_xgb_optuna)

-------------------------
RMSE: 0.1361
-------------------------


### Ensemble Learning

In [24]:
# Best XBoost (from previous tuning)
xgb_best = best_xgb  

# LGBM 
lgbm = LGBMRegressor(
    objective='regression', 
    n_estimators=1000,
    learning_rate=0.05,
    num_leaves=31,
    random_state=42,
    verbose=-1
)

# SVR (Support Vector Regression) - crucial for diversity
svr = SVR(C=20, epsilon=0.008, gamma=0.0003)

# Gradient Boosting 
gbr = GradientBoostingRegressor(
    n_estimators=3000, 
    learning_rate=0.05, 
    max_depth=4, 
    max_features='sqrt', 
    min_samples_leaf=15, 
    min_samples_split=10, 
    loss='huber', 
    random_state=42
)

# Linear models with different regularizations
ridge = Ridge(alpha=13) 
lasso = Lasso(alpha=0.0005, random_state=42)

estimators_diverse = [
    ('xgb', xgb_best),
    ('lgbm', lgbm),
    ('svr', svr),
    ('gbr', gbr),
    ('ridge', ridge),
    ('lasso', lasso)
]

# Bayesian Ridge automaticly tunes regularization parameters,
final_estimator_advanced = BayesianRidge()

# Stacking Regressor
stacking_regressor_v2 = StackingRegressor(
    estimators=estimators_diverse,
    final_estimator=final_estimator_advanced,
    cv=5,  
    n_jobs=-1,
    passthrough=False 
)

print("Training Advanced Stacking Regressor...")
stacking_regressor_v2.fit(X_train, y_train) 

rmse(stacking_regressor_v2)

Training Advanced Stacking Regressor...
-------------------------
RMSE: 0.1295
-------------------------
