In [43]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Flatten the directory walk into a single stream of full paths, then echo
# each one on its own line (same order os.walk yields them).
input_file_paths = (
    os.path.join(folder, entry)
    for folder, _subdirs, entries in os.walk('/kaggle/input')
    for entry in entries
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv
/kaggle/input/house-prices-advanced-regression-techniques/data_description.txt
/kaggle/input/house-prices-advanced-regression-techniques/train.csv
/kaggle/input/house-prices-advanced-regression-techniques/test.csv


In [81]:
!pip -q install optuna lightgbm catboost

In [44]:
##Load Data

import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, KFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor

# Locations of the competition files inside the Kaggle input directory.
TRAIN_PATH = "/kaggle/input/house-prices-advanced-regression-techniques/train.csv"
TEST_PATH = "/kaggle/input/house-prices-advanced-regression-techniques/test.csv"
SAMPLE_PATH = "/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv"

# Read both tables with the same (default) reader settings.
train, test = (pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, TEST_PATH))

In [45]:
# Split features from target

# Target column, still on its original scale at this point.
y = train["SalePrice"]

# "Id" is dropped from both frames so it is never used as a feature; the
# target is additionally dropped from the training features.
X = train.drop(columns=["SalePrice", "Id"])
X_test_final = test.drop(columns=["Id"])

In [56]:
# Log-transform the target for stability.
#
# Always derive the transformed target from the raw train["SalePrice"] column,
# never from `y` itself: `y = np.log1p(y)` silently compounds the transform
# each time the cell is re-run, while predictions are inverted with a single
# np.expm1 later in the notebook. Writing it this way makes the cell idempotent.
y = np.log1p(train["SalePrice"])

In [57]:
# ---- Add engineered features to BOTH train and test ----
def add_features(df):
    """Return a copy of ``df`` with engineered columns appended.

    The input frame is never modified. Added columns:

    * ``TotalSF``        - TotalBsmtSF + 1stFlrSF + 2ndFlrSF
    * ``TotalBath``      - FullBath + 0.5 * HalfBath + BsmtFullBath
    * ``HouseAge``       - YrSold - YearBuilt
    * ``RemodAge``       - YrSold - YearRemodAdd
    * ``OverallQual_SF`` - OverallQual * GrLivArea
    * ``HasGarage`` / ``HasBasement`` / ``HasFireplace`` - 0/1 flags that are
      1 when GarageArea / TotalBsmtSF / Fireplaces is strictly positive.

    Any source column that is absent from ``df`` is treated as all zeros.
    Missing values (NaN) propagate through the arithmetic features unchanged,
    and count as "not present" (0) for the flag features.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding (a subset of) the source columns listed above.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the engineered columns added (or overwritten if
        they already existed).
    """
    out = df.copy()

    def col(name):
        # A missing column becomes an all-zero Series aligned to the frame.
        # The previous scalar default (df.get(name, 0)) broke the flag
        # features, because `(0 > 0)` is a plain bool with no `.astype`.
        if name in out.columns:
            return out[name]
        return pd.Series(0, index=out.index)

    def flag(name):
        # fillna(0) gives the same result as comparing NaN directly
        # (NaN > 0 is False) but avoids the invalid-comparison RuntimeWarning.
        return col(name).fillna(0).gt(0).astype(int)

    # Size / count aggregates
    out["TotalSF"] = col("TotalBsmtSF") + col("1stFlrSF") + col("2ndFlrSF")
    out["TotalBath"] = col("FullBath") + 0.5 * col("HalfBath") + col("BsmtFullBath")

    # Ages measured relative to the year of sale
    out["HouseAge"] = col("YrSold") - col("YearBuilt")
    out["RemodAge"] = col("YrSold") - col("YearRemodAdd")

    # Quality x size interaction
    out["OverallQual_SF"] = col("OverallQual") * col("GrLivArea")

    # Simple presence flags
    out["HasGarage"] = flag("GarageArea")
    out["HasBasement"] = flag("TotalBsmtSF")
    out["HasFireplace"] = flag("Fireplaces")
    return out

# Run the identical feature engineering over the training and the test frames
# so both end up with the same set of columns.
X, X_test_final = (add_features(frame) for frame in (X, X_test_final))


  return op(a, b)
  return op(a, b)


In [58]:
# Partition the feature columns by dtype: object-dtype columns are handled as
# categorical, every other column as numeric.
categorical = X.select_dtypes(include="object").columns
numeric = X.select_dtypes(exclude="object").columns

In [67]:
# Preprocessor

# Numeric columns: fill missing values with the column median.
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
])

# Categorical columns: fill missing values with the most frequent value, then
# one-hot encode. handle_unknown="ignore" keeps transform from raising on
# categories that were not present when the encoder was fitted.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Route each column group through its own transformer.
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numeric),
    ("cat", categorical_transformer, categorical),
])

In [None]:
# Model: XGBoost regressor with a fixed set of hyperparameters.
xgb_params = dict(
    n_estimators=1538,
    learning_rate=0.01343,
    max_depth=4,
    subsample=0.6962,
    colsample_bytree=0.6221,
    reg_alpha=0.108,
    reg_lambda=1.145,
    min_child_weight=7,
    random_state=42,
    n_jobs=-1,
    tree_method="hist",
)
model = XGBRegressor(**xgb_params)

# Preprocessing and the regressor are chained so both are fitted together.
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", model),
])

In [71]:
#Grid Search
from sklearn.model_selection import GridSearchCV, KFold

# Exhaustive search space for the XGBoost step of `pipeline`
# (3^5 = 243 candidates; each is evaluated on every CV fold).
param_grid = {
    "model__n_estimators": [300, 600, 900],
    "model__max_depth": [3, 4, 5],
    "model__learning_rate": [0.02, 0.05, 0.08],
    "model__subsample": [0.7, 0.8, 1.0],
    "model__colsample_bytree": [0.7, 0.8, 1.0],
}

# Shuffled 5-fold split with a fixed seed; reused by later cells as `cv`.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring="neg_root_mean_squared_error",
    cv=cv, n_jobs=-1, verbose=1
)
grid.fit(X, y)
print("Best params:", grid.best_params_)
# The scorer is negated RMSE, so flip the sign for reporting.
print("Best CV RMSE:", -grid.best_score_)
best_pipeline = grid.best_estimator_

# The per-fold scores of the winning candidate are already stored in
# cv_results_, so read them back instead of re-running the same 5-fold CV on
# best_pipeline (same splitter, same data, same hyperparameters).
scores = np.array([
    grid.cv_results_[f"split{fold}_test_score"][grid.best_index_]
    for fold in range(grid.n_splits_)
])
print(f"CV RMSE (log-price): {-scores.mean():.4f} +/- {scores.std():.4f}")

Fitting 5 folds for each of 243 candidates, totalling 1215 fits
Best params: {'model__colsample_bytree': 1.0, 'model__learning_rate': 0.05, 'model__max_depth': 3, 'model__n_estimators': 900, 'model__subsample': 0.7}
Best CV RMSE: 0.009592154462460738
CV RMSE (log-price): 0.0096 +/- 0.0012


In [85]:
#Optuna Optimization

import optuna
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNet

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

SCORING = "neg_root_mean_squared_error" # CV metric (lower RMSE = better)

def cv_rmse(pipe):
    """Return the mean cross-validated RMSE of ``pipe``.

    Evaluation uses the notebook-level ``X``, ``y`` and ``cv`` splitter. The
    scorer named by ``SCORING`` yields negated RMSE, so the mean is negated
    again to report a positive error where smaller is better.
    """
    fold_scores = cross_val_score(pipe, X, y, scoring=SCORING, cv=cv, n_jobs=-1)
    return -fold_scores.mean()

def make_pipeline(model):
    """Chain the notebook-level ``preprocessor`` with ``model`` in one Pipeline.

    The steps are named "preprocessor" and "model", in that order.
    """
    steps = [
        ("preprocessor", preprocessor),
        ("model", model),
    ]
    return Pipeline(steps=steps)

_SUPPORTED_MODELS = ("xgb", "lgbm", "cat", "rf", "elastic")


def _suggest_params(trial, model_name):
    """Sample one set of tunable hyperparameters for ``model_name`` from ``trial``.

    Trial parameter names are identical to the estimator keyword names, so
    ``study.best_params`` can later be passed straight to the constructor.
    """
    if model_name == "xgb":
        return dict(
            n_estimators=trial.suggest_int("n_estimators", 600, 3000),
            learning_rate=trial.suggest_float("learning_rate", 0.005, 0.2, log=True),
            max_depth=trial.suggest_int("max_depth", 2, 10),
            subsample=trial.suggest_float("subsample", 0.5, 1.0),
            colsample_bytree=trial.suggest_float("colsample_bytree", 0.5, 1.0),
            reg_alpha=trial.suggest_float("reg_alpha", 0.0, 5.0),
            reg_lambda=trial.suggest_float("reg_lambda", 0.0, 5.0),
            min_child_weight=trial.suggest_int("min_child_weight", 1, 10),
        )
    if model_name == "lgbm":
        return dict(
            n_estimators=trial.suggest_int("n_estimators", 800, 5000),
            learning_rate=trial.suggest_float("learning_rate", 0.005, 0.2, log=True),
            num_leaves=trial.suggest_int("num_leaves", 16, 256),
            max_depth=trial.suggest_int("max_depth", -1, 12),
            subsample=trial.suggest_float("subsample", 0.5, 1.0),
            colsample_bytree=trial.suggest_float("colsample_bytree", 0.5, 1.0),
            min_child_samples=trial.suggest_int("min_child_samples", 5, 50),
            reg_alpha=trial.suggest_float("reg_alpha", 0.0, 5.0),
            reg_lambda=trial.suggest_float("reg_lambda", 0.0, 5.0),
        )
    if model_name == "cat":
        return dict(
            iterations=trial.suggest_int("iterations", 800, 4000),
            learning_rate=trial.suggest_float("learning_rate", 0.005, 0.2, log=True),
            depth=trial.suggest_int("depth", 4, 10),
            l2_leaf_reg=trial.suggest_float("l2_leaf_reg", 1.0, 10.0),
            bagging_temperature=trial.suggest_float("bagging_temperature", 0.0, 1.0),
        )
    if model_name == "rf":
        return dict(
            n_estimators=trial.suggest_int("n_estimators", 400, 1600),
            max_depth=trial.suggest_int("max_depth", 6, 30),
            max_features=trial.suggest_categorical("max_features", ["sqrt", "log2", 0.5, 0.7, 0.9]),
            min_samples_split=trial.suggest_int("min_samples_split", 2, 10),
            min_samples_leaf=trial.suggest_int("min_samples_leaf", 1, 5),
        )
    if model_name == "elastic":
        return dict(
            alpha=trial.suggest_float("alpha", 1e-4, 10.0, log=True),
            l1_ratio=trial.suggest_float("l1_ratio", 0.0, 1.0),
        )
    raise ValueError("Unknown model_name")


def _build_model(model_name, tuned_params, random_state):
    """Instantiate the estimator for ``model_name`` from tuned plus fixed settings.

    Used for both the per-trial models and the final refit, so the two can
    never drift apart in their fixed (non-tuned) arguments.
    """
    if model_name == "xgb":
        return XGBRegressor(tree_method="hist", n_jobs=-1, random_state=random_state,
                            verbosity=0, **tuned_params)
    if model_name == "lgbm":
        return LGBMRegressor(objective="regression", n_jobs=-1, random_state=random_state,
                             verbose=0, **tuned_params)
    if model_name == "cat":
        return CatBoostRegressor(loss_function="RMSE", random_state=random_state,
                                 verbose=False, **tuned_params)
    if model_name == "rf":
        return RandomForestRegressor(n_jobs=-1, random_state=random_state, **tuned_params)
    if model_name == "elastic":
        return ElasticNet(max_iter=20000, random_state=random_state, **tuned_params)
    raise ValueError("Unknown model_name")


def tune_with_optuna(model_name, n_trials=40, random_state=42):
    """Tune one model family with Optuna and refit the winner on all data.

    Each trial samples hyperparameters, wraps the resulting estimator with
    ``make_pipeline`` and scores it with ``cv_rmse`` (lower is better). After
    the study, the best configuration is rebuilt and fitted on the
    notebook-level ``X`` and ``y``.

    Parameters
    ----------
    model_name : str
        One of "xgb", "lgbm", "cat", "rf" or "elastic".
    n_trials : int, default 40
        Number of Optuna trials to run.
    random_state : int, default 42
        Seed used for the Optuna sampler, for every trial model and for the
        final refitted model.

    Returns
    -------
    tuple
        ``(best_cv_rmse, fitted_pipeline)``.

    Raises
    ------
    ValueError
        If ``model_name`` is not one of the supported keys.
    """
    # Fail fast instead of only discovering a bad key inside the first trial.
    if model_name not in _SUPPORTED_MODELS:
        raise ValueError("Unknown model_name")

    def objective(trial):
        model = _build_model(model_name, _suggest_params(trial, model_name), random_state)
        return cv_rmse(make_pipeline(model))

    # Seed the sampler so that repeated runs explore the same trial sequence.
    sampler = optuna.samplers.TPESampler(seed=random_state)
    study = optuna.create_study(direction="minimize", sampler=sampler)
    study.optimize(objective, n_trials=n_trials, show_progress_bar=False)
    print(f"{model_name} best RMSE:", study.best_value)
    print(f"{model_name} best params:", study.best_params)

    # Rebuild the best model with the caller's random_state (previously this
    # was hard-coded to 42, ignoring the argument) and refit on all data.
    best_model = _build_model(model_name, study.best_params, random_state)
    best_pipe = make_pipeline(best_model)
    best_pipe.fit(X, y)
    return study.best_value, best_pipe

In [89]:
# Model tuning: (model key, trial budget) pairs, tuned in this order.
# "elastic" serves as the linear-ish baseline.
tuning_plan = [("xgb", 2), ("cat", 2), ("rf", 2), ("elastic", 4)]

(rmse_xgb, pipe_xgb), (rmse_cat, pipe_cat), (rmse_rf, pipe_rf), (rmse_en, pipe_en) = [
    tune_with_optuna(model_key, n_trials=budget)
    for model_key, budget in tuning_plan
]

[I 2025-08-19 18:31:51,943] A new study created in memory with name: no-name-1185d4ba-2b6d-4bf2-82b0-96a7e5f1038d
[I 2025-08-19 18:31:56,640] Trial 0 finished with value: 0.01828748125053436 and parameters: {'n_estimators': 1639, 'learning_rate': 0.03729953474357719, 'max_depth': 2, 'subsample': 0.534442600686247, 'colsample_bytree': 0.6657290882384869, 'reg_alpha': 2.691356439065573, 'reg_lambda': 1.3573650869402858, 'min_child_weight': 6}. Best is trial 0 with value: 0.01828748125053436.
[I 2025-08-19 18:31:58,408] Trial 1 finished with value: 0.01907823700117493 and parameters: {'n_estimators': 1439, 'learning_rate': 0.17315751700933962, 'max_depth': 5, 'subsample': 0.9608284810654639, 'colsample_bytree': 0.5508316432485123, 'reg_alpha': 4.624524458300815, 'reg_lambda': 3.151033550345112, 'min_child_weight': 7}. Best is trial 0 with value: 0.01828748125053436.


xgb best RMSE: 0.01828748125053436
xgb best params: {'n_estimators': 1639, 'learning_rate': 0.03729953474357719, 'max_depth': 2, 'subsample': 0.534442600686247, 'colsample_bytree': 0.6657290882384869, 'reg_alpha': 2.691356439065573, 'reg_lambda': 1.3573650869402858, 'min_child_weight': 6}


[I 2025-08-19 18:31:59,294] A new study created in memory with name: no-name-4cb257cb-db37-47af-bb51-9bfa5ea2643e
[I 2025-08-19 18:32:18,907] Trial 0 finished with value: 0.010262338875935837 and parameters: {'iterations': 1123, 'learning_rate': 0.008455146216847837, 'depth': 4, 'l2_leaf_reg': 7.436269445495387, 'bagging_temperature': 0.7816580795268946}. Best is trial 0 with value: 0.010262338875935837.
[I 2025-08-19 18:34:10,520] Trial 1 finished with value: 0.00957426284014209 and parameters: {'iterations': 2407, 'learning_rate': 0.03465784217254741, 'depth': 7, 'l2_leaf_reg': 5.486829064854026, 'bagging_temperature': 0.4631358191156607}. Best is trial 1 with value: 0.00957426284014209.


cat best RMSE: 0.00957426284014209
cat best params: {'iterations': 2407, 'learning_rate': 0.03465784217254741, 'depth': 7, 'l2_leaf_reg': 5.486829064854026, 'bagging_temperature': 0.4631358191156607}


[I 2025-08-19 18:34:30,808] A new study created in memory with name: no-name-a065d36b-0948-4c52-9570-6fec13128c16
[I 2025-08-19 18:34:45,321] Trial 0 finished with value: 0.011002217620392569 and parameters: {'n_estimators': 1151, 'max_depth': 11, 'max_features': 'sqrt', 'min_samples_split': 5, 'min_samples_leaf': 1}. Best is trial 0 with value: 0.011002217620392569.
[I 2025-08-19 18:35:22,314] Trial 1 finished with value: 0.010855702534769543 and parameters: {'n_estimators': 540, 'max_depth': 13, 'max_features': 0.9, 'min_samples_split': 6, 'min_samples_leaf': 5}. Best is trial 1 with value: 0.010855702534769543.


rf best RMSE: 0.010855702534769543
rf best params: {'n_estimators': 540, 'max_depth': 13, 'max_features': 0.9, 'min_samples_split': 6, 'min_samples_leaf': 5}


[I 2025-08-19 18:35:36,253] A new study created in memory with name: no-name-ef6f7f37-061e-4d60-b342-5e21bf28b66b
[I 2025-08-19 18:35:37,083] Trial 0 finished with value: 0.014990282253553425 and parameters: {'alpha': 0.255469995468009, 'l1_ratio': 0.28115274131067136}. Best is trial 0 with value: 0.014990282253553425.
[I 2025-08-19 18:35:41,313] Trial 1 finished with value: 0.012720202854787768 and parameters: {'alpha': 0.001336671297321851, 'l1_ratio': 0.6919945142237351}. Best is trial 1 with value: 0.012720202854787768.
[I 2025-08-19 18:35:42,180] Trial 2 finished with value: 0.014317086150148772 and parameters: {'alpha': 0.02892065928862604, 'l1_ratio': 0.24919607780936293}. Best is trial 1 with value: 0.012720202854787768.
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
[I 2025-08-19 18:36:11,655] Trial 3 finished with value: 0.011328678

elastic best RMSE: 0.011328678184941524
elastic best params: {'alpha': 0.00010789618989867691, 'l1_ratio': 0.9935030452682905}


  model = cd_fast.sparse_enet_coordinate_descent(


In [90]:
# Score the test set with every tuned pipeline; all outputs stay in log space.
preds_log_xgb, preds_log_cat, preds_log_rf, preds_log_en = (
    fitted_pipe.predict(X_test_final)
    for fitted_pipe in (pipe_xgb, pipe_cat, pipe_rf, pipe_en)
)

# ==== Ensemble ====
# Option A: unweighted mean of the four models
preds_log_blend = (preds_log_xgb + preds_log_cat + preds_log_rf + preds_log_en) / 4.0

# Option B: weights proportional to 1 / RMSE^2, normalised to sum to one,
# so a lower CV RMSE earns a larger share of the blend
rmse_list = np.asarray([rmse_xgb, rmse_cat, rmse_rf, rmse_en], dtype=float)
inverse_sq_rmse = 1.0 / (rmse_list ** 2)
weights = inverse_sq_rmse / inverse_sq_rmse.sum()
preds_log_weighted = (weights[0]*preds_log_xgb +
                      weights[1]*preds_log_cat +
                      weights[2]*preds_log_rf +
                      weights[3]*preds_log_en)

# Pick the blend that goes into the submission.
use_weighted = True
if use_weighted:
    preds_log_final = preds_log_weighted
else:
    preds_log_final = preds_log_blend

# Undo log1p to return to the price scale, then floor at zero.
preds_final = np.clip(np.expm1(preds_log_final), 0, None)

In [91]:
from sklearn.model_selection import cross_val_score

# Pipelines to compare under the shared CV splitter.
# (A stacked pipeline could be added here under the key "Stacked" if one is created.)
candidates = dict([
    ("Baseline XGB", pipeline),
    ("Tuned XGB", best_pipeline),
])

for name, pipe in candidates.items():
    # The scorer returns negated RMSE; negate it so the report shows positive errors.
    neg_rmse_per_fold = cross_val_score(
        pipe, X, y, scoring="neg_root_mean_squared_error", cv=cv, n_jobs=-1
    )
    scores = -neg_rmse_per_fold
    print(f"{name:15s} CV RMSE: {scores.mean():.5f} (+/- {scores.std():.5f})")

Baseline XGB    CV RMSE: 0.00974 (+/- 0.00123)
Tuned XGB       CV RMSE: 0.00959 (+/- 0.00124)


In [92]:
# Submission

# Reuse SAMPLE_PATH (defined with the other data paths) instead of repeating
# the literal path, so there is one place to change it.
sub = pd.read_csv(SAMPLE_PATH)
sub["SalePrice"] = preds_final
sub.to_csv("submission.csv", index=False)
# The blend is built from the XGB, CatBoost, RF and ElasticNet pipelines only;
# no LightGBM model is tuned or predicted with, so it is not listed here.
print("Saved submission.csv — Ensemble of XGB + CatBoost + RF + ElasticNet")

Saved submission.csv — Ensemble of XGB + LGBM + CatBoost + RF + ElasticNet
