In [3]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.cluster import KMeans

import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor
import optuna

RANDOM_STATE = 42

# ===========================================
# 1) LOAD DATA
# ===========================================
data = fetch_california_housing()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = data.target.copy()
X.columns = X.columns.str.lower()

# ===========================================
# 2) WINSORIZING (IQR)
# ===========================================
def winsorize_iqr(df, k=1.5):
    w = df.copy()
    for col in df.columns:
        Q1 = w[col].quantile(0.25)
        Q3 = w[col].quantile(0.75)
        IQR = Q3 - Q1
        lower = Q1 - k * IQR
        upper = Q3 + k * IQR
        w[col] = np.clip(w[col], lower, upper)
    return w

X = winsorize_iqr(X)

# ===========================================
# 3) FEATURE ENGINEERING
# ===========================================
X["bedrooms_per_room"] = X["avebedrms"] / (X["averooms"] + 1e-6)
X["population_density"] = X["population"] / (X["aveoccup"] + 1e-6)
X["log_medinc"] = np.log1p(X["medinc"])

# KMeans GEO clustering
kmeans = KMeans(n_clusters=10, random_state=RANDOM_STATE)
X["geo_cluster"] = kmeans.fit_predict(X[["latitude", "longitude"]])

# KNN local price
knn = KNeighborsRegressor(n_neighbors=15, weights="distance")
knn.fit(X[["latitude", "longitude"]], y)
X["knn_price"] = knn.predict(X[["latitude", "longitude"]])

print("Feature count:", X.shape[1])

# TARGET LOG TRANSFORM
y_log = np.log1p(y)

# TRAIN/TEST SPLIT + SCALING
X_train, X_test, y_train, y_test = train_test_split(
    X.values, y_log, test_size=0.2, random_state=RANDOM_STATE
)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# ===========================================
# 4) BASE MODELS
# ===========================================
xgb_model = xgb.XGBRegressor(
    n_estimators=1500, learning_rate=0.03,
    max_depth=6, subsample=0.9, colsample_bytree=0.8,
    tree_method="hist", random_state=RANDOM_STATE
)

lgb_model = lgb.LGBMRegressor(
    n_estimators=1500, learning_rate=0.03,
    num_leaves=80, subsample=0.9, colsample_bytree=0.8,
    random_state=RANDOM_STATE
)

cat_model = CatBoostRegressor(
    iterations=1500, learning_rate=0.03,
    depth=6, random_seed=RANDOM_STATE, verbose=False
)

models = [xgb_model, lgb_model, cat_model]

# ===========================================
# 5) K-FOLD OOF
# ===========================================
def get_oof(model):
    kf = KFold(n_splits=3, shuffle=True, random_state=RANDOM_STATE)
    oof = np.zeros_like(y_train)
    test_preds = np.zeros((3, len(y_test)))

    for i, (tr, val) in enumerate(kf.split(X_train)):
        m = model
        m.fit(X_train[tr], y_train[tr])
        oof[val] = m.predict(X_train[val])
        test_preds[i] = m.predict(X_test)

    return oof, test_preds.mean(axis=0)

oofs = []
tests = []

print("\nTraining base models...")
for m in models:
    o, t = get_oof(m)
    oofs.append(o)
    tests.append(t)
    print(f"  Model {m.__class__.__name__} OOF RMSE:", np.sqrt(mean_squared_error(y_train, o)))

# ===========================================
# 6) OPTUNA ENSEMBLE WEIGHTS
# ===========================================
def ensemble_rmse(weights):
    w = np.array(weights)
    w = w / w.sum()
    pred = np.zeros_like(oofs[0])
    for i in range(len(oofs)):
        pred += w[i] * oofs[i]
    return np.sqrt(mean_squared_error(y_train, pred))

def objective(trial):
    ws = [trial.suggest_float(f"w{i}", 0, 1) for i in range(len(models))]
    return ensemble_rmse(ws)

study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=40, show_progress_bar=False)

best_w = np.array([study.best_params[f"w{i}"] for i in range(len(models))])
best_w = best_w / best_w.sum()
print("\nBest weights:", best_w)

# ===========================================
# 7) FINAL TEST PREDICTION
# ===========================================
ens_test = np.zeros_like(tests[0])
for w, t in zip(best_w, tests):
    ens_test += w * t

rmse = np.sqrt(mean_squared_error(np.expm1(y_test), np.expm1(ens_test)))
print("\n========================")
print(" FINAL TEST RMSE:", rmse)
print("========================")


Feature count: 13

Training base models...
  Model XGBRegressor OOF RMSE: 0.07509082241032099
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000739 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2869
[LightGBM] [Info] Number of data points in the train set: 11008, number of used features: 13
[LightGBM] [Info] Start training from score 1.056618
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000660 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2869
[LightGBM] [Info] Number of data points in the train set: 11008, number of used features: 13
[LightGBM] [Info] Start training from score 1.057530
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000808 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2868
[LightGBM] [Info] Number o

[I 2025-12-09 17:24:56,588] A new study created in memory with name: no-name-c2942b34-30ea-4dee-a692-6e9006babca8
[I 2025-12-09 17:24:56,594] Trial 0 finished with value: 0.07300901097451133 and parameters: {'w0': 0.824111534178575, 'w1': 0.618659728281595, 'w2': 0.8095639779271999}. Best is trial 0 with value: 0.07300901097451133.
[I 2025-12-09 17:24:56,597] Trial 1 finished with value: 0.07332519101656457 and parameters: {'w0': 0.8674805044309565, 'w1': 0.6124279250159466, 'w2': 0.5401562194178464}. Best is trial 0 with value: 0.07300901097451133.
[I 2025-12-09 17:24:56,599] Trial 2 finished with value: 0.07294660725813408 and parameters: {'w0': 0.36878701740290065, 'w1': 0.2898798409279797, 'w2': 0.4021737551495408}. Best is trial 2 with value: 0.07294660725813408.
[I 2025-12-09 17:24:56,601] Trial 3 finished with value: 0.07295126305656814 and parameters: {'w0': 0.1386305272745546, 'w1': 0.9235439121196705, 'w2': 0.7946375490288725}. Best is trial 2 with value: 0.07294660725813408.

  Model CatBoostRegressor OOF RMSE: 0.07212925033367072


[I 2025-12-09 17:24:56,798] Trial 31 finished with value: 0.07218302766489323 and parameters: {'w0': 0.08114951447012972, 'w1': 0.11619279142057216, 'w2': 0.6132526867921302}. Best is trial 24 with value: 0.07216513568015427.
[I 2025-12-09 17:24:56,807] Trial 32 finished with value: 0.07226469049904458 and parameters: {'w0': 0.07650688162936439, 'w1': 0.1381449690354789, 'w2': 0.47027092114941793}. Best is trial 24 with value: 0.07216513568015427.
[I 2025-12-09 17:24:56,817] Trial 33 finished with value: 0.07253317341690954 and parameters: {'w0': 0.2395685840912828, 'w1': 0.26402691081973795, 'w2': 0.5800276586143724}. Best is trial 24 with value: 0.07216513568015427.
[I 2025-12-09 17:24:56,823] Trial 34 finished with value: 0.07213177732764255 and parameters: {'w0': 0.08071851293088785, 'w1': 0.07546931318458584, 'w2': 0.787899445030191}. Best is trial 34 with value: 0.07213177732764255.
[I 2025-12-09 17:24:56,835] Trial 35 finished with value: 0.07218571918658392 and parameters: {'w0


Best weights: [0.085499   0.07993892 0.83456209]

 FINAL TEST RMSE: 0.25188199526878957
