In [12]:
import os, sys
import numpy as np
import pandas as pd

from config_local import local_config

In [13]:
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from lightgbm import LGBMRegressor
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [14]:
train = pd.read_csv(local_config.TRAIN_PROCESS6_CSV)
test  = pd.read_csv(local_config.TEST_PROCESS6_CSV)
testRaw = pd.read_csv(local_config.TEST_CSV, index_col="Id")

In [15]:
y = train["logSP"]                 # your target column (log SalePrice)
X = train.drop(columns=["logSP"])  # all features except target

In [16]:
# ==== Cross-validation + Random Search ====
kf = KFold(n_splits=5, shuffle=True, random_state=42)

lgbm = LGBMRegressor(
    objective="regression",
    random_state=42,
    n_jobs=-1,
)

param_dist = {
    "num_leaves": [31, 63],
    "max_depth": [5, 7, -1],
    "learning_rate": [0.05, 0.04, 0.03],
    "n_estimators": [800, 1000, 1200],
    "subsample": [0.8, 1.0],
    "colsample_bytree": [0.7, 0.9],
    "min_child_samples": [10, 20],
}

random_search = RandomizedSearchCV(
    estimator=lgbm,
    param_distributions=param_dist,
    n_iter=30,                  # a bit more combos
    scoring="neg_mean_squared_error",
    cv=5,                       # more reliable CV
    n_jobs=-1,
    verbose=2,
    random_state=42,
)


print("Running RandomizedSearchCV for LightGBM...")
random_search.fit(X, y)

print("\nBest params found:")
print(random_search.best_params_)

best_mse = -random_search.best_score_
best_rmse = best_mse ** 0.5
print(f"\nBest CV RMSE from RandomizedSearch: {best_rmse:.4f}")

Running RandomizedSearchCV for LightGBM...
Fitting 5 folds for each of 30 candidates, totalling 150 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003775 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3337
[LightGBM] [Info] Number of data points in the train set: 1458, number of used features: 175
[LightGBM] [Info] Start training from score 12.024015

Best params found:
{'subsample': 1.0, 'num_leaves': 31, 'n_estimators': 800, 'min_child_samples': 20, 'max_depth': 5, 'learning_rate': 0.03, 'colsample_bytree': 0.7}

Best CV RMSE from RandomizedSearch: 0.1214


In [17]:

# ==== Final model (already refit on all data) ====
best_model = random_search.best_estimator_

# ==== Predictions on test ====
test_pred_log = best_model.predict(test)

# Inverse log transform
test_pred_real = np.exp(test_pred_log)

# ==== Submission (same style as ElasticNet) ====
submission = pd.DataFrame({
    "Id": testRaw.index,
    "SalePrice": test_pred_real
})

out_path = os.path.join(local_config.SUBMISSIONS_DIR, "lightGBM_Model.csv")
submission.to_csv(out_path, index=False)

print(f"Submission saved: {out_path}")

Submission saved: D:\Project\Kaggle\house-prices-starter\data\submissions\lightGBM_Model.csv
