In [2]:
import os, sys
import numpy as np
import pandas as pd

from config_local import local_config

In [3]:
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from sklearn.model_selection import KFold, GridSearchCV, RandomizedSearchCV

In [4]:
train = pd.read_csv(local_config.TRAIN_PROCESS6_CSV)
test  = pd.read_csv(local_config.TEST_PROCESS6_CSV)
testRaw = pd.read_csv(local_config.TEST_CSV, index_col="Id")

In [5]:
y = train["logSP"]                 # your target column (log SalePrice)
X = train.drop(columns=["logSP"])  # all features except target

In [6]:
# ==== Cross-validation + Random Search ====
kf = KFold(n_splits=5, shuffle=True, random_state=42)

xgb = XGBRegressor(
    objective="reg:squarederror",
    random_state=42,
    n_jobs=-1,
    eval_metric="rmse",
)

param_dist = {
    "n_estimators": [800, 1000, 1200],
    "learning_rate": [0.05, 0.04, 0.03],
    "max_depth": [3, 4, 5],
    "subsample": [0.8, 1.0],
    "colsample_bytree": [0.7, 0.9],
}

random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_dist,
    n_iter=30,                  # a bit more combos
    scoring="neg_mean_squared_error",
    cv=5,                       # more reliable CV
    n_jobs=-1,
    verbose=2,
    random_state=42,
)

print("Running RandomizedSearchCV for XGBoost...")
random_search.fit(X.values, y.values)   # <-- use .values to avoid feature name issues

print("\nBest params found:")
print(random_search.best_params_)

best_mse = -random_search.best_score_
best_rmse = best_mse ** 0.5
print(f"\nBest CV RMSE from RandomizedSearch: {best_rmse:.4f}")

Running RandomizedSearchCV for XGBoost...
Fitting 5 folds for each of 30 candidates, totalling 150 fits

Best params found:
{'subsample': 0.8, 'n_estimators': 800, 'max_depth': 3, 'learning_rate': 0.03, 'colsample_bytree': 0.7}

Best CV RMSE from RandomizedSearch: 0.1156


In [8]:
# ==== Final model (already refit on all data) ====
best_model = random_search.best_estimator_

# ==== Predictions on test ====
test_pred_log = best_model.predict(test)

# Inverse log transform (same as LightGBM)
test_pred_real = np.exp(test_pred_log)
# or, if you used log1p: test_pred_real = np.expm1(test_pred_log)

# ==== Submission (same style as ElasticNet / LightGBM) ====
submission = pd.DataFrame({
    "Id": testRaw.index,
    "SalePrice": test_pred_real
})

out_path = os.path.join(local_config.SUBMISSIONS_DIR, "xgboost_Model.csv")
submission.to_csv(out_path, index=False)

print(f"Submission saved: {out_path}")

Submission saved: D:\Project\Kaggle\house-prices-starter\data\submissions\xgboost_Model.csv
