In [10]:
import json
from pathlib import Path

import joblib
import numpy as np,pandas as pd
from sklearn.compose import ColumnTransformer, make_column_selector as selector
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from xgboost import XGBRegressor

In [8]:
# Load the training table, then separate the target from the predictors.
train = pd.read_csv("../data/train.csv")

# The target is modelled on a log1p scale; "Id" is dropped from the features
# together with the target column itself.
y = np.log1p(train["SalePrice"])
X = train.drop(columns=["SalePrice", "Id"])

In [12]:
# Partition the feature columns by dtype: numeric vs. everything else.
pick_numeric = selector(dtype_include=np.number)
pick_non_numeric = selector(dtype_exclude=np.number)

num_cols = pick_numeric(X)
cat_cols = pick_non_numeric(X)

In [13]:
# Numeric branch: median imputation followed by standard scaling.
numeric_steps = [
    ("imp", SimpleImputer(strategy="median")),
    ("scale", StandardScaler()),
]
numeric_tf = Pipeline(steps=numeric_steps)

# Categorical branch: most-frequent imputation followed by one-hot encoding
# (unknown categories ignored, dense output requested).
categorical_steps = [
    ("imp", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
]
categorical_tf = Pipeline(steps=categorical_steps)

In [14]:
# Route each column group through its matching preprocessing branch.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_tf, num_cols),
        ("cat", categorical_tf, cat_cols),
    ]
)

In [15]:
# Gradient-boosted regressor that sits at the end of the preprocessing pipeline.
#
# The model is kept single-threaded on purpose: the RandomizedSearchCV that
# tunes this pipeline already runs with n_jobs=-1, and letting both layers
# claim every core creates thread contention, which the XGBoost documentation
# warns slows both of them down. Parallelism is therefore left to the search.
xgb = XGBRegressor(random_state=42, n_jobs=1)

# Full estimator: preprocessing followed by the regressor. The step name
# "model" is what the `model__*` keys of the search space refer to.
pipe = Pipeline([
    ("pre", preprocessor),
    ("model", xgb),
])

In [18]:
# Candidate values for each regressor hyper-parameter, keyed by the bare
# parameter name.
_model_grid = {
    "n_estimators": [200, 400, 600, 800],
    "max_depth": [3, 4, 5, 6, 7],
    "learning_rate": [0.01, 0.05, 0.1],
    "subsample": [0.6, 0.8, 1.0],
    "colsample_bytree": [0.6, 0.8, 1.0],
    "min_child_weight": [1, 3, 5],
}

# Search space handed to the randomized search: every key carries the
# `model__` prefix so it addresses the pipeline step named "model".
param_dist = {f"model__{name}": values for name, values in _model_grid.items()}

# Hold out 20% of the rows as a validation split; the seed is fixed so the
# split is repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)



In [21]:
# Randomized hyper-parameter search over the whole pipeline: 20 sampled
# candidates, 5-fold cross-validation, scored by negated RMSE.
rs = RandomizedSearchCV(
    pipe,
    param_dist,
    n_iter=20,
    cv=5,
    scoring="neg_root_mean_squared_error",
    n_jobs=-1,
    random_state=42,
    verbose=1,
)
rs.fit(X_train, y_train)

# Evaluate the best estimator found by the search on the held-out split.
best_model = rs.best_estimator_
val_preds = best_model.predict(X_val)
val_mse = mean_squared_error(y_val, val_preds)
rmse = np.sqrt(val_mse)

print(f"Validation RMSE: {rmse:.4f}")
# The scorer is the *negated* RMSE, so the sign is flipped for reporting.
print("Best Cross-Validation RMSE:", -rs.best_score_)
print("Best Params:", rs.best_params_)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Validation RMSE: 0.1331
Best Cross-Validation RMSE: 0.12492494873613373
Best Params: {'model__subsample': 0.6, 'model__n_estimators': 400, 'model__min_child_weight': 1, 'model__max_depth': 4, 'model__learning_rate': 0.05, 'model__colsample_bytree': 0.8}


In [22]:
# Persist the tuned pipeline and the winning hyper-parameters.
# (json, joblib and Path are imported in the notebook's top import cell.)
model_path = Path("../models/xgb_best.joblib")
report_path = Path("../reports/tuning_results.json")

# Both joblib.dump and open(..., "w") raise FileNotFoundError when the target
# directory does not exist, so create the output folders up front. This keeps
# the cell working on a fresh checkout.
for out_path in (model_path, report_path):
    out_path.parent.mkdir(parents=True, exist_ok=True)

joblib.dump(rs.best_estimator_, model_path)
with open(report_path, "w", encoding="utf-8") as f:
    json.dump(rs.best_params_, f, indent=4)