In [3]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
import numpy as np, pandas as pd, joblib, json, os 
from sklearn.compose import ColumnTransformer, make_column_selector as selector
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer


In [2]:
# Load the training table. The target is modelled on the log1p scale, so
# predictions made later must be mapped back with expm1.
train = pd.read_csv("../data/train.csv")
y = np.log1p(train["SalePrice"])
# Features are every column except the target and the "Id" column.
X = train.drop(["SalePrice", "Id"], axis="columns")

In [4]:
# Load the test table. "Id" is removed from the feature frame only; it stays
# on `test` because the submission is keyed by it later.
test = pd.read_csv("../data/test.csv")
X_test = test.drop("Id", axis="columns")

In [5]:
# Split the feature columns by dtype: numeric columns in one list, every
# other dtype in the second list.
pick_numeric = selector(dtype_include=np.number)
pick_other = selector(dtype_exclude=np.number)
num_cols = pick_numeric(X)
cat_cols = pick_other(X)

In [6]:
# Numeric branch: fill missing values with the column median, then standardize.
numeric_steps = [
    ("imp", SimpleImputer(strategy="median")),
    ("scale", StandardScaler()),
]
numeric_tf = Pipeline(steps=numeric_steps)

# Non-numeric branch: fill missing values with the most frequent value, then
# one-hot encode into a dense array. handle_unknown="ignore" keeps transform
# from raising on categories that were not present during fit.
categorical_steps = [
    ("imp", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
]
categorical_tf = Pipeline(steps=categorical_steps)

In [8]:
# Route each column group through its own sub-pipeline: numeric columns go
# to numeric_tf, the remaining columns to categorical_tf.
column_routes = [
    ("num", numeric_tf, num_cols),
    ("cat", categorical_tf, cat_cols),
]
preprocessor = ColumnTransformer(transformers=column_routes)

In [9]:
# Fixed XGBoost hyperparameters.
# NOTE(review): presumably the winners of an earlier search - confirm where
# these values came from.
best_params = dict(
    n_estimators=600,
    max_depth=5,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_weight=3,
)

# Preprocessing and the regressor live in one pipeline so that the same
# transformations are applied at fit time and at predict time.
regressor = XGBRegressor(**best_params, random_state=42, n_jobs=-1)
pipe = Pipeline(steps=[("pre", preprocessor), ("model", regressor)])

# No hold-out split in this cell: the pipeline is fit on every row of X.
pipe.fit(X, y)
print("Model Training complete on full data")

Model Training complete on full data


In [11]:
# Predict on the test features. The pipeline was fit on log1p(SalePrice),
# so expm1 maps the predictions back to the original price scale.
test_preds_log = pipe.predict(X_test)
test_preds = np.expm1(test_preds_log)  # reverse of log1p

# Pair every prediction with the Id of the test row it belongs to.
submission = pd.DataFrame({
    "Id": test["Id"],
    "SalePrice": test_preds
})

# Create the output directory if needed, then write the file without the
# DataFrame index so the CSV holds only the Id and SalePrice columns.
os.makedirs("../output", exist_ok=True)
submission.to_csv("../output/submission.csv", index=False)

# Status message; the file name is spelled to match the file written above.
print("submission.csv created successfully in 'output/' folder! ")
print(submission.head())

submnission.csv created successfully in 'output/' folder! 
     Id      SalePrice
0  1461  125958.687500
1  1462  152936.140625
2  1463  188810.109375
3  1464  195431.625000
4  1465  185454.531250


In [12]:
# Persist the fitted pipeline (preprocessing + model) as a single artifact.
model_dir = "../models"
os.makedirs(model_dir, exist_ok=True)
joblib.dump(pipe, "../models/xgb_final_model.joblib")

# Store the hyperparameters next to the reports as indented JSON.
report_dir = "../reports"
os.makedirs(report_dir, exist_ok=True)
with open("../reports/best_params.json", "w") as params_file:
    params_file.write(json.dumps(best_params, indent=4))

print("Model and parameters saved successfully.")

Model and parameters saved successfully.


In [13]:
# Quick sanity report on the submission frame: dimensions, total count of
# missing cells, and the smallest and largest predicted price.
n_missing = submission.isna().sum().sum()
sale_prices = submission["SalePrice"]

print("Submission file shape:", submission.shape)
print("Any nulls?", n_missing)
print("Predictions range:", sale_prices.min(), "-", sale_prices.max())

Submission file shape: (1459, 2)
Any nulls? 0
Predictions range: 43637.28125 - 569053.1875
