## Imports and Setup

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder, StandardScaler
from xgboost import XGBRegressor

## Load Data

In [2]:
# Read the training and test splits from the local data directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("./data/train.csv", "./data/test.csv")
)

## Save Test IDs

In [3]:
# Keep the test-set identifiers aside: "Id" is dropped from the feature frame
# below but is needed again when the submission file is built.
test_ids = test.loc[:, "Id"]

## Combine Train and Test for Feature Engineering

In [4]:
# Stack train (minus the target) on top of test so that imputation, encoding
# and feature engineering are applied identically to both.
# ignore_index=True gives the combined frame a unique 0..n-1 index; without it
# the row labels of train and test overlap, and any later label-based
# alignment would run into duplicate labels. The positional (iloc) split back
# into train/test further down is unaffected by this.
all_data = pd.concat(
    [train.drop(columns="SalePrice"), test],
    sort=False,
    ignore_index=True,
).drop(columns="Id")  # "Id" is an identifier, not a feature

## Impute Missing Values ("None" for categoricals, median for numerics)

In [5]:
# Fill gaps column by column: text (object) columns get the literal
# placeholder "None", every other column gets its own median.
for name in all_data.columns:
    column = all_data[name]
    is_text = column.dtype == "object"
    fill_value = "None" if is_text else column.median()
    all_data[name] = column.fillna(fill_value)

## Label encode categorical features

In [6]:
# Replace each text (object) column with integer codes; a fresh encoder is
# fitted per column on its string representation.
categorical_cols = all_data.select_dtypes(include="object").columns
for name in categorical_cols:
    as_text = all_data[name].astype(str)
    all_data[name] = LabelEncoder().fit_transform(as_text)

## Feature engineering

In [7]:
# Derived features. (The original cell contained this whole block twice,
# verbatim; the second copy recomputed identical columns and has been removed.)

# Total square footage across basement and both floors.
all_data["TotalSF"] = all_data["TotalBsmtSF"] + all_data["1stFlrSF"] + all_data["2ndFlrSF"]

# Bathroom count, with half baths weighted as 0.5.
all_data["TotalBath"] = (
    all_data["FullBath"] + all_data["BsmtFullBath"] +
    0.5 * (all_data["HalfBath"] + all_data["BsmtHalfBath"])
)

# Years between construction / last remodel and the sale.
all_data["HouseAge"] = all_data["YrSold"] - all_data["YearBuilt"]
all_data["RemodAge"] = all_data["YrSold"] - all_data["YearRemodAdd"]


## Log-transform skewed features

In [8]:
# Rank every column by skewness and pick those whose absolute skew exceeds 0.75.
skewed_feats = all_data.apply(lambda x: x.skew()).sort_values(ascending=False)
skewed = skewed_feats[abs(skewed_feats) > 0.75].index

# log1p is only finite for inputs > -1. Columns such as HouseAge / RemodAge are
# differences of years and can be negative, so drop any selected column whose
# minimum is <= -1 rather than silently writing -inf / NaN into the features.
log1p_safe = (all_data[skewed].min() > -1).to_numpy()
skewed = skewed[log1p_safe]

all_data[skewed] = np.log1p(all_data[skewed])

## Split back to train/test

In [9]:
# The first len(train) rows of the combined frame are the training rows; the
# remainder belongs to the test set.
n_train = train.shape[0]
X, X_test = all_data.iloc[:n_train], all_data.iloc[n_train:]

# The target is modelled on the log1p scale.
y = np.log1p(train["SalePrice"])

## Standardize features

In [10]:
# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both splits.
scaler = StandardScaler().fit(X)
X = scaler.transform(X)
X_test = scaler.transform(X_test)

## Model definition (example hyperparameters; tune further for best results)

In [11]:
# Hyperparameters for the gradient-boosted regressor, collected in one place
# so they are easy to inspect and adjust.
xgb_params = {
    "n_estimators": 5000,
    "learning_rate": 0.01,
    "max_depth": 3,
    "subsample": 0.7,
    "colsample_bytree": 0.7,
    "random_state": 42,
}
model = XGBRegressor(**xgb_params)

## Cross-validation for robust score

In [12]:
# 5-fold shuffled cross-validation with a fixed seed.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# The scorer returns negated RMSE (higher is better), so flip the sign back.
neg_rmse = cross_val_score(
    model, X, y, cv=kf, scoring="neg_root_mean_squared_error"
)
cv_scores = -neg_rmse
print("CV RMSE:", np.mean(cv_scores))

CV RMSE: 0.12485184972501502


## Train on full data

In [13]:
# Fit on all training rows, then predict the test set. The target was
# log1p-transformed, so expm1 maps predictions back to the price scale.
model.fit(X, y)
log_preds = model.predict(X_test)
preds = np.expm1(log_preds)

## Submission

In [14]:
# Pair each saved test Id with its predicted price and write the result
# without the index column.
submission = test_ids.to_frame(name="Id").assign(SalePrice=preds)
submission.to_csv("./data/submission.csv", index=False)
print("submission.csv ready! (./data/submission.csv)")

submission.csv ready! (./data/submission.csv)
