In [14]:
import numpy as np
import pandas as pd
import warnings
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.linear_model import RidgeCV, LassoCV, ElasticNetCV
from sklearn.ensemble import StackingRegressor, GradientBoostingRegressor
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import RobustScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import TransformedTargetRegressor

warnings.filterwarnings('ignore')

df = pd.read_csv("houseprices.csv")
sdf = pd.read_csv("test.csv")

df = df.drop(df[(df['GrLivArea'] > 4000) & (df['SalePrice'] < 300000)].index)

def engineer_features(frame):
    f = frame.copy()
    f['TotalSF'] = f['TotalBsmtSF'].fillna(0) + f['1stFlrSF'].fillna(0) + f['2ndFlrSF'].fillna(0)
    f['TotalBath'] = f['FullBath'].fillna(0) + (0.5 * f['HalfBath'].fillna(0)) + \
                     f['BsmtFullBath'].fillna(0) + (0.5 * f['BsmtHalfBath'].fillna(0))
    f['YearBlfMod'] = f['YearBuilt'] + f['YearRemodAdd']
    f['TotalQual'] = f['OverallQual'] * f['GrLivArea']
    f['TotalPorchSF'] = f['OpenPorchSF'] + f['EnclosedPorch'] + f['3SsnPorch'] + f['ScreenPorch']
    f['Has2ndFlr'] = f['2ndFlrSF'].apply(lambda x: 1 if x > 0 else 0)
    return f

X = engineer_features(df.drop(columns=['Id', 'SalePrice']))
y = df['SalePrice']
X_submission = engineer_features(sdf.drop(columns=['Id']))

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', RobustScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='None')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, make_column_selector(dtype_include=np.number)),
        ('cat', categorical_transformer, make_column_selector(dtype_exclude=np.number))
    ])

xgb = XGBRegressor(n_estimators=3000, learning_rate=0.01, max_depth=4, subsample=0.7, colsample_bytree=0.7, n_jobs=-1, random_state=42)
lgbm = LGBMRegressor(n_estimators=3000, learning_rate=0.01, max_depth=4, num_leaves=31, subsample=0.7, colsample_bytree=0.7, random_state=42, verbosity=-1)
catb = CatBoostRegressor(iterations=3000, learning_rate=0.01, depth=4, l2_leaf_reg=3, loss_function='RMSE', random_seed=42, verbose=False)
gbr = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.01, max_depth=4, max_features='sqrt', loss='huber', random_state=42)

stack = StackingRegressor(
    estimators=[('xgb', xgb), ('lgbm', lgbm), ('catb', catb), ('gbr', gbr)],
    final_estimator=RidgeCV(),
    cv=KFold(n_splits=5, shuffle=True, random_state=42),
    n_jobs=-1
)

model = TransformedTargetRegressor(
    regressor=Pipeline(steps=[('pre', preprocessor), ('reg', stack)]),
    func=np.log1p,
    inverse_func=np.expm1
)

kf = KFold(n_splits=5, shuffle=True, random_state=42)
rmse_scores = np.sqrt(-cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv=kf))
r2_scores = cross_val_score(model, X, y, scoring="r2", cv=kf)

print(f"Blended CV RMSE: {rmse_scores.mean():.4f}")
print(f"Blended CV R2: {r2_scores.mean():.4f}")

model.fit(X, y)

ridge_pipe = TransformedTargetRegressor(Pipeline([('pre', preprocessor), ('reg', RidgeCV())]), func=np.log1p, inverse_func=np.expm1).fit(X, y)
lasso_pipe = TransformedTargetRegressor(Pipeline([('pre', preprocessor), ('reg', LassoCV())]), func=np.log1p, inverse_func=np.expm1).fit(X, y)

p1 = model.predict(X_submission)
p2 = ridge_pipe.predict(X_submission)
p3 = lasso_pipe.predict(X_submission)

final_preds = (0.70 * p1) + (0.15 * p2) + (0.15 * p3)

pd.DataFrame({'Id': sdf['Id'], 'SalePrice': final_preds}).to_csv('submission.csv', index=False)

Blended CV RMSE: 20707.3480
Blended CV R2: 0.9316
