In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import RidgeCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import HistGradientBoostingRegressor

import os
# List every file available under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv
/kaggle/input/house-prices-advanced-regression-techniques/data_description.txt
/kaggle/input/house-prices-advanced-regression-techniques/train.csv
/kaggle/input/house-prices-advanced-regression-techniques/test.csv


In [2]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

def evaluate(y_true, y_pred):
    """Print MAE, RMSE and R² for a set of predictions.

    Args:
        y_true: Ground-truth target values (1-D array-like).
        y_pred: Predicted target values, same length as ``y_true``.

    Returns:
        None. The three metrics are written to stdout.

    Note:
        The metrics are reported in whatever scale the inputs are in; the
        ``$`` prefix is only a label and no unit conversion is performed.
    """
    mae = mean_absolute_error(y_true, y_pred)
    # `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6,
    # where it raises TypeError. Taking the square root of the MSE works on
    # every version and matches how the RMSE is computed elsewhere in this
    # notebook.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)

    print(f"MAE: ${mae:,.3f}")
    print(f"RMSE: ${rmse:,.3f}")
    print(f"R²: {r2:.3f}")


In [3]:
# Feature engineering function
def create_features(df):
    """Build the model feature matrix from a raw house-prices frame.

    The input frame is not modified. Engineered columns:

    * ``TotalSF``      - log1p of basement + first-floor + second-floor area.
    * ``Age``          - squared difference between ``YrSold`` and ``YearBuilt``.
    * ``TotalBath``    - full baths plus half baths weighted by 0.5, above
      ground and basement combined.
    * ``QualxArea``    - ``OverallQual`` multiplied by ``GrLivArea``.
    * ``HasPool``      - 1 if ``PoolArea`` > 0, else 0.
    * ``HasFireplace`` - 1 if ``Fireplaces`` > 0, else 0.
    * ``LotFrontage``  - missing values filled with the median of the frame
      passed in, then log1p-transformed.

    Args:
        df: DataFrame containing at least the columns in ``required_columns``
            below. Extra columns (e.g. ``Id``, ``SalePrice``) are ignored.

    Returns:
        A new DataFrame holding only the selected model features, in a fixed
        column order.

    Raises:
        KeyError: If any required input column is absent from ``df``.
    """
    # Every raw column that is read below, either to derive a feature or to be
    # passed through unchanged.
    required_columns = [
        'OverallQual', 'Neighborhood', 'GrLivArea', 'TotalBsmtSF', 'GarageArea',
        'YearBuilt', 'GarageCars', 'BsmtQual', 'ExterQual', 'KitchenQual',
        'GarageYrBlt', 'MSSubClass', 'GarageFinish', 'YearRemodAdd',
        'TotRmsAbvGrd', 'LotFrontage', 'YrSold', '1stFlrSF', '2ndFlrSF',
        'FullBath', 'HalfBath', 'BsmtFullBath', 'BsmtHalfBath',
        'PoolArea', 'Fireplaces'
    ]
    missing = [col for col in required_columns if col not in df.columns]
    if missing:
        raise KeyError(f"create_features: missing required columns: {missing}")

    # Work on a private copy of just the needed columns so the caller's frame
    # is never mutated and no SettingWithCopyWarning is triggered.
    df = df[required_columns].copy()

    # Create new features
    df['TotalSF'] = np.log1p(df['TotalBsmtSF'] + df['1stFlrSF'] + df['2ndFlrSF'])
    df['Age'] = (df['YrSold'] - df['YearBuilt'])**2
    df['TotalBath'] = df['FullBath'] + 0.5*df['HalfBath'] + df['BsmtFullBath'] + 0.5*df['BsmtHalfBath']
    df['QualxArea'] = df['OverallQual'] * df['GrLivArea']
    df['HasPool'] = (df['PoolArea'] > 0).astype(int)
    df['HasFireplace'] = (df['Fireplaces'] > 0).astype(int)
    # NOTE: the fill value is the median of *this* frame, so train and test
    # frames are each imputed with their own median.
    df['LotFrontage'] = np.log1p(df['LotFrontage'].fillna(df['LotFrontage'].median()))

    # Final selected features
    selected_features = [
        'OverallQual', 'Neighborhood', 'GrLivArea', 'TotalBsmtSF', 'GarageArea',
        'YearBuilt', 'GarageCars', 'BsmtQual', 'ExterQual', 'KitchenQual',
        'GarageYrBlt', 'MSSubClass', 'GarageFinish', 'YearRemodAdd',
        'TotRmsAbvGrd', 'LotFrontage', 'TotalSF', 'Age',
        'TotalBath', 'QualxArea', 'HasPool', 'HasFireplace'
    ]

    return df[selected_features]

In [4]:
# Load the data
data = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')  # Replace with your actual file path
test = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')

# Define target variable (log transform); predictions are therefore in log
# space and must be mapped back with np.exp before being reported as prices.
y = np.log(data['SalePrice'])

# Build the feature matrix. create_features returns only its own selected
# columns, so 'SalePrice' and 'Id' never reach the model and no explicit
# drop is needed beforehand.
X = create_features(data)

# Split data (fixed seed so the hold-out split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Identify categorical and numerical columns.
# Every feature must land in exactly one of the two lists: ColumnTransformer
# drops any column it is not told about (remainder='drop' by default), so a
# column whose dtype is not literally 'int64'/'float64'/'object' (e.g. int32
# or a pandas string dtype) would otherwise vanish from the model silently.
numerical_cols = X_train.select_dtypes(include='number').columns.tolist()
categorical_cols = [col for col in X_train.columns if col not in numerical_cols]

from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import StackingRegressor

# --- Preprocessing for the Ridge model: impute, then scale / one-hot encode ---
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

# --- Preprocessing for RF and XGB: same imputation and encoding, no scaler ---
num_simple = Pipeline(steps=[('imputer', SimpleImputer(strategy='median'))])
cat_simple = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])
preprocessor_gb = ColumnTransformer(transformers=[
    ('num', num_simple, numerical_cols),
    ('cat', cat_simple, categorical_cols),
])


# Base learners: each one is a full pipeline (preprocessing + regressor).

# Ridge with the regularisation strength chosen by 5-fold CV over a log grid.
ridge_alphas = np.logspace(-3, 3, 50)
ridge_model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RidgeCV(alphas=ridge_alphas, cv=5)),
])

# Random forest on the unscaled preprocessing branch.
rf_params = dict(n_estimators=300, max_depth=10, min_samples_leaf=5, random_state=42)
rf_model = Pipeline([
    ('preprocessor', preprocessor_gb),
    ('regressor', RandomForestRegressor(**rf_params)),
])

# Gradient-boosted trees (XGBoost) on the unscaled preprocessing branch.
xgb_params = dict(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=5,
    subsample=0.8,
    random_state=42,
)
xgb_model = Pipeline([
    ('preprocessor', preprocessor_gb),
    ('regressor', XGBRegressor(**xgb_params)),
])

# Stack the three base learners; a RidgeCV meta-model blends their predictions.
base_learners = [
    ('ridge', ridge_model),
    ('rf', rf_model),
    ('xgb', xgb_model),
]
model = StackingRegressor(estimators=base_learners, final_estimator=RidgeCV())

# Train the stacked ensemble, then score it on the hold-out split.
# The target is log(SalePrice), so this RMSE is measured in log space.
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
holdout_mse = mean_squared_error(y_test, y_pred)
rmse_stack = np.sqrt(holdout_mse)
print(f"Stacked Ensemble RMSE: {rmse_stack:.4f}")

Stacked Ensemble RMSE: 0.1426


In [5]:
# y_test and y_pred are log(SalePrice) values (the target was log-transformed
# before the split), so the MAE/RMSE printed here are log-scale errors even
# though evaluate() prefixes them with "$".
evaluate(y_test, y_pred)

MAE: $0.094
RMSE: $0.143
R²: 0.891


In [6]:
# Build the Kaggle submission: engineer features for the test set, predict in
# log space, then invert the log transform to get prices.
X_submit = create_features(test).copy()
y_submit_log = model.predict(X_submit)
y_submit = np.exp(y_submit_log)  # inverse of the np.log applied to the target

test_ids = test['Id']
submission = pd.DataFrame({'Id': test_ids, 'SalePrice': y_submit})

output_path = 'submission.csv'
submission.to_csv(output_path, index=False)
print("Submission file created: submission.csv")

Submission file created: submission.csv


  result = getattr(ufunc, method)(*inputs, **kwargs)
