<a href="https://colab.research.google.com/github/DEB-PROSAD-SEN/Kaggle_competition/blob/main/House_price_prediction_advance.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.ensemble import StackingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor


In [2]:
train = pd.read_csv(r"/content/train (1).csv")
test = pd.read_csv(r"/content/test (1).csv")

train_ids = train["Id"]
test_ids = test["Id"]

y = np.log1p(train["SalePrice"])  # Log-transform target
train.drop(["Id", "SalePrice"], axis=1, inplace=True)
test.drop("Id", axis=1, inplace=True)


In [3]:
all_data = pd.concat([train, test], axis=0, sort=False).reset_index(drop=True)


In [14]:
# Total square feet
all_data["TotalSF"] = all_data["TotalBsmtSF"] + all_data["1stFlrSF"] + all_data["2ndFlrSF"]

# Total bathrooms
all_data["TotalBath"] = (all_data["FullBath"] + 0.5 * all_data["HalfBath"] +
                         all_data["BsmtFullBath"] + 0.5 * all_data["BsmtHalfBath"])

# Garage indicator
all_data["HasGarage"] = all_data["GarageType"].notnull().astype(int)

# Age features
all_data["HouseAge"] = all_data["YrSold"] - all_data["YearBuilt"]
all_data["RemodAge"] = all_data["YrSold"] - all_data["YearRemodAdd"]

# Overall quality metric
all_data["OverallGrade"] = all_data["OverallQual"] * all_data["OverallCond"]

# Drop redundant columns
all_data.drop(["Utilities"], axis=1, inplace=True)  # optional

KeyError: "['Utilities'] not found in axis"

In [12]:
numeric_features = all_data.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = all_data.select_dtypes(include=[object]).columns.tolist()

# Remove the target variable from numeric features if it exists
if 'SalePrice' in numeric_features:
    numeric_features.remove('SalePrice')

In [13]:
from sklearn.base import BaseEstimator, TransformerMixin

# Custom transformer for feature engineering
class FeatureEngineer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_transformed = X.copy()
        # Total square feet
        X_transformed["TotalSF"] = X_transformed["TotalBsmtSF"] + X_transformed["1stFlrSF"] + X_transformed["2ndFlrSF"]

        # Total bathrooms
        X_transformed["TotalBath"] = (X_transformed["FullBath"] + 0.5 * X_transformed["HalfBath"] +
                                     X_transformed["BsmtFullBath"] + 0.5 * X_transformed["BsmtHalfBath"])

        # Garage indicator
        X_transformed["HasGarage"] = X_transformed["GarageType"].notnull().astype(int)

        # Age features
        X_transformed["HouseAge"] = X_transformed["YrSold"] - X_transformed["YearBuilt"]
        X_transformed["RemodAge"] = X_transformed["YrSold"] - X_transformed["YearRemodAdd"]

        # Overall quality metric
        X_transformed["OverallGrade"] = X_transformed["OverallQual"] * X_transformed["OverallCond"]

        # Drop redundant columns (optional - depends on whether it's useful or not)
        if 'Utilities' in X_transformed.columns:
          X_transformed.drop(["Utilities"], axis=1, inplace=True)

        return X_transformed

# Numeric pipeline
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical pipeline
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Full preprocessor
from sklearn.compose import ColumnTransformer

# Update feature lists to include engineered features.
# Since the FeatureEngineer will be applied first, we need the feature lists
# to reflect the columns *after* feature engineering.
# However, we can't determine these definitively without fitting the transformer.
# A simpler approach is to use the column names from all_data after engineering
# but be aware this assumes the feature engineering doesn't drop or add columns
# in a way that makes the column names inconsistent between train and test after splitting.
# A more robust approach would involve fitting the FeatureEngineer on a small sample
# to get the transformed column names, but for this case, using all_data columns
# after engineering as a proxy should work.

# Re-defining numeric and categorical features after applying feature engineering
# to all_data to get the correct column names for the preprocessor.
# This assumes the feature engineering creates consistent columns for train/test.
temp_all_data_engineered = FeatureEngineer().fit_transform(all_data)
numeric_features_engineered = temp_all_data_engineered.select_dtypes(include=[np.number]).columns.tolist()
categorical_features_engineered = temp_all_data_engineered.select_dtypes(include=[object]).columns.tolist()

# Remove the target variable from numeric features if it exists
if 'SalePrice' in numeric_features_engineered:
    numeric_features_engineered.remove('SalePrice')


preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features_engineered),
    ('cat', categorical_transformer, categorical_features_engineered)
])

In [7]:
base_models = [
    ('ridge', Ridge(alpha=10)),
    ('lasso', Lasso(alpha=0.001)),
    ('elastic', ElasticNet(alpha=0.001, l1_ratio=0.7)),
    ('xgb', XGBRegressor(n_estimators=2000, learning_rate=0.05, max_depth=4, subsample=0.8, colsample_bytree=0.8, random_state=42)),
    ('lgb', LGBMRegressor(n_estimators=2000, learning_rate=0.05, max_depth=4, subsample=0.8, colsample_bytree=0.8, random_state=42))
]


In [8]:
stacked_model = StackingRegressor(
    estimators=base_models,
    final_estimator=Lasso(alpha=0.0005),
    cv=5,
    n_jobs=-1
)


In [15]:
model_pipeline = Pipeline(steps=[
    ('feature_engineer', FeatureEngineer()), # Add the feature engineering step
    ('preprocessor', preprocessor),
    ('regressor', stacked_model)
])

In [16]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model_pipeline, train, y, scoring='neg_root_mean_squared_error', cv=kf)
print("CV RMSE:", -scores.mean())



CV RMSE: 0.12926416181883782




In [17]:
model_pipeline.fit(train, y)


In [18]:
preds = model_pipeline.predict(test)
submission = pd.DataFrame({"Id": test_ids, "SalePrice": np.expm1(preds)})
submission.to_csv("submission.csv", index=False)


