<a href="https://colab.research.google.com/github/DEB-PROSAD-SEN/Kaggle_competition/blob/main/House_price_prediction_latest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [105]:
import numpy as np
import pandas as pd
from scipy.stats import skew

from sklearn.base import clone
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.ensemble import StackingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor


In [107]:
# Read the raw test and train CSV files from the session's /content directory.
test = pd.read_csv(r"/content/test (1).csv")
train = pd.read_csv(r"/content/train (1).csv")

# Keep the row identifiers; test_ids is written into the submission file later.
train_ids, test_ids = train["Id"], test["Id"]

# The models are trained on log1p(SalePrice); predictions are mapped back with expm1.
y = np.log1p(train["SalePrice"])

# Remove identifier/target columns so both frames contain predictors only.
train = train.drop(columns=["Id", "SalePrice"])
test = test.drop(columns=["Id"])


In [108]:
# Training rows with GrLivArea of 4500 or more are excluded; the same index
# labels are removed from the target so features and target stay aligned.
outlier_idx = train.index[train["GrLivArea"] >= 4500]
train = train.drop(index=outlier_idx).reset_index(drop=True)
y = y.drop(index=outlier_idx).reset_index(drop=True)


In [109]:
# Stack train on top of test (fresh 0..n-1 index) so feature engineering is
# applied to both in one pass; the frames are split apart again by row count.
all_data = pd.concat([train, test], ignore_index=True, sort=False)


In [110]:
# Feature engineering on the combined train+test frame.
#
# Derived features are built from zero-filled copies of their inputs so that a
# single missing component (e.g. a NaN basement area) does not turn the whole
# derived value into NaN (which the later zero-fill would then report as 0).
# The source columns themselves are not modified here.
component_cols = [
    "TotalBsmtSF", "1stFlrSF", "2ndFlrSF",
    "FullBath", "HalfBath", "BsmtFullBath", "BsmtHalfBath",
    "GarageCars", "GarageArea",
]
components = all_data[component_cols].fillna(0)

# Total square feet
all_data["TotalSF"] = (components["TotalBsmtSF"] + components["1stFlrSF"]
                       + components["2ndFlrSF"])

# Total bathrooms (half baths count as 0.5)
all_data["TotalBath"] = (components["FullBath"] + 0.5 * components["HalfBath"] +
                         components["BsmtFullBath"] + 0.5 * components["BsmtHalfBath"])

# Garage indicator
all_data["HasGarage"] = all_data["GarageType"].notnull().astype(int)

# Age features. A sale year earlier than the build/remodel year would give a
# negative age (and, at -1, a zero denominator below), so ages are floored at 0.
all_data["HouseAge"] = (all_data["YrSold"] - all_data["YearBuilt"]).clip(lower=0)
all_data["RemodAge"] = (all_data["YrSold"] - all_data["YearRemodAdd"]).clip(lower=0)

# Overall quality metric
all_data["OverallGrade"] = all_data["OverallQual"] * all_data["OverallCond"]

# Interaction features, each capped at a fixed upper bound.
# HouseAge >= 0 here, so the denominator is always at least 1.
all_data["GrLivArea_OverallQual"] = (
    all_data["GrLivArea"] * all_data["OverallQual"]
).clip(upper=50000)
all_data["TotalSF_HouseAge"] = (
    all_data["TotalSF"] / (all_data["HouseAge"] + 1)
).clip(upper=5000)
all_data["GarageCars_GarageArea"] = (
    components["GarageCars"] * components["GarageArea"]
).clip(upper=2000)

# Drop less useful columns (no-op if the column is already gone)
all_data = all_data.drop(columns=["Utilities"], errors="ignore")


In [111]:
# Skew correction: log1p-transform every numeric column whose sample skewness
# (computed on the non-missing values of the combined frame) exceeds 0.75 in
# absolute value.
numeric_feats = all_data.select_dtypes(include=[np.number]).columns
skewness = all_data[numeric_feats].apply(lambda col: skew(col.dropna()))
skewed_feats = skewness[skewness.abs() > 0.75].index

# Floor at 0 before log1p so negative values cannot produce NaN/-inf.
# Series.clip is vectorised and leaves missing values as NaN, matching the
# previous per-element max(x, 0) without a Python-level call per row.
for feat in skewed_feats:
    all_data[feat] = np.log1p(all_data[feat].clip(lower=0))


In [112]:
# Cast every object-dtype column to str in one call.
# NOTE(review): this presumably turns missing values into the literal string
# "nan", making them their own category for the one-hot encoder — confirm on
# the pandas version in use.
categorical_features = all_data.select_dtypes(include=[object]).columns
all_data = all_data.astype({name: str for name in categorical_features})


In [113]:
# Split the combined frame back into its train and test parts by row count.
n_train = len(train)

# Replace inf/-inf with NaN, then fill every remaining NaN with 0.
# NOTE(review): because NaNs are zero-filled here, the SimpleImputer steps in
# the preprocessing pipeline receive no missing numeric values.
_non_finite = [np.inf, -np.inf]
train_features = all_data.iloc[:n_train, :].replace(_non_finite, np.nan).fillna(0)
test_features = all_data.iloc[n_train:, :].replace(_non_finite, np.nan).fillna(0)


In [114]:
# Column labels of every numeric predictor; routed to the numeric transformer.
numeric_features = train_features.select_dtypes(include="number").columns


In [115]:
# Numeric columns: median imputation followed by standardisation.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: most-frequent imputation followed by one-hot encoding;
# categories not seen during fit are ignored (encoded as all zeros) at predict time.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group to its transformer.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])


In [116]:
# Two sets of base learners for the two stacked ensembles. Each set holds three
# regularised linear models plus XGBoost and LightGBM; the second set uses
# different regularisation strengths and more, slower-learning, deeper trees.
#
# LightGBM only applies `subsample` (row bagging) when `subsample_freq` is > 0,
# so it is set to 1 (re-sample every iteration) to make the configured
# subsample fraction actually take effect.
base_models_1 = [
    ('ridge', Ridge(alpha=10)),
    ('lasso', Lasso(alpha=0.001)),
    ('elastic', ElasticNet(alpha=0.001, l1_ratio=0.7)),
    ('xgb', XGBRegressor(n_estimators=2000, learning_rate=0.05, max_depth=4,
                         subsample=0.8, colsample_bytree=0.8, random_state=42)),
    ('lgb', LGBMRegressor(n_estimators=2000, learning_rate=0.05, max_depth=4,
                          subsample=0.8, subsample_freq=1,
                          colsample_bytree=0.8, random_state=42))
]

base_models_2 = [
    ('ridge', Ridge(alpha=15)),
    ('lasso', Lasso(alpha=0.0005)),
    ('elastic', ElasticNet(alpha=0.0005, l1_ratio=0.6)),
    ('xgb', XGBRegressor(n_estimators=2500, learning_rate=0.03, max_depth=5,
                         subsample=0.9, colsample_bytree=0.9, random_state=42)),
    ('lgb', LGBMRegressor(n_estimators=2500, learning_rate=0.03, max_depth=5,
                          subsample=0.9, subsample_freq=1,
                          colsample_bytree=0.9, random_state=42))
]


In [117]:
# Settings shared by both stacks: 5-fold out-of-fold predictions for the
# meta-learner, with base models fitted in parallel on all available cores.
_stacking_options = {"cv": 5, "n_jobs": -1}

# Each stack feeds its base models' predictions into a Lasso meta-learner.
stacked_model_1 = StackingRegressor(
    estimators=base_models_1,
    final_estimator=Lasso(alpha=0.0005),
    **_stacking_options,
)
stacked_model_2 = StackingRegressor(
    estimators=base_models_2,
    final_estimator=Lasso(alpha=0.0003),
    **_stacking_options,
)


In [118]:
# End-to-end pipelines: preprocessing followed by a stacked regressor.
# Pipeline does not copy its steps, so each pipeline gets its own unfitted
# clone of the preprocessor; otherwise fitting one pipeline would refit the
# transformer object used inside the other.
pipeline_1 = Pipeline([('preprocessor', clone(preprocessor)), ('regressor', stacked_model_1)])
pipeline_2 = Pipeline([('preprocessor', clone(preprocessor)), ('regressor', stacked_model_2)])


In [119]:
# Fit both pipelines, in order, on the training features and the log1p target.
for _pipeline in (pipeline_1, pipeline_2):
    _pipeline.fit(train_features, y)


In [120]:
# Predict on the log1p scale with each pipeline and blend with equal weights.
preds_1, preds_2 = (pipe.predict(test_features) for pipe in (pipeline_1, pipeline_2))
final_preds = 0.5 * (preds_1 + preds_2)

# Undo the log1p target transform and write the submission file.
submission = pd.DataFrame({"Id": test_ids}).assign(SalePrice=np.expm1(final_preds))
submission.to_csv("submission_upgraded_fixed.csv", index=False)


