<a href="https://colab.research.google.com/github/DEB-PROSAD-SEN/Kaggle_competition/blob/main/House%20price%20prediction%202.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
!pip install catboost



In [14]:
!pip install Optuna



In [15]:
# ==== BLOCK 1: Imports & Setup ====
import numpy as np
import pandas as pd
from typing import List, Dict, Optional

from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_predict
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import make_scorer, mean_squared_log_error
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_selector as selector
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PowerTransformer, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectFromModel, RFE
from sklearn.decomposition import PCA
from sklearn.linear_model import Ridge, ElasticNet, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, StackingRegressor
from sklearn.compose import TransformedTargetRegressor

import warnings
warnings.filterwarnings("ignore")

# Optional libs (may not be installed)
try:
    import xgboost as xgb
except Exception:
    xgb = None

try:
    import lightgbm as lgb
except Exception:
    lgb = None

try:
    from catboost import CatBoostRegressor
except Exception:
    CatBoostRegressor = None

try:
    import optuna
except Exception:
    optuna = None

RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

def rmsle(y_true, y_pred):
    y_pred = np.maximum(y_pred, 0)
    return np.sqrt(mean_squared_log_error(y_true, y_pred))

rmsle_scorer = make_scorer(rmsle, greater_is_better=False)


In [16]:
# ==== BLOCK 2: Load & prune ====
FILE_PATH = r"/content/train (1).csv"  # adjust if needed
df = pd.read_csv(FILE_PATH)

# Drop columns with >50% missing values (robust baseline)
threshold = 0.5 * len(df)
df = df.dropna(thresh=threshold, axis=1).copy()

target_col = df.columns[-1]    # target is last column (SalePrice)
y = df[target_col].values
X = df.drop(columns=[target_col])
print("Data shape:", X.shape, "| Target:", target_col)


Data shape: (1460, 75) | Target: SalePrice


In [17]:
# ==== BLOCK 3: Outlier handling (Winsorization) ====
class Winsorizer(BaseEstimator, TransformerMixin):
    def __init__(self, lower=0.01, upper=0.99):
        self.lower = lower
        self.upper = upper
        self.bounds_ = {}

    def fit(self, X, y=None):
        X = X.copy()
        num_cols = X.select_dtypes(exclude=["object"]).columns
        for c in num_cols:
            lo = X[c].quantile(self.lower)
            hi = X[c].quantile(self.upper)
            self.bounds_[c] = (lo, hi)
        return self

    def transform(self, X):
        X = X.copy()
        for c, (lo, hi) in self.bounds_.items():
            if c in X.columns:
                X[c] = X[c].clip(lo, hi)
        return X


In [18]:
# ==== BLOCK 4: Feature Engineering ====

class FeatureEngineer(BaseEstimator, TransformerMixin):
    """ Adds domain features commonly helpful for Ames dataset. """
    def __init__(self):
        self.lf_by_nbhd_ = None

    def fit(self, X, y=None):
        X = X.copy()
        if "LotFrontage" in X.columns and "Neighborhood" in X.columns:
            self.lf_by_nbhd_ = X.groupby("Neighborhood")["LotFrontage"].median()
        return self

    def transform(self, X):
        X = X.copy()

        # Neighborhood-median impute for LotFrontage
        if self.lf_by_nbhd_ is not None and "LotFrontage" in X.columns:
            mask = X["LotFrontage"].isna()
            X.loc[mask, "LotFrontage"] = X.loc[mask, "Neighborhood"].map(self.lf_by_nbhd_)

        def has(col): return col in X.columns

        # Core totals & ages
        if all(has(c) for c in ["TotalBsmtSF", "1stFlrSF", "2ndFlrSF"]):
            X["TotalSF"] = X["TotalBsmtSF"].fillna(0) + X["1stFlrSF"].fillna(0) + X["2ndFlrSF"].fillna(0)
        if all(has(c) for c in ["YrSold","YearBuilt"]):
            X["HouseAge"] = X["YrSold"] - X["YearBuilt"]
        if all(has(c) for c in ["YrSold","YearRemodAdd"]):
            X["RemodAge"] = X["YrSold"] - X["YearRemodAdd"]
        if all(has(c) for c in ["YrSold","GarageYrBlt"]):
            X["GarageAge"] = X["YrSold"] - X["GarageYrBlt"].fillna(X.get("YearBuilt", X["YrSold"]))

        # Bathrooms
        if all(has(c) for c in ["FullBath","HalfBath","BsmtFullBath","BsmtHalfBath"]):
            X["TotalBath"] = (X["FullBath"].fillna(0) + 0.5*X["HalfBath"].fillna(0) +
                              X["BsmtFullBath"].fillna(0) + 0.5*X["BsmtHalfBath"].fillna(0))

        # Porches & decks
        porch_cols = [c for c in ["OpenPorchSF","EnclosedPorch","3SsnPorch","ScreenPorch","WoodDeckSF"] if has(c)]
        if porch_cols:
            X["TotalPorchSF"] = X[porch_cols].fillna(0).sum(axis=1)

        # Flags
        if has("GarageArea"):
            X["HasGarage"] = (X["GarageArea"].fillna(0) > 0).astype(int)
        if has("TotalBsmtSF"):
            X["HasBsmt"] = (X["TotalBsmtSF"].fillna(0) > 0).astype(int)
        if has("Fireplaces"):
            X["HasFireplace"] = (X["Fireplaces"].fillna(0) > 0).astype(int)

        # Interactions / Ratios
        if has("GrLivArea") and has("OverallQual"):
            X["QualGrLivArea"] = X["GrLivArea"].fillna(0) * X["OverallQual"].fillna(0)
        if has("BedroomAbvGr") and has("TotalBath"):
            X["BathPerBedroom"] = X["TotalBath"].replace(0, np.nan) / X["BedroomAbvGr"].replace(0, np.nan)

        # Cyclic month features
        if "MoSold" in X.columns:
            X["MoSold_sin"] = np.sin(2*np.pi*X["MoSold"]/12.0)
            X["MoSold_cos"] = np.cos(2*np.pi*X["MoSold"]/12.0)

        return X


class OrdinalMapper(BaseEstimator, TransformerMixin):
    """ Map ordered quality categories to integers. """
    def __init__(self):
        self.qual = {"None":0, "Po":1, "Fa":2, "TA":3, "Gd":4, "Ex":5}
        self.exposure = {"None":0, "No":1, "Mn":2, "Av":3, "Gd":4}
        self.bsmtfin = {"None":0, "Unf":1, "LwQ":2, "Rec":3, "BLQ":4, "ALQ":5, "GLQ":6}
        self.paved = {"N":0, "P":1, "Y":2}

    def _map(self, X, col, mapping):
        if col in X.columns:
            X[col] = X[col].fillna("None").map(mapping).astype("float64")

    def fit(self, X, y=None): return self

    def transform(self, X):
        X = X.copy()
        for c in ["ExterQual","ExterCond","BsmtQual","BsmtCond",
                  "HeatingQC","KitchenQual","FireplaceQu","GarageQual","GarageCond","PoolQC"]:
            self._map(X, c, self.qual)
        self._map(X, "BsmtExposure", self.exposure)
        for c in ["BsmtFinType1","BsmtFinType2"]:
            self._map(X, c, self.bsmtfin)
        self._map(X, "PavedDrive", self.paved)
        return X


class RareGrouper(BaseEstimator, TransformerMixin):
    """ Replace infrequent categories with 'Rare' to stabilize OHE. """
    def __init__(self, min_count=20):
        self.min_count = min_count
        self.keep_: Dict[str, set] = {}

    def fit(self, X, y=None):
        X = X.copy()
        for c in X.select_dtypes(include=["object"]).columns:
            vc = X[c].value_counts(dropna=False)
            self.keep_[c] = set(vc[vc >= self.min_count].index.astype(str))
        return self

    def transform(self, X):
        X = X.copy()
        for c, keep in self.keep_.items():
            if c in X.columns:
                X[c] = X[c].astype(str)
                X.loc[~X[c].isin(keep), c] = "Rare"
        return X


class SkewFixer(BaseEstimator, TransformerMixin):
    """ Log1p-transform skewed numeric features (except Year*, MoSold, YrSold). """
    def __init__(self, skew_threshold=0.75):
        self.skew_threshold = skew_threshold
        self.to_log_: List[str] = []

    def fit(self, X, y=None):
        X = X.copy()
        num_cols = X.select_dtypes(exclude=["object"]).columns
        exclude = [c for c in num_cols if c.startswith("Year")] + ["MoSold","YrSold"]
        cand = [c for c in num_cols if c not in exclude]
        skewness = X[cand].fillna(0).apply(lambda s: s.skew())
        self.to_log_ = [c for c, sk in skewness.items() if sk > self.skew_threshold]
        return self

    def transform(self, X):
        X = X.copy()
        for c in self.to_log_:
            if c in X.columns:
                X[c] = np.log1p(X[c].clip(lower=0))
        return X


In [19]:
# ==== BLOCK 5: Preprocess + Feature Selection ====
winsor = Winsorizer(0.01, 0.99)
fe = FeatureEngineer()
ordmap = OrdinalMapper()
rares = RareGrouper(min_count=20)
skewfix = SkewFixer(0.75)

numeric_proc = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
    ("power", PowerTransformer(method="yeo-johnson", standardize=False)),
])

categorical_proc = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore")),
])

pre = ColumnTransformer(
    transformers=[
        ("num", numeric_proc, selector(dtype_exclude=["object"])),
        ("cat", categorical_proc, selector(dtype_include=["object"])),
    ],
    remainder="drop",
    verbose_feature_names_out=False
)

# SelectFromModel (L1)
l1_selector = SelectFromModel(Lasso(alpha=0.01, max_iter=5000, random_state=RANDOM_STATE), threshold="median")

# RFE with Ridge
rfe_selector = RFE(estimator=Ridge(alpha=10.0, random_state=RANDOM_STATE), n_features_to_select=200, step=0.2)

# Optional PCA (after OHE) — keep moderate components
pca = PCA(n_components=200, random_state=RANDOM_STATE)

# End-to-end preprocessing with togglable selectors (we'll choose per-model)
base_pre = Pipeline(steps=[
    ("winsor", winsor),
    ("fe", fe),
    ("ord", ordmap),
    ("rare", rares),
    ("skew", skewfix),
    ("pre", pre),
    # Placeholders to be swapped per model:
    # ("sel", l1_selector) or ("sel", rfe_selector) or ("sel", "passthrough")
    # ("pca", pca) or "passthrough"
])

In [20]:
# ==== BLOCK 6: Target transform + CV ====
# We'll wrap each regressor with TransformedTargetRegressor to train on log1p target
def wrap_ttr(reg):
    return TransformedTargetRegressor(regressor=reg,
                                      func=np.log1p,
                                      inverse_func=np.expm1)

# StratifiedKFold on binned (log) target for balanced folds
y_bins = pd.qcut(np.log1p(y), q=10, labels=False, duplicates="drop")
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)


In [21]:
# ==== BLOCK 7: Models & grids ====
models = {}

# Linear / Elastic
models["Ridge_l1sel"] = Pipeline([("pre", clone(base_pre)), ("sel", l1_selector), ("reg", wrap_ttr(Ridge(random_state=RANDOM_STATE)))])
models["Elastic_rfe"] = Pipeline([("pre", clone(base_pre)), ("sel", rfe_selector), ("reg", wrap_ttr(ElasticNet(max_iter=5000, random_state=RANDOM_STATE)))])

# Tree ensembles (selector often "passthrough")
models["RF"] = Pipeline([("pre", clone(base_pre)), ("sel", "passthrough"), ("reg", wrap_ttr(RandomForestRegressor(random_state=RANDOM_STATE)) )])
models["GB"] = Pipeline([("pre", clone(base_pre)), ("sel", "passthrough"), ("reg", wrap_ttr(GradientBoostingRegressor(random_state=RANDOM_STATE)) )])

# Optional boosters
if xgb is not None:
    models["XGB"] = Pipeline([("pre", clone(base_pre)), ("sel", "passthrough"),
                              ("reg", wrap_ttr(xgb.XGBRegressor(
                                  objective="reg:squarederror", random_state=RANDOM_STATE, tree_method="hist"
                              )))])
if lgb is not None:
    models["LGBM"] = Pipeline([("pre", clone(base_pre)), ("sel", "passthrough"),
                               ("reg", wrap_ttr(lgb.LGBMRegressor(random_state=RANDOM_STATE)))])
if CatBoostRegressor is not None:
    models["CAT"] = Pipeline([("pre", clone(base_pre)), ("sel", "passthrough"),
                              ("reg", wrap_ttr(CatBoostRegressor(
                                  loss_function="RMSE", random_state=RANDOM_STATE, verbose=0
                              )))])


In [22]:
# ==== BLOCK 8: Hyperparameter tuning ====
results = {}
best_pipes = {}

def evaluate_pipe(name, pipe, param_grid=None, search="grid", n_iter=25):
    if param_grid is None:
        # default safe params
        param_grid = {}
    if search == "random":
        searcher = RandomizedSearchCV(pipe, param_distributions=param_grid, n_iter=n_iter,
                                      scoring=rmsle_scorer, cv=cv, n_jobs=-1, verbose=1, random_state=RANDOM_STATE)
    else:
        searcher = GridSearchCV(pipe, param_grid=param_grid, scoring=rmsle_scorer, cv=cv, n_jobs=-1, verbose=1)
    searcher.fit(X, y_bins)  # use y_bins only for CV splitting; scoring uses y internally
    # we need to refit with original y for scoring — GridSearchCV already refit on full X,y (wrapped TTR handles y)
    best_rmsle = -searcher.best_score_
    results[name] = {"Best RMSLE": best_rmsle, "Best Params": searcher.best_params_}
    best_pipes[name] = searcher.best_estimator_

# Param grids
grid_ridge = {
    "sel": [l1_selector, "passthrough"],
    "reg__regressor__alpha": [0.1, 1.0, 10.0, 50.0],
}
grid_elastic = {
    "sel": [rfe_selector, l1_selector, "passthrough"],
    "reg__regressor__alpha": [0.001, 0.01, 0.1, 1.0],
    "reg__regressor__l1_ratio": [0.2, 0.5, 0.8],
}
grid_rf = {
    "reg__regressor__n_estimators": [400, 700],
    "reg__regressor__max_depth": [None, 12, 20],
    "reg__regressor__min_samples_leaf": [1, 3, 5],
}
grid_gb = {
    "reg__regressor__n_estimators": [400, 800],
    "reg__regressor__learning_rate": [0.03, 0.05, 0.1],
    "reg__regressor__max_depth": [2, 3, 5],
    "reg__regressor__subsample": [0.8, 1.0],
    "reg__regressor__min_samples_leaf": [1, 3, 5],
}

# Run searches
evaluate_pipe("Ridge_l1sel", models["Ridge_l1sel"], grid_ridge, search="grid")
evaluate_pipe("Elastic_rfe", models["Elastic_rfe"], grid_elastic, search="grid")
evaluate_pipe("RF", models["RF"], grid_rf, search="random", n_iter=16)
evaluate_pipe("GB", models["GB"], grid_gb, search="grid")

# XGB/LGBM/CAT
if xgb is not None:
    grid_xgb = {
        "reg__regressor__n_estimators": [800, 1200],
        "reg__regressor__max_depth": [3, 4, 5],
        "reg__regressor__learning_rate": [0.03, 0.05, 0.1],
        "reg__regressor__subsample": [0.8, 1.0],
        "reg__regressor__colsample_bytree": [0.6, 0.8, 1.0]
    }
    evaluate_pipe("XGB", models["XGB"], grid_xgb, search="random", n_iter=18)

if lgb is not None and optuna is not None:
    # Optuna tuning for LGBM
    def objective(trial):
        params = {
            "n_estimators": trial.suggest_int("n_estimators", 600, 1400),
            "num_leaves": trial.suggest_int("num_leaves", 16, 64),
            "max_depth": trial.suggest_int("max_depth", 3, 8),
            "learning_rate": trial.suggest_float("learning_rate", 0.02, 0.12),
            "subsample": trial.suggest_float("subsample", 0.7, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
            "min_child_samples": trial.suggest_int("min_child_samples", 5, 25),
            "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 0.5),
            "reg_lambda": trial.suggest_float("reg_lambda", 0.0, 0.5),
            "random_state": RANDOM_STATE
        }
        pipe = clone(models["LGBM"])
        # Correctly set parameters for the regressor within the pipeline
        pipe.set_params(**{"reg__regressor__" + key: value for key, value in params.items()})
        # cross_val_predict for scoring with original y
        preds = cross_val_predict(pipe, X, y, cv=cv, n_jobs=-1, verbose=0, method="predict")
        return rmsle(y, preds)

    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=25, show_progress_bar=False)
    lgb_best = clone(models["LGBM"]).set_params(**{"reg__regressor__" + key: value for key, value in study.best_params.items()})
    preds = cross_val_predict(lgb_best, X, y, cv=cv, n_jobs=-1, verbose=0, method="predict")
    results["LGBM_Optuna"] = {"Best RMSLE": rmsle(y, preds), "Best Params": study.best_params}
    best_pipes["LGBM_Optuna"] = lgb_best
elif lgb is not None:
    grid_lgb = {
        "reg__regressor__n_estimators": [800, 1200],
        "reg__regressor__learning_rate": [0.03, 0.06, 0.1],
        "reg__regressor__max_depth": [3, 5, 7],
        "reg__regressor__num_leaves": [31, 47, 63],
        "reg__regressor__subsample": [0.8, 1.0],
        "reg__regressor__colsample_bytree": [0.7, 1.0]
    }
    evaluate_pipe("LGBM", models["LGBM"], grid_lgb, search="random", n_iter=18)

if CatBoostRegressor is not None:
    grid_cat = {
        "reg__regressor__depth": [4, 6, 8],
        "reg__regressor__learning_rate": [0.03, 0.06, 0.1],
        "reg__regressor__n_estimators": [800, 1200],
        "reg__regressor__l2_leaf_reg": [3, 5, 7],
    }
    evaluate_pipe("CAT", models["CAT"], grid_cat, search="random", n_iter=16)

# Show tuning summary
summary = (pd.DataFrame(results).T).sort_values("Best RMSLE")
print("\n==== MODEL RANKING (lower RMSLE is better) ====")
print(summary)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
Fitting 5 folds for each of 36 candidates, totalling 180 fits
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Fitting 5 folds for each of 108 candidates, totalling 540 fits
Fitting 5 folds for each of 18 candidates, totalling 90 fits


[I 2025-09-04 14:37:22,682] A new study created in memory with name: no-name-278b0750-a727-437c-85e9-36bcae2b5148
[I 2025-09-04 14:37:32,691] Trial 0 finished with value: 0.1357705370165879 and parameters: {'n_estimators': 1001, 'num_leaves': 18, 'max_depth': 6, 'learning_rate': 0.1102019448431262, 'subsample': 0.994793153776766, 'colsample_bytree': 0.7360521790854737, 'min_child_samples': 5, 'reg_alpha': 0.36876671003369427, 'reg_lambda': 0.3945170901736121}. Best is trial 0 with value: 0.1357705370165879.
[I 2025-09-04 14:37:42,667] Trial 1 finished with value: 0.13611743415545083 and parameters: {'n_estimators': 883, 'num_leaves': 59, 'max_depth': 8, 'learning_rate': 0.02830572843895605, 'subsample': 0.8199177460650445, 'colsample_bytree': 0.9085486314727231, 'min_child_samples': 12, 'reg_alpha': 0.3270583679868257, 'reg_lambda': 0.019609910502172423}. Best is trial 0 with value: 0.1357705370165879.
[I 2025-09-04 14:37:53,654] Trial 2 finished with value: 0.1341701576545262 and para

Fitting 5 folds for each of 16 candidates, totalling 80 fits

==== MODEL RANKING (lower RMSLE is better) ====
            Best RMSLE                                        Best Params
LGBM_Optuna   0.130172  {'n_estimators': 718, 'num_leaves': 35, 'max_d...
CAT           0.244365  {'reg__regressor__n_estimators': 1200, 'reg__r...
XGB           0.248817  {'reg__regressor__subsample': 0.8, 'reg__regre...
GB            0.251124  {'reg__regressor__learning_rate': 0.05, 'reg__...
RF            0.274468  {'reg__regressor__n_estimators': 400, 'reg__re...
Elastic_rfe   0.277408  {'reg__regressor__alpha': 0.001, 'reg__regress...
Ridge_l1sel   0.278795  {'reg__regressor__alpha': 10.0, 'sel': 'passth...


In [None]:
# ==== BLOCK 9: Stacking the best models ====
# pick top 3-4 models by RMSLE
top_names = list((pd.DataFrame(results).T).sort_values("Best RMSLE").index)[:4]
estimators = [(n, best_pipes[n]) for n in top_names if n in best_pipes]

# Meta model (Ridge on log target via TTR)
meta = wrap_ttr(Ridge(alpha=1.0, random_state=RANDOM_STATE))

stack = StackingRegressor(
    estimators=estimators,
    final_estimator=meta,
    passthrough=False,
    n_jobs=-1
)

# CV estimate for stack
stack_preds = cross_val_predict(stack, X, y, cv=cv, n_jobs=-1, verbose=0, method="predict")
stack_rmsle = rmsle(y, stack_preds)
print(f"\nSTACK ({', '.join([n for n,_ in estimators])}) RMSLE (CV): {stack_rmsle:.6f}")


In [None]:
# ==== BLOCK 10: Final fit & quick predictions ====
# choose the single best model or the stack (whichever is better)
best_single_name = summary.index[0]
best_single_score = summary.iloc[0]["Best RMSLE"]
use_stack = stack_rmsle < best_single_score

if use_stack:
    final_model = stack
    model_name = "STACK"
    final_model.fit(X, y)
else:
    model_name = best_single_name
    final_model = best_pipes[best_single_name]
    final_model.fit(X, y)

print(f"\nFinal chosen model: {model_name}")
preds_train = final_model.predict(X)
print("First 10 predictions:", np.round(preds_train[:10], 2))
print("Final RMSLE on full training (optimistic):", rmsle(y, preds_train))


In [25]:
# ==== BLOCK 11: Test Prediction & Submission ====
# Load test set
TEST_FILE_PATH = r"/content/test (1).csv"   # adjust path
df_test = pd.read_csv(TEST_FILE_PATH)

# Keep Id for submission
test_ids = df_test["Id"].values if "Id" in df_test.columns else np.arange(len(df_test))

# Align columns (if some engineered features don't exist in test, pipeline will create them)
# Just ensure df_test has the same base structure as training
X_test = df_test.copy()

# Predict using the trained pipeline
test_preds = final_model.predict(X_test)

# Clip to avoid negatives (for prices)
test_preds = np.maximum(test_preds, 0)

# Build submission DataFrame
sub = pd.DataFrame({
    "Id": test_ids,
    target_col: np.round(test_preds, 2)   # round if needed
})

# Save submission file
OUTPUT_PATH = "submission.csv"
sub.to_csv(OUTPUT_PATH, index=False)

print(f"\n✅ Submission file saved: {OUTPUT_PATH}")
print(sub.head())



✅ Submission file saved: submission.csv
     Id  SalePrice
0  1461  116537.23
1  1462  158022.13
2  1463  190392.18
3  1464  196491.45
4  1465  189646.66
