### FEATURE ENGINEERING

In [None]:
import pandas as pd

def _numeric_or_zero(df: pd.DataFrame, column: str) -> pd.Series:
    """Return ``df[column]`` with NaN replaced by 0, or all zeros if it is absent.

    ``DataFrame.get(column, 0)`` hands back the bare integer ``0`` for a
    missing column, which has no ``.fillna``; this helper always returns a
    Series aligned to ``df.index`` so arithmetic works in both cases.
    """
    if column in df.columns:
        return df[column].fillna(0)
    return pd.Series(0, index=df.index)


def add_engineered_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with derived area, bath, age and flag columns.

    Source columns that are missing from ``df`` are treated as all zeros and
    NaN values are treated as 0, so the function works on partial frames.

    Columns added:
        TotalSF, TotalBath, TotalPorchSF, HouseAge, SinceRemodel,
        SinceGarage, HasPool, Has2ndFloor, HasBsmt, HasGarage, and
        OverallQual_GrLivArea (only when both source columns exist).

    Columns converted to strings when present:
        MSSubClass, MoSold, YrSold.

    Args:
        df: Input frame; it is not modified.

    Returns:
        A new DataFrame containing the original columns plus the features
        listed above.
    """
    df = df.copy()

    def col(name: str) -> pd.Series:
        return _numeric_or_zero(df, name)

    # Total square footage
    df["TotalSF"] = col("TotalBsmtSF") + col("1stFlrSF") + col("2ndFlrSF")

    # Total bathrooms (full + 0.5*half) including basement
    df["TotalBath"] = (
        col("FullBath")
        + 0.5 * col("HalfBath")
        + col("BsmtFullBath")
        + 0.5 * col("BsmtHalfBath")
    )

    # Porch and deck area
    df["TotalPorchSF"] = (
        col("OpenPorchSF")
        + col("EnclosedPorch")
        + col("3SsnPorch")
        + col("ScreenPorch")
        + col("WoodDeckSF")
    )

    # Age related features; the year columns are looked up once and reused.
    year_sold = col("YrSold")
    year_built = col("YearBuilt")
    df["HouseAge"] = year_sold - year_built
    df["SinceRemodel"] = year_sold - col("YearRemodAdd")

    # If garage year is missing (value or whole column), fall back to YearBuilt.
    if "GarageYrBlt" in df.columns:
        garage_year = df["GarageYrBlt"]
    else:
        garage_year = pd.Series(index=df.index, dtype=float)
    df["SinceGarage"] = year_sold - garage_year.fillna(year_built)

    # Binary indicators
    df["HasPool"] = (col("PoolArea") > 0).astype(int)
    df["Has2ndFloor"] = (col("2ndFlrSF") > 0).astype(int)
    df["HasBsmt"] = (col("TotalBsmtSF") > 0).astype(int)
    df["HasGarage"] = (col("GarageArea") > 0).astype(int)

    # Interactions
    if "OverallQual" in df.columns and "GrLivArea" in df.columns:
        df["OverallQual_GrLivArea"] = df["OverallQual"].fillna(0) * df["GrLivArea"].fillna(0)

    # Codes and sale month/year are treated as categorical, so store them as
    # strings. This must happen after the age features, which need numeric YrSold.
    for categorical in ("MSSubClass", "MoSold", "YrSold"):
        if categorical in df.columns:
            df[categorical] = df[categorical].astype(str)

    return df

### FILL NA

In [None]:
def fill_domain_na(df_all: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df_all`` with missing values filled by column-specific rules.

    Rules are applied in this order, each only when its columns exist:

    1. Listed categorical columns: NaN becomes the string ``"None"``.
    2. ``LotFrontage``: NaN becomes the median of the row's ``Neighborhood``.
       Observed values are never changed, including on rows whose
       ``Neighborhood`` is itself missing.
    3. ``GarageYrBlt``: NaN becomes the row's ``YearBuilt``.
    4. ``MasVnrArea``: NaN becomes 0 where ``MasVnrType`` is ``"None"``.
    5. A fixed default per column for a handful of categoricals.
    6. Anything still missing: the column median for numeric columns,
       ``"Unknown"`` for object columns.

    Args:
        df_all: Input frame; it is not modified.

    Returns:
        A new DataFrame with the fills applied.
    """
    df = df_all.copy()

    # Columns where NA means "None"
    none_cols = [
        "PoolQC", "MiscFeature", "Alley", "Fence", "FireplaceQu",
        "GarageType", "GarageFinish", "GarageQual", "GarageCond",
        "BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2",
        "MasVnrType",
    ]
    for col in none_cols:
        if col in df.columns:
            df[col] = df[col].fillna("None")

    # LotFrontage: fill by neighborhood median.
    # The per-group median is computed separately and passed to fillna so that
    # only missing entries are touched. Assigning a groupby-transform result
    # straight back would replace observed values with NaN on rows whose
    # Neighborhood is missing, because those rows belong to no group.
    if "LotFrontage" in df.columns and "Neighborhood" in df.columns:
        neighborhood_median = df.groupby("Neighborhood")["LotFrontage"].transform("median")
        df["LotFrontage"] = df["LotFrontage"].fillna(neighborhood_median)

    # Garage Yr Built: default to YearBuilt
    if "GarageYrBlt" in df.columns and "YearBuilt" in df.columns:
        df["GarageYrBlt"] = df["GarageYrBlt"].fillna(df["YearBuilt"])

    # MasVnrArea: 0 when type is None
    if "MasVnrArea" in df.columns and "MasVnrType" in df.columns:
        no_veneer_missing_area = df["MasVnrType"].eq("None") & df["MasVnrArea"].isna()
        df.loc[no_veneer_missing_area, "MasVnrArea"] = 0

    # Fixed defaults for a set of common categoricals
    mode_fills = {
        "MSZoning": "RL",
        "Functional": "Typ",
        "Electrical": "SBrkr",
        "KitchenQual": "TA",
        "Exterior1st": "VinylSd",
        "Exterior2nd": "VinylSd",
        "SaleType": "WD",
        "Utilities": "AllPub",
    }
    for col, default in mode_fills.items():
        if col in df.columns:
            df[col] = df[col].fillna(default)

    # Remaining numerics: median; remaining categoricals: "Unknown"
    numeric_columns = df.select_dtypes(include=[np.number]).columns.tolist()
    categorical_columns = df.select_dtypes(include=["object"]).columns.tolist()
    for col in numeric_columns:
        if df[col].isna().any():
            df[col] = df[col].fillna(df[col].median())
    for col in categorical_columns:
        if df[col].isna().any():
            df[col] = df[col].fillna("Unknown")

    return df


### CATEGORICAL MAPPING

In [None]:
def map_ordinal_categories(df_all: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df_all`` with ordered label columns encoded as integers.

    Each known column is translated through its label-to-rank table. Any
    value not found in the table (including NaN) becomes 0. Columns that are
    not listed, or not present in the frame, are left untouched.

    Args:
        df_all: Input frame; it is not modified.

    Returns:
        A new DataFrame in which the recognised columns hold ``int`` ranks.
    """
    quality_scale = {"None": 0, "Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5}
    exposure_scale = {"None": 0, "No": 0, "Mn": 1, "Av": 2, "Gd": 3}
    finish_scale = {
        "None": 0, "Unf": 1, "LwQ": 2, "Rec": 3, "BLQ": 4, "ALQ": 5, "GLQ": 6,
    }
    functional_scale = {
        "Sal": 1, "Sev": 2, "Maj2": 3, "Maj1": 4,
        "Mod": 5, "Min2": 6, "Min1": 7, "Typ": 8,
    }
    paved_scale = {"N": 0, "P": 1, "Y": 2}

    # Column name -> rank table to apply to it.
    scale_for_column = {
        "ExterQual": quality_scale,
        "ExterCond": quality_scale,
        "BsmtQual": quality_scale,
        "BsmtCond": quality_scale,
        "HeatingQC": quality_scale,
        "KitchenQual": quality_scale,
        "FireplaceQu": quality_scale,
        "GarageQual": quality_scale,
        "GarageCond": quality_scale,
        "PoolQC": quality_scale,
        "BsmtExposure": exposure_scale,
        "BsmtFinType1": finish_scale,
        "BsmtFinType2": finish_scale,
        "Functional": functional_scale,
        "PavedDrive": paved_scale,
    }

    encoded = df_all.copy()
    for column, scale in scale_for_column.items():
        if column not in encoded.columns:
            continue
        ranks = encoded[column].map(scale)
        # Labels missing from the table come back as NaN; score them as 0.
        encoded[column] = ranks.fillna(0).astype(int)

    return encoded

### PREPROCESSOR

In [None]:
def build_preprocessor(df_all: pd.DataFrame) -> ColumnTransformer:
    """Build an unfitted ColumnTransformer matched to the dtypes of ``df_all``.

    Numeric columns go through median imputation followed by a standardised
    Yeo-Johnson power transform. Object columns go through most-frequent
    imputation followed by dense one-hot encoding that ignores unknown
    categories.

    Args:
        df_all: Frame whose column dtypes decide which branch each column uses.

    Returns:
        A ColumnTransformer with a ``"num"`` and a ``"cat"`` branch.
    """
    # Some sklearn versions reject ``sparse_output`` with a TypeError; fall
    # back to the ``sparse`` keyword there.
    shared_encoder_kwargs = {"handle_unknown": "ignore"}
    try:
        one_hot = OneHotEncoder(sparse_output=False, **shared_encoder_kwargs)
    except TypeError:
        one_hot = OneHotEncoder(sparse=False, **shared_encoder_kwargs)

    numeric_steps = [
        ("imputer", SimpleImputer(strategy="median")),
        ("power", PowerTransformer(method="yeo-johnson", standardize=True)),
    ]
    categorical_steps = [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", one_hot),
    ]

    # Split columns by dtype: numbers to one branch, object columns to the other.
    number_columns = df_all.select_dtypes(include=[np.number]).columns.tolist()
    object_columns = df_all.select_dtypes(include=["object"]).columns.tolist()

    return ColumnTransformer(
        transformers=[
            ("num", Pipeline(steps=numeric_steps), number_columns),
            ("cat", Pipeline(steps=categorical_steps), object_columns),
        ]
    )