# PREPROCESSING PIPELINE FOR LINEAR REGRESSION MODEL

Preprocessing pipeline that produces fully transformed (imputed, encoded, scaled) `X_train_final` and `X_test_final` DataFrames while keeping the original `X_train` and `X_test` intact.
Every transformer is fitted on the training split only and then applied to the test split, which avoids data leakage; all resulting new columns are then concatenated into the final DataFrames.

## 1. Imputation

In [None]:
from sklearn.impute import SimpleImputer
import pandas as pd

# --- NUMERICAL IMPUTATION (mean) ---
# Every int64/float64 column is imputed. The column means are learned from the
# training split and re-used unchanged for the test split.
num_cols = X_train.select_dtypes(include=["int64", "float64"]).columns
num_imputer = SimpleImputer(strategy="mean")

train_num_values = num_imputer.fit_transform(X_train[num_cols])
test_num_values = num_imputer.transform(X_test[num_cols])

# Re-attach column labels and the original row index to the raw arrays.
X_train_num = pd.DataFrame(train_num_values, index=X_train.index, columns=num_cols)
X_test_num = pd.DataFrame(test_num_values, index=X_test.index, columns=num_cols)

# --- CATEGORICAL IMPUTATION FOR state_of_building ---
# assign() returns a new frame, so X_train / X_test themselves stay untouched.
X_train_cat = X_train.assign(
    state_of_building=X_train["state_of_building"].fillna("unknown")
)
X_test_cat = X_test.assign(
    state_of_building=X_test["state_of_building"].fillna("unknown")
)


## 2. Encoding

### One Hot Encoding for 'type' column

In [None]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode "type". Categories are learned from the training split; with
# handle_unknown="ignore" a category seen only in the test split does not raise.
ohe_type = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

type_train = ohe_type.fit_transform(X_train_cat[["type"]])
type_test = ohe_type.transform(X_test_cat[["type"]])

# Both splits share the same generated column labels.
type_columns = ohe_type.get_feature_names_out(["type"])

type_train_df = pd.DataFrame(type_train, index=X_train.index, columns=type_columns)
type_test_df = pd.DataFrame(type_test, index=X_test.index, columns=type_columns)


### One Hot Encoding for "province" column

In [None]:
# One-hot encode "province": fit on the training split, re-use on the test split.
ohe_province = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

province_train = ohe_province.fit_transform(X_train_cat[["province"]])
province_test = ohe_province.transform(X_test_cat[["province"]])

# Build both frames the same way: encoded values + generated labels + the
# row index of the split the values came from.
province_labels = ohe_province.get_feature_names_out(["province"])
province_train_df, province_test_df = (
    pd.DataFrame(encoded, columns=province_labels, index=source.index)
    for encoded, source in ((province_train, X_train), (province_test, X_test))
)


### LabelEncoding for "subtype" column

In [None]:
from sklearn.preprocessing import LabelEncoder

import numpy as np

# Integer-encode "subtype". The codes are learned from the training split only.
le_subtype = LabelEncoder()

subtype_train = le_subtype.fit_transform(X_train_cat["subtype"])

# LabelEncoder.transform raises ValueError for labels it never saw during fit,
# which would abort the whole notebook as soon as the test split contains a
# subtype missing from the training split. Encode only the known labels and
# give every unseen label the sentinel code -1 (mirrors the tolerant
# handle_unknown="ignore" behaviour of the one-hot encoders in this notebook).
# When every test label is known the result is identical to a plain transform.
subtype_known = X_test_cat["subtype"].isin(le_subtype.classes_).to_numpy()
subtype_test = np.full(len(X_test_cat), -1, dtype=subtype_train.dtype)
if subtype_known.any():
    subtype_test[subtype_known] = le_subtype.transform(
        X_test_cat.loc[subtype_known, "subtype"]
    )

subtype_train_df = pd.DataFrame({"subtype_le": subtype_train}, index=X_train.index)
subtype_test_df = pd.DataFrame({"subtype_le": subtype_test}, index=X_test.index)


### OrdinalEncoder for "state_of_building" column

In [None]:
from sklearn.preprocessing import OrdinalEncoder
import pandas as pd

# Custom category order for state_of_building. A value's position in this list
# is the code the encoder assigns to it ("unknown" first, "New" last).
state_levels = [
    "unknown",
    "To demolish",
    "Under construction",
    "To restore",
    "To renovate",
    "To be renovated",
    "Normal",
    "Fully renovated",
    "Excellent",
    "New",
]
# OrdinalEncoder expects one list of categories per input column.
state_order = [state_levels]

ord_enc = OrdinalEncoder(categories=state_order)

# Fit on the training split, then apply the same mapping to the test split.
state_train = ord_enc.fit_transform(X_train_cat[["state_of_building"]])
state_test = ord_enc.transform(X_test_cat[["state_of_building"]])

# The encoder returns arrays of shape (n_rows, 1); ravel() turns them into the
# 1-D arrays a single DataFrame column needs.
state_train_df = pd.DataFrame({"state_oe": state_train.ravel()}, index=X_train.index)
state_test_df = pd.DataFrame({"state_oe": state_test.ravel()}, index=X_test.index)


## 3. Scaling (Standardization)

In [None]:
from sklearn.preprocessing import StandardScaler

# Numeric columns to standardise.
scale_cols = [
    "living_area (m²)",
    "number_of_bedrooms",
    "number_facades",
    "terrace_area (m²)",
]

scaler = StandardScaler()

# Scaled copies get a "_scaled" suffix so they can be told apart from the inputs.
scaled_names = [f"{col}_scaled" for col in scale_cols]

# Scaling statistics come from the (imputed) training split only.
train_scaled_values = scaler.fit_transform(X_train_num[scale_cols])
test_scaled_values = scaler.transform(X_test_num[scale_cols])

X_train_scaled = pd.DataFrame(train_scaled_values, index=X_train.index, columns=scaled_names)
X_test_scaled = pd.DataFrame(test_scaled_values, index=X_test.index, columns=scaled_names)


## 4. Build the final DataFrame

We combine:
* the imputed numerical columns
* encoded "type"
* encoded "province"
* encoded "subtype"
* encoded "state_of_building"
* scaled numerical columns
* ALL remaining original categorical columns (the original "state_of_building" is dropped by the code below once it is encoded; add it back if needed)

In [None]:
# Categorical columns that are replaced by an encoded counterpart.
encoded_cols = ["type", "province", "subtype", "state_of_building"]

# X_*_num already carries an imputed copy of EVERY numeric column, so all raw
# numeric columns must leave the base frame, not only scale_cols. Otherwise
# pd.concat would emit the same column label twice: once raw (possibly with
# NaNs) from the base frame and once imputed from X_*_num.
drop_cols = encoded_cols + list(X_train_num.columns)

X_train_base = X_train_cat.drop(columns=drop_cols)
X_test_base = X_test_cat.drop(columns=drop_cols)

# Columns that were scaled are represented by their "_scaled" version only;
# keeping the unscaled copy as well would add perfectly collinear features.
X_train_unscaled = X_train_num.drop(columns=scale_cols)
X_test_unscaled = X_test_num.drop(columns=scale_cols)

# Build final DataFrames (all parts share the row index of their split).
X_train_final = pd.concat([
    X_train_base,         # remaining, untouched original columns
    X_train_unscaled,     # imputed numericals that were not scaled
    type_train_df,
    province_train_df,
    subtype_train_df,
    state_train_df,
    X_train_scaled,
], axis=1)

X_test_final = pd.concat([
    X_test_base,
    X_test_unscaled,
    type_test_df,
    province_test_df,
    subtype_test_df,
    state_test_df,
    X_test_scaled,
], axis=1)

# Show result
X_train_final.head()


# SINGLE UNIFIED SKLEARN PIPELINE WITH A COLUMNTRANSFORMER 

This pipeline will automatically:
* impute numeric columns (mean)
* impute categorical columns (“state_of_building” → “unknown”)
* OrdinalEncode state_of_building using your custom order
* OneHotEncode type + province
* LabelEncode subtype (inside a custom transformer)
* scale selected numeric columns
* output a fully transformed NumPy array, which is then converted to a DataFrame with readable column names

This is the clean, robust, and reusable solution.

In [None]:
"""1) Custom LabelEncoder Transformer
Sklearn’s LabelEncoder only works for 1-D arrays, so we wrap it:"""

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder
import pandas as pd

class LabelEncoderTransformer(BaseEstimator, TransformerMixin):
    """Label-encode a single DataFrame column inside a Pipeline / ColumnTransformer.

    LabelEncoder itself only accepts 1-D input, so this wrapper selects
    ``column_name`` from the incoming DataFrame, encodes it and returns a
    one-column DataFrame named ``<column_name>_le``.

    Parameters
    ----------
    column_name : str
        Name of the column to encode. ``X`` passed to ``fit`` / ``transform``
        must be a DataFrame containing this column.

    Notes
    -----
    ``transform`` raises ``ValueError`` (from LabelEncoder) when ``X`` contains
    a label that was not present during ``fit``.
    """

    def __init__(self, column_name):
        self.column_name = column_name
        self.le = LabelEncoder()

    def fit(self, X, y=None):
        """Learn the label -> integer mapping from ``X[column_name]``; ``y`` is ignored."""
        self.le.fit(X[self.column_name])
        return self

    def transform(self, X):
        """Return a one-column DataFrame with the integer codes, indexed like ``X``."""
        encoded = self.le.transform(X[self.column_name])
        # Build the result directly instead of copying the whole input frame
        # only to select a single new column from it.
        return pd.DataFrame({self.column_name + "_le": encoded}, index=X.index)

    def get_feature_names_out(self, input_features=None):
        """Return the output column name as an object array.

        Defining this lets enclosing Pipeline / ColumnTransformer objects
        report feature names and use ``set_output``. ``input_features`` is
        accepted for API compatibility and ignored, because the output name
        depends only on ``column_name``.
        """
        return pd.Index([self.column_name + "_le"], dtype=object).to_numpy()


In [None]:
"""Define column groups"""
# Numeric feature columns.
num_cols = [
    "living_area (m²)",
    "number_of_bedrooms",
    "number_facades",
    "terrace_area (m²)",
]

# Nominal columns to one-hot encode.
ohe_cols = [
    "type",
    "province",
]

# Single-column groups are still lists so that column selection yields 2-D input.
ord_col = ["state_of_building"]
label_col = ["subtype"]


In [None]:
"""3. Create the ColumnTransformer and Full Pipeline"""
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.impute import SimpleImputer

# Ordered categories for state_of_building; a value's position is its ordinal
# code. The outer list holds one category list per encoded column.
state_order = [[
    "unknown",
    "To demolish",
    "Under construction",
    "To restore",
    "To renovate",
    "To be renovated",
    "Normal",
    "Fully renovated",
    "Excellent",
    "New",
]]

# ---- Per-group transformers ----

# Numeric columns: fill gaps with the column mean, then standardise.
num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
])

# type + province: one-hot encode, tolerating categories unseen during fit.
ohe_pipeline = Pipeline([
    ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
])

# state_of_building: missing -> "unknown", then encode with the custom order.
ord_pipeline = Pipeline([
    ("imputer", SimpleImputer(fill_value="unknown", strategy="constant")),
    ("ordinal", OrdinalEncoder(categories=state_order)),
])

# subtype: integer codes via the custom LabelEncoder wrapper.
label_pipeline = Pipeline([
    ("labelenc", LabelEncoderTransformer("subtype")),
])

# ---- Route each column group to its transformer ----

column_steps = [
    ("num", num_pipeline, num_cols),
    ("ohe", ohe_pipeline, ohe_cols),
    ("ord", ord_pipeline, ord_col),
    ("label", label_pipeline, label_col),
]
# Columns not named in any group are discarded.
preprocessor = ColumnTransformer(column_steps, remainder="drop")

# Full pipeline (currently the preprocessing step only).
full_pipeline = Pipeline([
    ("preprocessing", preprocessor),
])


In [None]:
"""4. Fit and Transform: This returns a NumPy array — we convert it to a DataFrame with correct column names."""
# Fit every preprocessing step on the training split and transform it in the same call.
X_train_transformed = full_pipeline.fit_transform(X_train)
# Apply the already-fitted steps to the test split (transform only, no refit).
X_test_transformed = full_pipeline.transform(X_test)

In [None]:
"""5. Output Column Names
We reconstruct readable names:"""
# Labels are assembled group by group: numeric, one-hot, ordinal, label-encoded.

# Numeric columns keep their original names after imputation + scaling.
num_features = num_cols

# The fitted OneHotEncoder generates one label per (column, category) pair.
fitted_preprocessor = full_pipeline.named_steps["preprocessing"]
fitted_ohe = fitted_preprocessor.named_transformers_["ohe"].named_steps["ohe"]
ohe_features = fitted_ohe.get_feature_names_out(ohe_cols)

# Single-column outputs of the ordinal and label encoders.
ord_features = ["state_of_building_oe"]
label_features = ["subtype_le"]

# Flatten the four groups into one list of column labels.
all_features = [*num_features, *ohe_features, *ord_features, *label_features]


In [None]:
"""6. Final DataFrames"""
# Wrap the transformed arrays in DataFrames, restoring each split's row index
# and attaching the reconstructed column labels.
X_train_final = pd.DataFrame(
    data=X_train_transformed,
    index=X_train.index,
    columns=all_features,
)
X_test_final = pd.DataFrame(
    data=X_test_transformed,
    index=X_test.index,
    columns=all_features,
)