### Basic setup + Load Dataset

In [29]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [30]:
# Read the cleaned property listings from the project's raw-data folder and preview the first five rows
df = pd.read_csv(filepath_or_buffer="../data/raw/filtered_final_cleaned_data.csv")
df.head(n=5)


Unnamed: 0,property_ID,locality_name,postal_code,type,subtype,price (€),number_of_bedrooms,living_area (m²),"equiped_kitchen (yes:1, no:0)","furnished (yes:1, no:0)","open_fire (yes:1, no:0)","terrace (yes:1, no:0)",terrace_area (m²),"garden (yes:1, no:0)",number_facades,"swimming_pool (yes:1, no:0)",state_of_building,province
0,RBU60880,Maurits Sabbestraat 4 202,2800,Apartment,Apartment,329000,3.0,104,0,0,0,1,,0,2.0,0,Excellent,Antwerp
1,RBU61001,Nieuwstraat 13,2200,House,Residence,425000,3.0,378,0,0,0,1,,1,2.0,0,To be renovated,Antwerp
2,RBU62593,Veerstraat,2840,Apartment,Apartment,264700,1.0,69,0,0,0,1,25.0,0,,0,,Antwerp
3,RBU60705,Winkelomseheide 158,2440,Apartment,Apartment,290000,2.0,95,0,0,0,1,15.0,1,2.0,0,New,Antwerp
4,RBU60944,Generaal van der Meerschstraat 85 2,2300,Apartment,Apartment,180000,2.0,88,0,0,0,1,2.0,0,2.0,0,Normal,Antwerp


In [31]:
# Count the missing values (NaN) in every column to see which features need imputation
df.isnull().sum()

property_ID                         0
locality_name                    2328
postal_code                         0
type                                0
subtype                             0
price (€)                           0
number_of_bedrooms                148
living_area (m²)                    0
equiped_kitchen (yes:1, no:0)       0
furnished (yes:1, no:0)             0
open_fire (yes:1, no:0)             0
terrace (yes:1, no:0)               0
terrace_area (m²)                6750
garden (yes:1, no:0)                0
number_facades                   4197
swimming_pool (yes:1, no:0)         0
state_of_building                2913
province                            0
dtype: int64

In [32]:
# Number of listings per "state_of_building" category
# (dropna=True is the default: rows with a missing state are not counted here)
df["state_of_building"].value_counts(dropna=True)

state_of_building
New                   5748
Normal                2657
Excellent             1312
To be renovated       1215
To renovate            404
Fully renovated        275
Under construction       8
To restore               8
To demolish              5
Name: count, dtype: int64

In [33]:

# Names of the text-like columns (object / category dtype); these have to be
# dropped or encoded before they can be fed to a model.
# (select_dtypes(include=["number"]) would list the numeric columns the same way)
categorical_cols = list(df.select_dtypes(include=["object", "category"]).columns)
categorical_cols



['property_ID',
 'locality_name',
 'type',
 'subtype',
 'state_of_building',
 'province']

### Clean X selection

In [34]:
# Target: the listing price the model has to predict
y = df["price (€)"]

# Columns that must not be used as features
columns_to_drop = [
    "price (€)",        # the target itself
    "property_ID",      # identifier column / indirect leakage
    "locality_name",    # high cardinality (too many categories), not useful for basic ML
    "postal_code",      # high cardinality; would need a dedicated encoding to be usable
]

# Note: "price_per_square_meter" is not part of this CSV file; if it were,
# it would have to be dropped as well because it leaks the price.
X = df.drop(columns_to_drop, axis=1)

### Train-test split code

In [35]:
# Hold out part of the rows as a test set *before* imputing, so that every
# preprocessing step below can be fitted on the training rows only.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,      # 20% of the rows go to the test set
    shuffle=True,       # the default, spelled out for the reader
    random_state=777,   # fixed seed so the split is reproducible
)

## Preprocessing: imputation, encoding, standardization

### 1) Impute numerical columns with missing values 

In [36]:
# Mean imputation of missing values in the numeric columns.
# The means are learned from the training set only and then reused for the
# test set, so no information from the test rows leaks into preprocessing.
from sklearn.impute import SimpleImputer

# names of all int64 / float64 columns in the training frame
num_cols = X_train.select_dtypes(include=["int64", "float64"]).columns

# step 1: compute the per-column mean on the training data
num_imputer = SimpleImputer(strategy="mean")
num_imputer.fit(X_train[num_cols])

# step 2: replace NaNs in both splits with those training means
X_train[num_cols] = num_imputer.transform(X_train[num_cols])
X_test[num_cols] = num_imputer.transform(X_test[num_cols])

### Impute missing values in the categorical column with an "unknown" category

In [37]:
"""For state_of_building column: impute with a new category: "Unknown" = safest and most interpretable approach because:
- you don't invent information that isn't there
- models like tree-based algorithms (RandomForest, XGBoost) can learn whether "unknown" is predictive
- It preserves the missingness pattern, which often is informative."""

# Replace every missing (NaN) "state_of_building" value with the literal category
# "unknown"; the identical operation is applied to the training and the test set.
for split_frame in (X_train, X_test):
    split_frame["state_of_building"] = split_frame["state_of_building"].fillna("unknown")

### 2) Encoding: converting categorical data into numeric features

#### One-Hot Encoding for "type" and "province" column

In [None]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the "type" column: one binary column per category.
# handle_unknown="ignore" -> a category that was not seen during fit does not raise an error
# sparse_output=False     -> return a regular NumPy array instead of a sparse matrix
ohe_type = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

# learn the categories from the training set only
ohe_type.fit(X_train[["type"]])

# encode both splits with the categories learned above
type_train = ohe_type.transform(X_train[["type"]])
type_test = ohe_type.transform(X_test[["type"]])

# wrap the arrays in DataFrames that keep the original row index and get readable column names
type_feature_names = ohe_type.get_feature_names_out(["type"])
type_train_df = pd.DataFrame(type_train, index=X_train.index, columns=type_feature_names)
type_test_df = pd.DataFrame(type_test, index=X_test.index, columns=type_feature_names)

In [39]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the "province" column, following the same recipe as for "type"
ohe_province = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

# categories are learned from the training set only
ohe_province.fit(X_train[["province"]])

province_train = ohe_province.transform(X_train[["province"]])
province_test = ohe_province.transform(X_test[["province"]])

# DataFrames aligned on the original row index, with generated column names
province_feature_names = ohe_province.get_feature_names_out(["province"])
province_train_df = pd.DataFrame(province_train, index=X_train.index, columns=province_feature_names)
province_test_df = pd.DataFrame(province_test, index=X_test.index, columns=province_feature_names)

#### LabelEncoder for "subtype" column

In [40]:
from sklearn.preprocessing import LabelEncoder

# "subtype" column: replace every subtype by an integer code.
# NOTE(review): integer codes impose an arbitrary order on a nominal feature;
# acceptable for tree-based models, questionable for linear models.

# create LabelEncoder object
le_subtype = LabelEncoder()

# Fit on the training set only (no leakage); fit_transform returns a 1D integer
# array that can be assigned directly to a DataFrame column.
X_train["subtype_le"] = le_subtype.fit_transform(X_train["subtype"])

# LabelEncoder.transform raises a ValueError for any label it has not seen during
# fit, so a subtype that occurs only in the test split would crash this cell.
# Instead, reuse the learned mapping (position in classes_ == code) and give
# unseen subtypes the sentinel code -1.
subtype_to_code = {label: code for code, label in enumerate(le_subtype.classes_)}
X_test["subtype_le"] = (
    X_test["subtype"]
    .map(subtype_to_code)   # known subtypes -> same codes as in training, unseen -> NaN
    .fillna(-1)             # unseen subtypes -> -1
    .astype("int64")        # back to integers (NaN would have forced float)
)

#### OrdinalEncoder for "state_of_building" column

In [None]:
# Distinct "state_of_building" values present in the training set
# (needed to write down the category order for the ordinal encoding)
pd.unique(X_train["state_of_building"])


array(['New', 'Excellent', 'Normal', 'unknown', 'To be renovated',
       'Fully renovated', 'To renovate', 'To restore', 'To demolish',
       'Under construction'], dtype=object)

In [45]:
from sklearn.preprocessing import OrdinalEncoder

# Explicit ranking of the building states: the position in the inner list is the
# code the encoder assigns (first entry -> 0, last entry -> 9).
# One inner list per encoded column; only "state_of_building" is encoded here.
# NOTE(review): "unknown" (the imputed missing values) is ranked lowest, below
# "To demolish" - confirm that this ordering is intended.
state_order = [
    [
        "unknown",
        "To demolish",
        "Under construction",
        "To restore",
        "To renovate",
        "To be renovated",
        "Normal",
        "Fully renovated",
        "Excellent",
        "New",
    ]
]

ord_enc = OrdinalEncoder(categories=state_order)

# fit on the training set only, then encode both splits with the same encoder
ord_enc.fit(X_train[["state_of_building"]])
X_train["state_oe"] = ord_enc.transform(X_train[["state_of_building"]])
X_test["state_oe"] = ord_enc.transform(X_test[["state_of_building"]])


### 3) Standardization (Feature scaling) for continuous numerical columns (not the encoded ones)

The correct way (to avoid leakage):
`StandardScaler` must be fit only on `X_train` and then applied to both the training and the test set with the same learned parameters.

In [43]:
from sklearn.preprocessing import StandardScaler

# Continuous columns to standardize, listed by hand
# (a possible future improvement: detect the numerical columns automatically)
num_cols = [
    "living_area (m²)",
    "number_of_bedrooms",
    "number_facades",
    "terrace_area (m²)",
]

scaler = StandardScaler()

# learn the scaling parameters from the training set only (no leakage)
scaler.fit(X_train[num_cols])

# apply the same learned scaling to both splits
X_train[num_cols] = scaler.transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

### 4) Build final training and test DataFrames: assemble all transformed features into a final dataset
To inspect the results of the preprocessing steps applied to X_train and X_test, the transformed arrays have to be combined manually back into a DataFrame (see preprocessing_notebook.ipynb for the code; this part can be skipped for now).

In [None]:
""" This step is crucial — you cannot train on X_train directly because:
some columns were encoded → new DataFrames exist (type_train_df, province_train_df, etc.)
some columns were dropped (like type, province, state_of_building, etc.)
some columns were scaled"""

# 1. Columns to remove (original categorical features that now have an encoded version)
drop_cols = ["type", "province", "subtype", "state_of_building"]

# 2. Base = all columns that were not encoded + already imputed + already scaled.
#    The label-encoded subtype ("subtype_le") and the ordinal-encoded building
#    state ("state_oe") were written straight into X_train / X_test, so they are
#    already part of the base frames.
X_train_base = X_train.drop(columns=drop_cols)
X_test_base = X_test.drop(columns=drop_cols)

# 3. Add the one-hot encoded DataFrames. Only "type" and "province" live in
#    separate DataFrames; no subtype_* / state_* DataFrames are created anywhere,
#    so referencing them here would raise a NameError.
X_train_final = pd.concat([X_train_base, type_train_df, province_train_df], axis=1)
X_test_final = pd.concat([X_test_base, type_test_df, province_test_df], axis=1)

# Sanity check: a model fitted on the training features needs the test features
# in exactly the same columns and order.
assert list(X_train_final.columns) == list(X_test_final.columns), (
    "train and test feature columns differ"
)

print("Final feature count:", X_train_final.shape[1])
X_train_final.head()



## Model training

### Linear Regression Model