### Basic setup + Load Dataset

In [82]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [83]:
# Read the cleaned listings export and preview its first rows.
raw_csv_path = "../data/raw/filtered_final_cleaned_data.csv"
df = pd.read_csv(raw_csv_path)
df.head()


Unnamed: 0,property_ID,locality_name,postal_code,type,subtype,price (€),number_of_bedrooms,living_area (m²),"equiped_kitchen (yes:1, no:0)","furnished (yes:1, no:0)","open_fire (yes:1, no:0)","terrace (yes:1, no:0)",terrace_area (m²),"garden (yes:1, no:0)",number_facades,"swimming_pool (yes:1, no:0)",state_of_building,province
0,RBU60880,Maurits Sabbestraat 4 202,2800,Apartment,Apartment,329000,3.0,104,0,0,0,1,,0,2.0,0,Excellent,Antwerp
1,RBU61001,Nieuwstraat 13,2200,House,Residence,425000,3.0,378,0,0,0,1,,1,2.0,0,To be renovated,Antwerp
2,RBU62593,Veerstraat,2840,Apartment,Apartment,264700,1.0,69,0,0,0,1,25.0,0,,0,,Antwerp
3,RBU60705,Winkelomseheide 158,2440,Apartment,Apartment,290000,2.0,95,0,0,0,1,15.0,1,2.0,0,New,Antwerp
4,RBU60944,Generaal van der Meerschstraat 85 2,2300,Apartment,Apartment,180000,2.0,88,0,0,0,1,2.0,0,2.0,0,Normal,Antwerp


In [None]:
# Overview of all the columns and the total missing values per column
#df.isna().sum()

property_ID                         0
locality_name                    2328
postal_code                         0
type                                0
subtype                             0
price (€)                           0
number_of_bedrooms                148
living_area (m²)                    0
equiped_kitchen (yes:1, no:0)       0
furnished (yes:1, no:0)             0
open_fire (yes:1, no:0)             0
terrace (yes:1, no:0)               0
terrace_area (m²)                6750
garden (yes:1, no:0)                0
number_facades                   4197
swimming_pool (yes:1, no:0)         0
state_of_building                2913
province                            0
dtype: int64

In [None]:
# see the categories and total count per category for "subtype" and "state_of_building" column
#df["subtype"].value_counts()
#df["state_of_building"].value_counts()

subtype
Apartment         6592
Residence         5780
Villa              518
Ground floor       356
Penthouse          308
Duplex             248
Mixed building     230
Studio             186
Chalet              76
Master house        69
Bungalow            64
Cottage             50
Loft                38
Triplex             24
Mansion              6
Name: count, dtype: int64

In [None]:

# get categorical columns:
#categorical_cols = df.select_dtypes(include=["object", "category"]).columns.tolist()
#categorical_cols

# get numerical columns:
#numeric_cols = df.select_dtypes(include=["number"]).columns.tolist()
#numeric_cols



['property_ID',
 'locality_name',
 'type',
 'subtype',
 'state_of_building',
 'province']

### Clean X selection

In [84]:
# Target: the listing price.
y = df["price (€)"]

# Columns excluded from the feature matrix:
#   - "price (€)":     the target itself
#   - "property_ID":   row identifier (risk of indirect leakage)
#   - "locality_name": high cardinality (too many categories), not useful for basic ML
#   - "postal_code":   high cardinality; would need a dedicated encoding to be usable
# "price_per_square_meter" would also have to be dropped because it leaks the
# price, but it is not included in this CSV file.
columns_to_drop = ["price (€)", "property_ID", "locality_name", "postal_code"]

X = df.drop(columns=columns_to_drop)

### Train-test split code

In [85]:
# Split X and y BEFORE imputing/encoding/scaling, so that every preprocessing
# statistic is learned from the training rows only.

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=777
)

# The following cells assign new/updated columns directly into X_train and
# X_test. Work on explicit copies so those writes can never target a view of X
# (avoids pandas' SettingWithCopyWarning and chained-assignment ambiguity).
X_train = X_train.copy()
X_test = X_test.copy()


## Preprocessing: imputation, encoding, standardization

### 1) Impute numerical columns with missing values 

In [86]:
# Mean-impute missing numeric values; the means come from the training set only.

from sklearn.impute import SimpleImputer

# Names of all int64/float64 columns in the training frame
num_cols = X_train.select_dtypes(include=["int64", "float64"]).columns

# Learn one mean per numeric column from the training rows
num_imputer = SimpleImputer(strategy="mean")
num_imputer.fit(X_train[num_cols])

# Replace NaNs in both splits with those training means. The test split is
# only transformed, never fitted, so nothing from it leaks into the means.
X_train[num_cols] = num_imputer.transform(X_train[num_cols])
X_test[num_cols] = num_imputer.transform(X_test[num_cols])

### Impute missing values in categorical column with "unknown" category

In [87]:
# Missing "state_of_building" values get their own category, "unknown".
# This is the safest and most interpretable approach because:
# - no information is invented that isn't there
# - tree-based models (RandomForest, XGBoost) can learn whether "unknown" is predictive
# - the missingness pattern, which is often informative, is preserved

# Apply the same fill to the training and the test split (.fillna replaces every NaN)
for split in (X_train, X_test):
    split['state_of_building'] = split['state_of_building'].fillna("unknown")

### 2) Encoding: Converting categorical data into numeric features with encoding

#### One-Hot Encoding for "type" and "province" column

In [88]:
from sklearn.preprocessing import OneHotEncoder

# "type" column: one binary column per category.
# handle_unknown="ignore": a category that shows up only in the test set does not raise an error
# sparse_output=False: return a regular NumPy array instead of a memory-efficient sparse matrix
ohe_type = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

# Learn the categories from the training split only
ohe_type.fit(X_train[["type"]])
type_feature_names = ohe_type.get_feature_names_out(["type"])

# Encode both splits with the fitted encoder
type_train = ohe_type.transform(X_train[["type"]])
type_test = ohe_type.transform(X_test[["type"]])

# Wrap the encoded arrays in DataFrames that keep the original row index
type_train_df = pd.DataFrame(type_train, columns=type_feature_names, index=X_train.index)
type_test_df = pd.DataFrame(type_test, columns=type_feature_names, index=X_test.index)

In [89]:
from sklearn.preprocessing import OneHotEncoder

# "province" column: same one-hot procedure as for "type"
ohe_province = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

# Learn the categories from the training split only
ohe_province.fit(X_train[["province"]])
province_feature_names = ohe_province.get_feature_names_out(["province"])

# Encode both splits with the fitted encoder
province_train = ohe_province.transform(X_train[["province"]])
province_test = ohe_province.transform(X_test[["province"]])

# Wrap the encoded arrays in DataFrames that keep the original row index
province_train_df = pd.DataFrame(province_train, columns=province_feature_names, index=X_train.index)
province_test_df = pd.DataFrame(province_test, columns=province_feature_names, index=X_test.index)

#### LabelEncoder for "subtype" column

In [90]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd

# "subtype" column: integer-encode each category into "subtype_le".
# NOTE(review): LabelEncoder assigns codes in sorted label order, which a
# linear model will read as a numeric scale — confirm this is intended.

# Learn the label -> code mapping from the training split only (no leakage)
le_subtype = LabelEncoder()
X_train["subtype_le"] = le_subtype.fit_transform(X_train["subtype"])

# LabelEncoder.transform raises a ValueError for labels it did not see during
# fit, and some subtypes are rare enough to land only in the test split.
# Apply the learned mapping by hand instead and encode unseen subtypes as -1.
subtype_codes = {label: code for code, label in enumerate(le_subtype.classes_)}
X_test["subtype_le"] = (
    X_test["subtype"].map(subtype_codes).fillna(-1).astype("int64")
)

#### OrdinalEncoder for "state_of_building" column

In [91]:
# Distinct categories present in the "state_of_building" column of the training split
pd.unique(X_train["state_of_building"])


array(['New', 'Excellent', 'Normal', 'unknown', 'To be renovated',
       'Fully renovated', 'To renovate', 'To restore', 'To demolish',
       'Under construction'], dtype=object)

In [92]:
from sklearn.preprocessing import OrdinalEncoder
import pandas as pd

# 1. Custom category order: the position in this list becomes the encoded
#    value ("unknown" -> 0, ..., "New" -> 9)
state_order = [[
    "unknown",
    "To demolish",
    "Under construction",
    "To restore",
    "To renovate",
    "To be renovated",
    "Normal",
    "Fully renovated",
    "Excellent",
    "New",
]]

# 2. Encoder restricted to exactly these categories, in this order
ord_enc = OrdinalEncoder(categories=state_order)

# 3. Fit on the training split only, then encode both splits
ord_enc.fit(X_train[["state_of_building"]])
state_train = ord_enc.transform(X_train[["state_of_building"]])
state_test = ord_enc.transform(X_test[["state_of_building"]])

# 4. Wrap as single-column DataFrames aligned with the original row index.
#    The encoder returns arrays of shape (n_rows, 1), which map directly onto
#    one named column.
state_train_df = pd.DataFrame(state_train, columns=["state_oe"], index=X_train.index)
state_test_df = pd.DataFrame(state_test, columns=["state_oe"], index=X_test.index)


### 3) Standardization (Feature scaling) for continuous numerical columns (not the encoded ones)

The correct way (to avoid leakage):
StandardScaler must be fit only on X_train and applied to both train and test with the same learned parameters

In [93]:
from sklearn.preprocessing import StandardScaler

# Continuous columns to standardize, listed by hand
# (option for the future: detect the numerical columns automatically)
num_cols = ["living_area (m²)", "number_of_bedrooms", "number_facades", "terrace_area (m²)"]

# Learn the scaling parameters from the training split only (no leakage)
scaler = StandardScaler()
scaler.fit(X_train[num_cols])

# Scale both splits with those training parameters
X_train[num_cols] = scaler.transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

### 4) Build final training and test DataFrames: assemble all transformed features into a final dataset
To see the results of the preprocessing steps applied to X_train and X_test, the transformed arrays have to be combined back into a single DataFrame manually (see preprocessing_notebook.ipynb for the code; this part can be skipped for now).

In [94]:
# This step is crucial: the model cannot be trained on X_train directly because
# - some columns were encoded into separate DataFrames (type_train_df, province_train_df, etc.)
# - the original categorical columns (type, province, state_of_building, subtype) must be removed
# - some columns were scaled

# Make sure no "state_oe" column from previous processing is left in either split
X_train = X_train.drop(columns=["state_oe"], errors="ignore")
X_test = X_test.drop(columns=["state_oe"], errors="ignore")

# 1. Original categorical features that are now available in encoded form
drop_cols = ["type", "subtype", "state_of_building", "province"]

# 2. Base = every remaining column (not encoded separately, already imputed and scaled)
X_train_base = X_train.drop(columns=drop_cols)
X_test_base = X_test.drop(columns=drop_cols)

# 3. Append the one-hot and ordinal encoded DataFrames column-wise
train_parts = [X_train_base, type_train_df, province_train_df, state_train_df]
test_parts = [X_test_base, type_test_df, province_test_df, state_test_df]

X_train_final = pd.concat(train_parts, axis=1)
X_test_final = pd.concat(test_parts, axis=1)

print("Final feature count:", X_train_final.shape[1])
X_train_final.head()



Final feature count: 25


Unnamed: 0,number_of_bedrooms,living_area (m²),"equiped_kitchen (yes:1, no:0)","furnished (yes:1, no:0)","open_fire (yes:1, no:0)","terrace (yes:1, no:0)",terrace_area (m²),"garden (yes:1, no:0)",number_facades,"swimming_pool (yes:1, no:0)",...,province_Brussels,province_East-Flanders,province_Flemish-Brabant,province_Hainaut,province_Limburg,province_Liège,province_Luxembourg,province_Namur,province_West-Flanders,state_oe
1683,0.194195,-0.333929,0.0,0.0,0.0,1.0,0.007449,1.0,1.580107,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0
7223,2.259871,1.448749,1.0,0.0,0.0,1.0,0.0,1.0,1.580107,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,8.0
9249,0.882754,0.081086,1.0,0.0,0.0,1.0,1.359691,1.0,0.175982,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,6.0
6150,-0.494364,0.411212,0.0,0.0,0.0,0.0,0.0,0.0,0.175982,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
6522,-1.182922,-1.003612,1.0,0.0,0.0,0.0,0.0,0.0,-1.228143,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,6.0


In [95]:
# Safety check: confirm duplicates are gone (if empty: safe!)
#X_train_final.columns[X_train_final.columns.duplicated()]


## Model training

## 1) Linear Regression Model

In [96]:
# Helper functions: one trains the linear regression model,
# the other evaluates a fitted model (MAE, RMSE, R²)

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

def train_linear_regression(X_train, y_train):
    """Fit a LinearRegression on the given features and target and return it."""
    # LinearRegression.fit returns the fitted estimator itself.
    return LinearRegression().fit(X_train, y_train)

def evaluate_model(model, X_train, y_train, X_test, y_test):
    """Print test MAE/RMSE and train/test R² for a fitted model.

    Args:
        model: fitted estimator exposing ``predict``.
        X_train, y_train: training features and target (used for the train R²).
        X_test, y_test: held-out features and target (used for MAE, RMSE, R²).

    Returns:
        Tuple ``(train_preds, test_preds)`` with the model's predictions.
    """
    train_preds, test_preds = model.predict(X_train), model.predict(X_test)

    # Error metrics are reported on the test split only; R² on both splits
    # so the gap between them is visible.
    test_mae = mean_absolute_error(y_test, test_preds)
    test_rmse = np.sqrt(mean_squared_error(y_test, test_preds))
    train_r2 = r2_score(y_train, train_preds)
    test_r2 = r2_score(y_test, test_preds)

    report_lines = [
        "Model Evaluation Results:",
        f"MAE (test):   {test_mae:.2f}",
        f"RMSE (test):  {test_rmse:.2f}",
        f"R² (train):   {train_r2:.4f}",
        f"R² (test):    {test_r2:.4f}",
    ]
    print("\n".join(report_lines))

    return train_preds, test_preds


In [97]:
# Fit the linear regression on the fully preprocessed training features
model_lr = train_linear_regression(X_train_final, y_train)

# Print the metrics and keep the predictions for both splits
train_preds, test_preds = evaluate_model(
    model=model_lr,
    X_train=X_train_final,
    y_train=y_train,
    X_test=X_test_final,
    y_test=y_test,
)


Model Evaluation Results:
MAE (test):   107562.48
RMSE (test):  240518.24
R² (train):   0.5279
R² (test):    0.4273


## 2) Random Forest Model

### Clean X selection

In [77]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Step 1: Load CSV (fresh, unprocessed copy for the Random Forest pipeline)
raw_csv_path = "../data/raw/filtered_final_cleaned_data.csv"
df = pd.read_csv(raw_csv_path)

def prepare_features_and_target(df, test_size=0.2, random_state=777):
    """
    Build the feature matrix and target from ``df`` and split them.

    Drops the target ("price (€)") and the columns "property_ID",
    "locality_name" and "postal_code" from the features, then performs a
    random train/test split.

    Args:
        df: DataFrame containing at least the four columns listed above.
        test_size: Share of rows assigned to the test split (default 0.2).
        random_state: Seed forwarded to ``train_test_split`` (default 777).

    Returns:
        X_train, X_test, y_train, y_test. The two feature frames are
        independent copies, so later steps can add or overwrite columns
        without touching ``df``.

    Raises:
        KeyError: if one of the columns to drop is missing from ``df``.
    """
    target_col = "price (€)"
    # Note: "price_per_square_meter" would also leak the price, but it is not
    # included in the current CSV.
    columns_to_drop = [target_col, "property_ID", "locality_name", "postal_code"]

    # 1. Create X and y
    X = df.drop(columns=columns_to_drop)
    y = df[target_col]

    # 2. Train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
    )

    # Downstream helpers write columns into these frames; hand out real copies.
    return X_train.copy(), X_test.copy(), y_train, y_test


### Imputation for numerical and categorical columns to handle NaNs

In [78]:
from sklearn.impute import SimpleImputer
import pandas as pd

# Function to impute numerical columns
def impute_numeric_columns(X_train, X_test):
    """
    Mean-impute the numerical columns of X_train and X_test.

    The means are computed on X_train only and then applied to both splits,
    so no information from the test split is used.

    Args:
        X_train: training features.
        X_test: test features with the same columns as X_train.

    Returns:
        (X_train, X_test, num_cols): imputed copies of both frames and the
        Index of the int64/float64 columns that were imputed. The frames
        passed in are left unmodified.
    """
    # Work on copies so the caller's frames are not changed as a side effect.
    X_train = X_train.copy()
    X_test = X_test.copy()

    # Select numerical columns automatically
    num_cols = X_train.select_dtypes(include=["int64", "float64"]).columns

    # Nothing to impute; SimpleImputer cannot be fitted on zero columns.
    if len(num_cols) == 0:
        return X_train, X_test, num_cols

    # Create imputer (mean strategy)
    imputer = SimpleImputer(strategy="mean")

    # Fit on training data and fill its NaNs
    X_train[num_cols] = imputer.fit_transform(X_train[num_cols])

    # Transform test data using the SAME means
    X_test[num_cols] = imputer.transform(X_test[num_cols])

    return X_train, X_test, num_cols

# Function to impute categorical column "state of building"
def impute_categorical_state(X_train, X_test, column="state_of_building"):
    """
    Fill missing values of ``column`` with the category "unknown".

    Args:
        X_train: training features containing ``column``.
        X_test: test features containing ``column``.
        column: name of the categorical column to fill
            (default "state_of_building").

    Returns:
        (X_train, X_test): copies of both frames with NaNs in ``column``
        replaced by "unknown". The frames passed in are left unmodified.
    """
    # Work on copies so the caller's frames are not changed as a side effect.
    X_train = X_train.copy()
    X_test = X_test.copy()

    X_train[column] = X_train[column].fillna("unknown")
    X_test[column] = X_test[column].fillna("unknown")

    return X_train, X_test



### Encoding: LabelEncoding and OrdinalEncoding

In [79]:
from sklearn.preprocessing import LabelEncoder

def label_encode_columns(X_train, X_test, columns):
    """
    Label-encode several categorical columns, one LabelEncoder per column.

    For every ``col`` in ``columns`` a new column ``col + "_le"`` is added to
    both frames. Each encoder is fitted on X_train only (no data leakage).
    Labels that occur in X_test but were not seen in X_train are encoded as
    -1 instead of raising an error.

    Args:
        X_train: training features containing ``columns``.
        X_test: test features containing ``columns``.
        columns: iterable of column names to encode.

    Returns:
        (X_train, X_test, encoders): copies of both frames with the added
        ``*_le`` columns, and a dict mapping each column name to its fitted
        LabelEncoder. The frames passed in are left unmodified.
    """
    # Work on copies so the caller's frames are not changed as a side effect.
    X_train = X_train.copy()
    X_test = X_test.copy()

    encoders = {}

    for col in columns:
        le = LabelEncoder()
        X_train[col + "_le"] = le.fit_transform(X_train[col])

        # le.transform would raise a ValueError on labels unseen during fit;
        # apply the learned label -> code mapping by hand and use -1 for those.
        codes = {label: code for code, label in enumerate(le.classes_)}
        X_test[col + "_le"] = X_test[col].map(codes).fillna(-1).astype("int64")

        encoders[col] = le  # store encoder

    return X_train, X_test, encoders

from sklearn.preprocessing import OrdinalEncoder

def ordinal_encode_state(X_train, X_test, column="state_of_building"):
    """
    Ordinal-encode ``column`` using a fixed custom category order.

    The encoded values are written to a new column ``column + "_oe"``; the
    position of a category in the order below is its encoded value
    ("unknown" -> 0, ..., "New" -> 9). The encoder is fitted on X_train only
    (no leakage).

    Args:
        X_train: training features containing ``column``.
        X_test: test features containing ``column``.
        column: name of the column to encode (default "state_of_building").

    Returns:
        (X_train, X_test, oe): copies of both frames with the added ``*_oe``
        column, and the fitted OrdinalEncoder. The frames passed in are left
        unmodified.
    """
    # Work on copies so the caller's frames are not changed as a side effect.
    X_train = X_train.copy()
    X_test = X_test.copy()

    state_order = [[
        "unknown",
        "To demolish",
        "Under construction",
        "To restore",
        "To renovate",
        "To be renovated",
        "Normal",
        "Fully renovated",
        "Excellent",
        "New"
    ]]

    oe = OrdinalEncoder(categories=state_order)

    # The encoder returns (n_rows, 1) arrays; flatten them into single columns.
    X_train[column + "_oe"] = oe.fit_transform(X_train[[column]]).ravel()
    X_test[column + "_oe"] = oe.transform(X_test[[column]]).ravel()

    return X_train, X_test, oe



### Final assembly

In [80]:
# Step 1: Load CSV (already loaded into `df`; line kept for reference)
#df = pd.read_csv("../data/raw/filtered_final_cleaned_data.csv")

# Step 2: Build X and y and split them into train and test
X_train, X_test, y_train, y_test = prepare_features_and_target(df)

# Step 3: Mean-impute the numerical columns
X_train, X_test, num_cols = impute_numeric_columns(X_train, X_test)

# Step 4: Fill missing "state_of_building" values with "unknown"
X_train, X_test = impute_categorical_state(X_train, X_test)

# Step 5: LabelEncoding for the "type", "province" and "subtype" columns
nominal_cols = ["type", "province", "subtype"]
X_train, X_test, le_encoders = label_encode_columns(X_train, X_test, columns=nominal_cols)

# Step 6: OrdinalEncoding for the "state_of_building" column
X_train, X_test, state_encoder = ordinal_encode_state(X_train, X_test, column="state_of_building")

# Step 7: Assemble the final feature frames by removing the original
# categorical columns, which are now present in encoded form
encoded_source_cols = ["type", "subtype", "state_of_building", "province"]
X_train_final = X_train.drop(columns=encoded_source_cols)
X_test_final = X_test.drop(columns=encoded_source_cols)

# y_train and y_test are used exactly as returned by the split.


## Random Forest Training + Evaluation

In [81]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np


# 1. Create and train the model
rf = RandomForestRegressor(
    n_estimators=300,       # number of trees
    max_depth=15,           # limit tree depth
    min_samples_split=10,   # require more samples to split
    min_samples_leaf=5,     # prevent tiny leaf nodes
    random_state=777,
    n_jobs=-1,              # use all CPU cores
)
rf.fit(X_train_final, y_train)

# 2. Predictions for both splits
y_pred_train = rf.predict(X_train_final)
y_pred_test = rf.predict(X_test_final)


# 3. Evaluation
def _regression_scores(y_true, y_pred):
    """Return (MAE, RMSE, R²) for one set of predictions."""
    return (
        mean_absolute_error(y_true, y_pred),
        np.sqrt(mean_squared_error(y_true, y_pred)),
        r2_score(y_true, y_pred),
    )


train_mae, train_rmse, train_r2 = _regression_scores(y_train, y_pred_train)
test_mae, test_rmse, test_r2 = _regression_scores(y_test, y_pred_test)

print("RANDOM FOREST PERFORMANCE")
for header, mae, rmse, r2 in (
    ("\n--- TRAIN SET ---", train_mae, train_rmse, train_r2),
    ("\n--- TEST SET ---", test_mae, test_rmse, test_r2),
):
    print(header)
    print(f"MAE:  {mae:.2f}")
    print(f"RMSE: {rmse:.2f}")
    print(f"R²:   {r2:.3f}")


RANDOM FOREST PERFORMANCE

--- TRAIN SET ---
MAE:  57301.35
RMSE: 107100.99
R²:   0.820

--- TEST SET ---
MAE:  79236.76
RMSE: 218794.06
R²:   0.526
