# Linear Regression Model

### Basic setup + Load Dataset

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the filtered/cleaned dataset into the working DataFrame `df`.
# The path is relative to the folder this notebook is run from.
raw_csv_path = "../data/raw/filtered_final_cleaned_data.csv"
df = pd.read_csv(raw_csv_path)


### Select the features (X) and the target (y)

In [None]:
# Target: the column the model has to predict.
y = df["price (€)"]

# Features: every column of `df` except the ones listed here.
columns_to_drop = [
    "price (€)",      # the target itself
    "property_ID",    # identifier column / indirect leakage
    "locality_name",  # high cardinality (too many categories)
    "postal_code",    # high cardinality unless encoded with one-hot encoding
]
# "price_per_square_meter" would also have to be dropped because it leaks the price,
# but that column is not included in this CSV file.
X = df.drop(columns=columns_to_drop)

### Train-test split code

In [85]:
# split X and y before imputing

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as test set; the fixed random_state makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=777)
X_train, X_test, y_train, y_test = split_parts


## Preprocessing: imputation, encoding, standardization

### 1) Impute numerical columns with missing values 

In [86]:
# mean imputation for missing values for numeric columns using the training set mean

from sklearn.impute import SimpleImputer

# Names of the int64/float64 columns of the training split
num_cols = X_train.select_dtypes(include=["int64", "float64"]).columns

# Imputer that replaces missing values (NaN) with a per-column mean.
# Fitting on X_train only means the means are computed from training data alone,
# so nothing about the test set leaks into the preprocessing.
num_imputer = SimpleImputer(strategy="mean")
num_imputer.fit(X_train[num_cols])

# Fill the NaNs of both splits with the means learned from the training split
for split_frame in (X_train, X_test):
    split_frame[num_cols] = num_imputer.transform(split_frame[num_cols])

### Impute missing values in categorical column with "unknown" category

In [87]:
"""For state_of_building column: impute with a new category: "Unknown" = safest and most interpretable approach because:
- you don't invent information that isn't there
- models like tree-based algorithms (RandomForest, XGBoost) can learn whether "unknown" is predictive
- It preserves the missingness pattern, which often is informative."""

# Replace every missing "state_of_building" value with the explicit category "unknown".
# The same constant is used for the training and the test split, so nothing is learned
# from the data in this step.
for split_frame in (X_train, X_test):
    split_frame["state_of_building"] = split_frame["state_of_building"].fillna("unknown")

### 2) Encoding: Converting categorical data into numeric features with encoding

#### One-Hot Encoding for "type" and "province" column

In [88]:
from sklearn.preprocessing import OneHotEncoder
# "type" column:
# One-hot encoder for "type": one binary column per category.
# handle_unknown="ignore" avoids an error when the test split contains a category that
# was not present during fit; sparse_output=False returns a plain NumPy array instead
# of a sparse matrix.
ohe_type = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

# Learn the categories from the training split only, then encode both splits
ohe_type.fit(X_train[["type"]])
type_train = ohe_type.transform(X_train[["type"]])
type_test = ohe_type.transform(X_test[["type"]])

# Wrap the encoded arrays in DataFrames that keep the row index of their split
type_feature_names = ohe_type.get_feature_names_out(["type"])
type_train_df = pd.DataFrame(type_train, index=X_train.index, columns=type_feature_names)
type_test_df = pd.DataFrame(type_test, index=X_test.index, columns=type_feature_names)

In [89]:
from sklearn.preprocessing import OneHotEncoder
# One-hot encode "province" with the same settings as the "type" encoder:
# unseen test categories are ignored and the output is a dense array.
ohe_province = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

# Categories are learned from the training split only
ohe_province.fit(X_train[["province"]])
province_train = ohe_province.transform(X_train[["province"]])
province_test = ohe_province.transform(X_test[["province"]])

# Back to DataFrames, aligned on the original row index of each split
province_feature_names = ohe_province.get_feature_names_out(["province"])
province_train_df = pd.DataFrame(province_train, index=X_train.index, columns=province_feature_names)
province_test_df = pd.DataFrame(province_test, index=X_test.index, columns=province_feature_names)

#### LabelEncoder for "subtype" column

In [90]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd

# "subtype" column:

# Encode "subtype" as integer codes learned from the training split only.
# NOTE(review): LabelEncoder is documented as an encoder for target labels and assigns
# codes without a meaningful ranking; a linear model will still read them as ordered.
le_subtype = LabelEncoder()

# fit_transform returns a 1-D array of integers, so it can be assigned directly to a column
X_train["subtype_le"] = le_subtype.fit_transform(X_train["subtype"])

# LabelEncoder.transform raises ValueError for labels that were not seen during fit, so a
# subtype that only occurs in the test split would abort the notebook. Map the test labels
# through the fitted classes instead (no refit, so no leakage) and give unseen subtypes
# the sentinel code -1, in line with the one-hot encoders that use handle_unknown="ignore".
subtype_codes = {label: code for code, label in enumerate(le_subtype.classes_)}
X_test["subtype_le"] = X_test["subtype"].map(subtype_codes).fillna(-1).astype("int64")

#### OrdinalEncoder for "state_of_building" column

In [91]:
# Show the distinct categories present in the training split's "state_of_building" column
pd.unique(X_train["state_of_building"])


array(['New', 'Excellent', 'Normal', 'unknown', 'To be renovated',
       'Fully renovated', 'To renovate', 'To restore', 'To demolish',
       'Under construction'], dtype=object)

In [92]:
from sklearn.preprocessing import OrdinalEncoder
import pandas as pd

# Custom category order for "state_of_building": the position in the inner list is the
# integer the encoder assigns ("unknown" -> 0, ..., "New" -> 9).
# OrdinalEncoder expects one list of categories per encoded column, hence the nesting.
state_order = [[
    "unknown",
    "To demolish",
    "Under construction",
    "To restore",
    "To renovate",
    "To be renovated",
    "Normal",
    "Fully renovated",
    "Excellent",
    "New",
]]

# Encoder bound to the fixed order above; it is fitted on the training split only
ord_enc = OrdinalEncoder(categories=state_order)
ord_enc.fit(X_train[["state_of_building"]])

# Encode both splits; each result is a 2-D array with shape (n_rows, 1)
state_train = ord_enc.transform(X_train[["state_of_building"]])
state_test = ord_enc.transform(X_test[["state_of_building"]])

# Single-column DataFrames named "state_oe" that keep the row index of their split
state_train_df = pd.DataFrame(state_train, columns=["state_oe"], index=X_train.index)
state_test_df = pd.DataFrame(state_test, columns=["state_oe"], index=X_test.index)


### 3) Standardization (Feature scaling) for continuous numerical columns (not the encoded ones)

To avoid data leakage, `StandardScaler` must be fit on X_train only and then applied to both the train and the test set with the same learned parameters.

In [93]:
from sklearn.preprocessing import StandardScaler

# Continuous columns to standardize, listed by hand (automatic detection of numerical
# columns is an option for the future).
# Note: this rebinds `num_cols`, which the imputation step used for all numeric columns.
num_cols = ["living_area (m²)", "number_of_bedrooms", "number_facades", "terrace_area (m²)"]

# Learn the scaling parameters from the training split only (no leakage)
scaler = StandardScaler()
scaler.fit(X_train[num_cols])

# Apply the same fitted scaler to both splits
for split_frame in (X_train, X_test):
    split_frame[num_cols] = scaler.transform(split_frame[num_cols])

### 4) Build final training and test DataFrames: assemble all transformed features into a final dataset
To see the results of the preprocessing steps that were applied to X_train and X_test, the transformed arrays have to be combined manually back into a DataFrame (see preprocessing_notebook.ipynb for the inspection code; that part is skipped here for now).

In [94]:
""" This step is crucial —  cannot train on X_train directly because:
some columns were encoded → new DataFrames exist (type_train_df, province_train_df, etc.)
some columns were dropped (like type, province, state_of_building, subtype)
some columns were scaled"""

# Remove a "state_oe" column left in X_train / X_test by earlier processing, if there is
# one, so that it is not duplicated when state_train_df / state_test_df are appended.
X_train = X_train.drop(columns=["state_oe"], errors="ignore")
X_test = X_test.drop(columns=["state_oe"], errors="ignore")

# Original categorical columns that are replaced by their encoded counterparts
drop_cols = ["type", "subtype", "state_of_building", "province"]

# Base frames: everything that stays as it is (imputed / scaled columns and "subtype_le")
X_train_base = X_train.drop(columns=drop_cols)
X_test_base = X_test.drop(columns=drop_cols)

# Pieces of the final feature matrices, in the same order for both splits:
# base columns, one-hot "type", one-hot "province", ordinal "state_oe"
train_parts = [X_train_base, type_train_df, province_train_df, state_train_df]
test_parts = [X_test_base, type_test_df, province_test_df, state_test_df]

# Column-wise concatenation; rows are aligned on the index of each split
X_train_final = pd.concat(train_parts, axis=1)
X_test_final = pd.concat(test_parts, axis=1)

print("Final feature count:", X_train_final.shape[1])
X_train_final.head()



Final feature count: 25


Unnamed: 0,number_of_bedrooms,living_area (m²),"equiped_kitchen (yes:1, no:0)","furnished (yes:1, no:0)","open_fire (yes:1, no:0)","terrace (yes:1, no:0)",terrace_area (m²),"garden (yes:1, no:0)",number_facades,"swimming_pool (yes:1, no:0)",...,province_Brussels,province_East-Flanders,province_Flemish-Brabant,province_Hainaut,province_Limburg,province_Liège,province_Luxembourg,province_Namur,province_West-Flanders,state_oe
1683,0.194195,-0.333929,0.0,0.0,0.0,1.0,0.007449,1.0,1.580107,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0
7223,2.259871,1.448749,1.0,0.0,0.0,1.0,0.0,1.0,1.580107,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,8.0
9249,0.882754,0.081086,1.0,0.0,0.0,1.0,1.359691,1.0,0.175982,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,6.0
6150,-0.494364,0.411212,0.0,0.0,0.0,0.0,0.0,0.0,0.175982,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
6522,-1.182922,-1.003612,1.0,0.0,0.0,0.0,0.0,0.0,-1.228143,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,6.0


In [95]:
# Safety check: the assembled training frame must not contain duplicated column names.
# Running the check (instead of keeping it commented out) makes the notebook stop here
# with a clear message if a column was appended twice.
duplicated_columns = X_train_final.columns[X_train_final.columns.duplicated()]
assert duplicated_columns.empty, f"Duplicated feature columns: {list(duplicated_columns)}"


## Model training

### 1) Linear Regression Model

In [96]:
# Helper functions: one trains the model,
# the other evaluates it (MAE, RMSE, R²)

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

def train_linear_regression(X_train, y_train):
    """Fit a LinearRegression (default settings) on the given training data.

    Parameters
    ----------
    X_train : training feature matrix passed to ``fit``.
    y_train : training target values passed to ``fit``.

    Returns
    -------
    The fitted ``LinearRegression`` instance.
    """
    # fit() returns the estimator itself, so the fitted model can be returned directly
    return LinearRegression().fit(X_train, y_train)

def evaluate_model(model, X_train, y_train, X_test, y_test):
    """Print test-set error metrics and train/test R² for a fitted model.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    X_train, y_train : training features and targets (used for R² on train).
    X_test, y_test : test features and targets (used for MAE, RMSE and R² on test).

    Returns
    -------
    tuple
        ``(train_preds, test_preds)`` as returned by ``model.predict``.
    """
    train_preds, test_preds = model.predict(X_train), model.predict(X_test)

    # Error metrics are reported on the test split only
    test_mae = mean_absolute_error(y_test, test_preds)
    test_rmse = np.sqrt(mean_squared_error(y_test, test_preds))

    # R² on both splits, so the gap between train and test is visible
    train_r2 = r2_score(y_train, train_preds)
    test_r2 = r2_score(y_test, test_preds)

    report_lines = (
        "Model Evaluation Results:",
        f"MAE (test):   {test_mae:.2f}",
        f"RMSE (test):  {test_rmse:.2f}",
        f"R² (train):   {train_r2:.4f}",
        f"R² (test):    {test_r2:.4f}",
    )
    print("\n".join(report_lines))

    return train_preds, test_preds


In [97]:
# Train the model
model_lr = train_linear_regression(X_train_final, y_train)

# Evaluate on the test set
train_preds, test_preds = evaluate_model(
    model_lr,
    X_train_final, y_train,
    X_test_final, y_test)


Model Evaluation Results:
MAE (test):   107562.48
RMSE (test):  240518.24
R² (train):   0.5279
R² (test):    0.4273
