In [1]:
import sys, os
sys.path.append(os.path.abspath(".."))  # add project root

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, r2_score

pd.set_option("display.float_format", lambda x: "%.2f" % x)
pd.set_option("display.max_columns", None)
pd.set_option("display.width", None)
plt.style.use("seaborn-v0_8-whitegrid")

#from pipelines.pipeline_runner import run_full_pipeline_with_split  
#from utils.ml_utils import prepare_X_y  

from pipelines.export_for_kaggle import export_for_kaggle  

from utils.stage3_utils import (
    fit_stage3,
    transform_stage3,
    prepare_X_y
)




## Pre-processing script

In [2]:
export_for_kaggle()

Exporting pre-processed data for Kaggle

[1/3] Running Stage 1 (Basic Cleaning)...
   ✓ Stage 1 complete: 22,306 rows, 71 columns

[2/3] Running Stage 2 (Plausibility & Outliers)...
   ✓ Stage 2 complete: 20,900 rows, 42 columns

[3/3] Running Stage 2.5 (Geo Enrichment)...
   ✓ Stage 2.5 complete: 21,472 rows, 59 columns

[SAVE] Writing to ../data/pre_processed/pre_processed_data_for_kaggle.csv...
   ✓ Saved successfully (9.55 MB)

EXPORT COMPLETE


Unnamed: 0,url,property_id,price,rooms,area,state,facades_number,is_furnished,has_terrace,has_garden,has_swimming_pool,has_equipped_kitchen,build_year,cellar,has_garage,kitchen_surface_house,bathrooms,heating_type,terrace_surface_apartment,land_surface_house,sewer_connection,running_water,primary_energy_consumption,co2_house,certification_electrical_installation,preemption_right,flooding_area_type,leased,living_room_surface,attic_house,glazing_type,elevator,entry_phone_apartment,access_disabled,apartement_floor_apartment,number_floors_apartment,toilets,cadastral_income_house,property_subtype,postal_code,locality,property_type,municipality_nl,municipality_fr,arrondissement_nl,arrondissement_fr,province_nl,province_fr,median_income,region,address,province,apt_avg_m2_province,house_avg_m2_province,apt_avg_m2_region,house_avg_m2_region,province_benchmark_m2,region_benchmark_m2,national_benchmark_m2
0,https://immovlan.be/en/detail/apartment/for-sa...,rbu69315,269000.00,1.00,51.00,-1,-1.00,0,1,0,-1,-1,1968.00,1,1,-1.00,1.00,-1,-1.00,-1.00,-1,-1,184.00,-1.00,-1,0,-1,-1,-1.00,-1.00,-1,-1.00,-1.00,-1,-1.00,-1.00,-1.00,-1.00,apartment,2600,berchem,Apartment,Antwerpen,Anvers,Arrondissement Antwerpen,Arrondissement d’Anvers,Provincie Antwerpen,Province d’Anvers,23.99,Flanders,Grote Steenweg 227 203 2600 Berchem,Provincie Antwerpen,2849,2419,3133,2266,2849,3133,3091
1,https://immovlan.be/en/detail/apartment/for-sa...,rbu69316,299000.00,2.00,79.00,2,2.00,0,1,0,-1,-1,2011.00,-1,1,-1.00,1.00,5,5.00,-1.00,-1,1,100.00,-1.00,1,1,1,0,22.00,-1.00,-1,1.00,-1.00,-1,2.00,-1.00,1.00,-1.00,apartment,2050,antwerp,Apartment,Antwerpen,Anvers,Arrondissement Antwerpen,Arrondissement d’Anvers,Provincie Antwerpen,Province d’Anvers,23.99,Flanders,Hugues C. Pernathlaan 22 203 2050 Antwerp,Provincie Antwerpen,2849,2419,3133,2266,2849,3133,3091
2,https://immovlan.be/en/detail/apartment/for-sa...,rbu68996,359000.00,3.00,110.00,-1,-1.00,0,1,0,-1,-1,-1.00,-1,1,-1.00,1.00,3,-1.00,-1.00,-1,1,-1.00,-1.00,-1,-1,-1,0,-1.00,-1.00,-1,0.00,-1.00,-1,2.00,-1.00,1.00,-1.00,apartment,2610,wilrijk,Apartment,Antwerpen,Anvers,Arrondissement Antwerpen,Arrondissement d’Anvers,Provincie Antwerpen,Province d’Anvers,23.99,Flanders,,Provincie Antwerpen,2849,2419,3133,2266,2849,3133,3091
3,https://immovlan.be/en/detail/apartment/for-sa...,rbu69309,239000.00,2.00,84.00,2,2.00,0,1,0,-1,-1,1963.00,-1,-1,-1.00,1.00,3,-1.00,-1.00,-1,1,163.00,-1.00,0,1,-1,0,-1.00,-1.00,-1,1.00,-1.00,-1,5.00,-1.00,-1.00,-1.00,apartment,2060,antwerp,Apartment,Antwerpen,Anvers,Arrondissement Antwerpen,Arrondissement d’Anvers,Provincie Antwerpen,Province d’Anvers,23.99,Flanders,Kerkstraat 2 501 2060 Antwerp,Provincie Antwerpen,2849,2419,3133,2266,2849,3133,3091
4,https://immovlan.be/en/detail/apartment/for-sa...,rbu69307,189000.00,2.00,80.00,2,2.00,0,0,0,-1,-1,1930.00,-1,-1,-1.00,-1.00,-1,-1.00,-1.00,-1,-1,149.00,-1.00,1,0,1,0,-1.00,-1.00,-1,0.00,-1.00,-1,1.00,-1.00,-1.00,-1.00,apartment,2610,wilrijk,Apartment,Antwerpen,Anvers,Arrondissement Antwerpen,Arrondissement d’Anvers,Provincie Antwerpen,Province d’Anvers,23.99,Flanders,Golfstraat 7 101 2610 Wilrijk,Provincie Antwerpen,2849,2419,3133,2266,2849,3133,3091
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21467,https://immovlan.be/en/detail/villa/for-sale/1...,rbt75700,1550000.00,10.00,538.00,-1,4.00,-1,0,0,-1,-1,-1.00,-1,1,-1.00,3.00,-1,-1.00,2711.00,-1,-1,-1.00,-1.00,-1,1,1,0,-1.00,1.00,-1,0.00,-1.00,0,-1.00,2.00,-1.00,-1.00,villa,1980,zemst,House,Zemst,Zemst,Arrondissement Halle-Vilvoorde,Arrondissement de Hal-Vilvorde,Provincie Vlaams-Brabant,Province du Brabant flamand,32.36,Flanders,Bosstraat 46 1980 Zemst,Provincie Vlaams-Brabant,3260,2539,3133,2266,2539,2266,2076
21468,https://immovlan.be/en/detail/villa/for-sale/1...,rbu23616,749000.00,5.00,245.00,2,4.00,-1,1,1,1,2,-1.00,-1,1,12.00,2.00,2,-1.00,1842.00,-1,-1,462.00,-1.00,0,1,1,-1,50.00,-1.00,1,-1.00,-1.00,-1,-1.00,2.00,3.00,-1.00,villa,1980,zemst,House,Zemst,Zemst,Arrondissement Halle-Vilvoorde,Arrondissement de Hal-Vilvorde,Provincie Vlaams-Brabant,Province du Brabant flamand,32.36,Flanders,,Provincie Vlaams-Brabant,3260,2539,3133,2266,2539,2266,2076
21469,https://immovlan.be/en/detail/residence/for-sa...,rbt43515,349000.00,3.00,140.00,-1,3.00,-1,1,0,-1,2,1937.00,-1,1,-1.00,1.00,-1,-1.00,95.00,-1,-1,249.00,-1.00,-1,1,1,0,-1.00,-1.00,-1,0.00,-1.00,0,-1.00,-1.00,2.00,-1.00,house,1980,eppegem,House,Zemst,Zemst,Arrondissement Halle-Vilvoorde,Arrondissement de Hal-Vilvorde,Provincie Vlaams-Brabant,Province du Brabant flamand,32.36,Flanders,Brusselsesteenweg 448 1980 Eppegem,Provincie Vlaams-Brabant,3260,2539,3133,2266,2539,2266,2076
21470,https://immovlan.be/en/detail/villa/for-sale/1...,vbc94354,3140000.00,4.00,492.00,4,4.00,0,1,1,0,3,2025.00,1,-1,46.00,1.00,-1,126.00,1253.00,1,1,66.00,-1.00,-1,-1,1,-1,70.00,-1.00,1,-1.00,1.00,1,-1.00,2.00,3.00,-1.00,villa,1933,sterrebeek,House,Zaventem,Zaventem,Arrondissement Halle-Vilvoorde,Arrondissement de Hal-Vilvorde,Provincie Vlaams-Brabant,Province du Brabant flamand,29.89,Flanders,du Roy de Bliquylaan 33 1933 Sterrebeek,Provincie Vlaams-Brabant,3260,2539,3133,2266,2539,2266,2076


## Best models House vs Apartment

### Houses

In [16]:
# ============================================================
# FINAL PRODUCTION MODEL — HOUSE-ONLY XGBOOST (FULL DATA)
# Saves to ./models/  => model_xgb_house.pkl + stage3_pipeline_house.pkl
# ============================================================

import pandas as pd
import numpy as np
import joblib
import os
from xgboost import XGBRegressor
import warnings
warnings.filterwarnings("ignore")

# ========================================
# CONFIG
# ========================================
DATA_PATH = "../data/pre_processed/pre_processed_data_for_kaggle.csv"
RANDOM_STATE = 42
MODELS_DIR = "../models/"

# Create directory if not exists
os.makedirs(MODELS_DIR, exist_ok=True)

# ========================================
# REDUCED FEATURE LIST
# ========================================
REDUCED_FEATURES = [
    "area",
    "postal_code_te_price",
    "locality_te_price",
    "bathrooms",
    "rooms",
    "primary_energy_consumption",
    "state",
    "province_benchmark_m2",
    "postal_code",
    "region_benchmark_m2",
    "property_subtype_te_price",
    "apt_avg_m2_region",
    "toilets",
    "property_type_te_price",
    "median_income",
    "build_year",
    "house_avg_m2_province",
    "has_garage",
    "apt_avg_m2_province",
    "has_garden",
    "has_terrace",
    "facades_number",
    "has_swimming_pool",
    "house_avg_m2_region",
    "has_equipped_kitchen",
]

# ========================================
# 1) LOAD DATA — HOUSE ONLY
# ========================================
df = pd.read_csv(DATA_PATH)
df = df[df["property_type"] == "House"].reset_index(drop=True)

print(f"Loaded {len(df)} house samples.")

# ========================================
# 2) FIT STAGE 3 ON FULL HOUSE DATA
# ========================================
fitted_stage3 = fit_stage3(df)

# ========================================
# 3) TRANSFORM FULL DATASET
# ========================================
df_s3 = transform_stage3(df, fitted_stage3)

# ========================================
# 4) Final X and y — only reduced features
# ========================================
available_features = [f for f in REDUCED_FEATURES if f in df_s3.columns]

X = df_s3[available_features].copy()
y = df_s3["price"].copy()

print(f"Using {len(available_features)} reduced features.")

# ========================================
# 5) Train FINAL Production Model (No CV)
# ========================================
model_xgb_house = XGBRegressor(
    n_estimators=1229,
    max_depth=8,
    learning_rate=0.027411641476326184,
    subsample=0.9518670107980847,
    colsample_bytree=0.975446470175863,
    min_child_weight=3,
    reg_alpha=0.39932030491890924,
    reg_lambda=0.1578535051938761,
    gamma=0.3405484221326213,
    objective="reg:squarederror",
    random_state=RANDOM_STATE,
    n_jobs=-1
)

print("\nTraining FINAL House XGBoost model on FULL DATASET...")
model_xgb_house.fit(X, y)

# ========================================
# 6) Save Model + Preprocessing INTO /models/
# ========================================
joblib.dump(model_xgb_house, os.path.join(MODELS_DIR, "model_xgb_house.pkl"))
joblib.dump(fitted_stage3, os.path.join(MODELS_DIR, "stage3_pipeline_house.pkl"))

print("\nSaved:")
print(f"- {MODELS_DIR}model_xgb_house.pkl")
print(f"- {MODELS_DIR}stage3_pipeline_house.pkl")
print("\nFINAL HOUSE MODEL READY FOR PRODUCTION.")


Loaded 9682 house samples.
Using 25 reduced features.

Training FINAL House XGBoost model on FULL DATASET...

Saved:
- ../models/model_xgb_house.pkl
- ../models/stage3_pipeline_house.pkl

FINAL HOUSE MODEL READY FOR PRODUCTION.


### Apartments

In [17]:
# ============================================================
# FINAL PRODUCTION MODEL — APARTMENT-ONLY XGBOOST (FULL DATA)
# Saves to ./models/  => model_xgb_apartment.pkl + stage3_pipeline_apartment.pkl
# ============================================================

import pandas as pd
import numpy as np
import joblib
import os
from xgboost import XGBRegressor
import warnings
warnings.filterwarnings("ignore")

# ========================================
# CONFIG
# ========================================
DATA_PATH = "../data/pre_processed/pre_processed_data_for_kaggle.csv"
RANDOM_STATE = 42
MODELS_DIR = "../models/"

# Create directory if not exists
os.makedirs(MODELS_DIR, exist_ok=True)

# ========================================
# REDUCED FEATURE LIST
# ========================================
REDUCED_FEATURES = [
    "area",
    "postal_code_te_price",
    "locality_te_price",
    "bathrooms",
    "rooms",
    "primary_energy_consumption",
    "state",
    "province_benchmark_m2",
    "postal_code",
    "region_benchmark_m2",
    "property_subtype_te_price",
    "apt_avg_m2_region",
    "toilets",
    "property_type_te_price",
    "median_income",
    "build_year",
    "house_avg_m2_province",
    "has_garage",
    "apt_avg_m2_province",
    "has_garden",
    "has_terrace",
    "facades_number",
    "has_swimming_pool",
    "house_avg_m2_region",
    "has_equipped_kitchen",
]

# ========================================
# 1) LOAD DATA — APARTMENTS ONLY
# ========================================
df = pd.read_csv(DATA_PATH)
df = df[df["property_type"] == "Apartment"].reset_index(drop=True)

print(f"Loaded {len(df)} apartment samples.")

# ========================================
# 2) FIT STAGE 3 ON FULL APARTMENT DATA
# ========================================
fitted_stage3 = fit_stage3(df)

# ========================================
# 3) TRANSFORM FULL DATASET
# ========================================
df_s3 = transform_stage3(df, fitted_stage3)

# ========================================
# 4) Final X and y — only reduced features
# ========================================
available_features = [f for f in REDUCED_FEATURES if f in df_s3.columns]

X = df_s3[available_features].copy()
y = df_s3["price"].copy()

print(f"Using {len(available_features)} reduced features.")

# ========================================
# 5) Train FINAL Production Model (No CV)
# ========================================
model_xgb_apartment = XGBRegressor(
    n_estimators=736,
    max_depth=14,
    learning_rate=0.07398859493788239,
    subsample=0.9757468164178891,
    colsample_bytree=0.550230128735367,
    min_child_weight=1,
    reg_alpha=2.829601360344222,
    reg_lambda=3.189797401823491,
    gamma=0.869368954979705,
    objective="reg:squarederror",
    random_state=RANDOM_STATE,
    n_jobs=-1
)

print("\nTraining FINAL Apartment XGBoost model on FULL DATASET...")
model_xgb_apartment.fit(X, y)

# ========================================
# 6) Save Model + Preprocessing INTO /models/
# ========================================
joblib.dump(model_xgb_apartment, os.path.join(MODELS_DIR, "model_xgb_apartment.pkl"))
joblib.dump(fitted_stage3, os.path.join(MODELS_DIR, "stage3_pipeline_apartment.pkl"))

print("\nSaved:")
print(f"- {MODELS_DIR}model_xgb_apartment.pkl")
print(f"- {MODELS_DIR}stage3_pipeline_apartment.pkl")
print("\nFINAL APARTMENT MODEL READY FOR PRODUCTION.")


Loaded 11790 apartment samples.
Using 25 reduced features.

Training FINAL Apartment XGBoost model on FULL DATASET...

Saved:
- ../models/model_xgb_apartment.pkl
- ../models/stage3_pipeline_apartment.pkl

FINAL APARTMENT MODEL READY FOR PRODUCTION.


## Test

In [18]:
import pandas as pd
import joblib

# ===============================================
# LOAD MODELS + PIPELINES
# ===============================================
model_house = joblib.load("../models/model_xgb_house.pkl")
stage3_house = joblib.load("../models/stage3_pipeline_house.pkl")

model_apartment = joblib.load("../models/model_xgb_apartment.pkl")
stage3_apartment = joblib.load("../models/stage3_pipeline_apartment.pkl")

# Same reduced feature list used at training
REDUCED_FEATURES = [
    "area",
    "postal_code_te_price",
    "locality_te_price",
    "bathrooms",
    "rooms",
    "primary_energy_consumption",
    "state",
    "province_benchmark_m2",
    "postal_code",
    "region_benchmark_m2",
    "property_subtype_te_price",
    "apt_avg_m2_region",
    "toilets",
    "property_type_te_price",
    "median_income",
    "build_year",
    "house_avg_m2_province",
    "has_garage",
    "apt_avg_m2_province",
    "has_garden",
    "has_terrace",
    "facades_number",
    "has_swimming_pool",
    "house_avg_m2_region",
    "has_equipped_kitchen",
]


# ============================================================
# FUNCTION — Build a DataFrame and run prediction
# ============================================================
def predict_price(property_dict):
    df = pd.DataFrame([property_dict])

    if df.loc[0, "property_type"] == "House":
        df_s3 = transform_stage3(df, stage3_house)
        model = model_house
    else:
        df_s3 = transform_stage3(df, stage3_apartment)
        model = model_apartment

    # Select only available features
    X = df_s3[[f for f in REDUCED_FEATURES if f in df_s3.columns]]

    price_pred = model.predict(X)[0]
    return round(price_pred, 0)


# ============================================================
# TEST INPUT — APARTMENT EXAMPLE
# (Include metadata fields exactly as produced in your pipeline)
# ============================================================
apartment_test = {
    "property_type": "Apartment",
    "area": 85,
    "bathrooms": 1,
    "rooms": 3,
    "primary_energy_consumption": 210,
    "state": 2,
    "postal_code": 1050,
    "toilets": 1,
    "build_year": 2005,
    "has_garage": 0,
    "has_garden": 0,
    "has_terrace": 1,
    "facades_number": 2,
    "has_swimming_pool": 0,
    "has_equipped_kitchen": 1,

    # ===== Metadata (FROM YOUR TABLES) =====
    "postal_code_te_price": 420000,
    "locality_te_price": 415000,
    "province_benchmark_m2": 3500,
    "region_benchmark_m2": 2900,
    "property_subtype_te_price": 400000,
    "apt_avg_m2_region": 3200,
    "property_type_te_price": 390000,
    "median_income": 35000,
    "house_avg_m2_province": 2800,
    "apt_avg_m2_province": 3150,
    "house_avg_m2_region": 2500,
}

print("Apartment predicted price:", predict_price(apartment_test))


# ============================================================
# TEST INPUT — HOUSE EXAMPLE
# ============================================================
house_test = {
    "property_type": "House",
    "area": 165,
    "bathrooms": 2,
    "rooms": 5,
    "primary_energy_consumption": 310,
    "state": 3,
    "postal_code": 1700,
    "toilets": 2,
    "build_year": 1985,
    "has_garage": 1,
    "has_garden": 1,
    "has_terrace": 1,
    "facades_number": 4,
    "has_swimming_pool": 0,
    "has_equipped_kitchen": 1,

    # ===== Metadata =====
    "postal_code_te_price": 550000,
    "locality_te_price": 530000,
    "province_benchmark_m2": 2800,
    "region_benchmark_m2": 2500,
    "property_subtype_te_price": 600000,
    "apt_avg_m2_region": 2400,
    "property_type_te_price": 580000,
    "median_income": 37000,
    "house_avg_m2_province": 3150,
    "apt_avg_m2_province": 2600,
    "house_avg_m2_region": 3000,
}

print("House predicted price:", predict_price(house_test))


Apartment predicted price: 377385.0
House predicted price: 937186.0
