# Preprocessing

In [52]:
import numpy as np
import pandas as pd
import xgboost as xgb
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import MinMaxScaler, PowerTransformer
from sklearn.impute import KNNImputer

# Load data
train = pd.read_csv("C:/Users/fabri/Desktop/uni/MSDS/2024-2025/Semester 2/Advanced Analytics in a Big Data World/Project/Assignment 1/Data/train.csv")
test = pd.read_csv("C:/Users/fabri/Desktop/uni/MSDS/2024-2025/Semester 2/Advanced Analytics in a Big Data World/Project/Assignment 1/Data/test.csv")

In [53]:
# Mapping dwelling types to categories
dwelling_map = {
    **{dw: "Apartments" for dw in [
        "Appartement", "Appartementengebouw", "Assistentie-appartement", "Bel-Étage", "Dakappartement",
        "Duplex", "Gelijkvloers app.", "Loft", "Penthouse", "Serviceflat", "Studio", "Studio met slaaphoek"
    ]},
    **{dw: "Houses" for dw in [
        "Arbeiderswoning", "Boerderij", "Bungalow", "Burgerswoning", "Cottage", "Eengezinswoning",
        "Fermette", "Herenhuis", "Herenwoning", "Hoekwoning", "Hoeve", "Kangoeroewoning", "Koppelvilla",
        "Koppelwoning", "Moderne villa", "Pastorijwoning", "Rijwoning", "Villa", "Villa-landhuis", "Woning"
    ]},
    **{dw: "Other" for dw in [
        "Andere", "Buitenverblijf", "Gemengd gebruik", "Kasteel", "Uitzonderlijke woning",
        "Vakantiewoning", "Woonboot"
    ]}
}
train["Category"] = train["subtype"].map(dwelling_map).fillna("Other")
test["Category"] = test["subtype"].map(dwelling_map).fillna("Other")

In [54]:
# Classify location based on postcode
def classify_location(postcode):
    if pd.isna(postcode):
        return "Unknown"  # Handle missing postcodes
    
    postcode = str(postcode).zfill(4)  # Ensure it's a 4-digit string
    first_digit, second_digit, third_digit = map(int, postcode[:3])  # Convert only once
    
    if first_digit in {1, 2, 3, 4, 9} and second_digit in {0, 1, 2, 3, 4} and third_digit in {0, 1}:
        return "Urban"
    elif (
        (first_digit in {1, 2, 3, 4, 9} and second_digit in {5, 6, 7}) or
        (first_digit in {5, 6, 7, 8} and second_digit in {0, 1, 2, 3, 4}) or
        (first_digit in {5, 6, 7, 8} and second_digit in {5, 6} and third_digit in {0, 1, 2, 3, 4, 5})
    ):
        return "Suburban"
    else:
        return "Rural"

train["Location_Type"] = train["postcode"].apply(classify_location)
test["Location_Type"] = test["postcode"].apply(classify_location)

In [55]:
# Energy efficiency classification
energy_bins = np.array([-float("inf"), 100, 200, 300, 400, 500, float("inf")])
energy_labels = np.array(["A", "B", "C", "D", "E", "F"])

def classify_energy_label(energy):
    if pd.isna(energy):
        return "Unknown"
    return energy_labels[np.searchsorted(energy_bins, energy, side="right") - 1]

train["energy_label"] = train["energy_value"].apply(classify_energy_label)
test["energy_label"] = test["energy_value"].apply(classify_energy_label)

In [56]:
# Impute missing numerical values with category-wise median
imputed_cols = ["area", "energy_value"]
train[imputed_cols] = train.groupby("Category")[imputed_cols].transform(lambda x: x.fillna(x.median()))
test[imputed_cols] = test.groupby("Category")[imputed_cols].transform(lambda x: x.fillna(x.median()))

In [57]:
# KNN Imputation for lat/lon
imputer = KNNImputer(n_neighbors=5)
train[["lat", "lon"]] = imputer.fit_transform(train[["lat", "lon"]])
test[["lat", "lon"]] = imputer.transform(test[["lat", "lon"]])

In [58]:
# Handle outliers with Winsorization
def winsorize(series, lower_quantile=0.01, upper_quantile=0.99):
    lower_bound = series.quantile(lower_quantile)
    upper_bound = series.quantile(upper_quantile)
    return np.clip(series, lower_bound, upper_bound)

outlier_cols = ["area", "energy_value", "foto_amount", "price"]
for col in outlier_cols:
    train[col] = winsorize(train[col])
    if col != "price":  
        test[col] = winsorize(test[col])

In [59]:
# One-hot encoding for categorical variables
train = pd.get_dummies(train, columns=["Location_Type", "energy_label"], drop_first=True)
test = pd.get_dummies(test, columns=["Location_Type", "energy_label"], drop_first=True)

# Ordinal encoding for province
train["province"] = train["province"].astype("category").cat.codes
test["province"] = test["province"].astype("category").cat.codes

# Ordinal encoding for Category
train["Category"] = train["Category"].astype("category").cat.codes
test["Category"] = test["Category"].astype("category").cat.codes

In [60]:
# Normalize Numerical Features
scaler_features = MinMaxScaler()
power_transformer = PowerTransformer(method="yeo-johnson")

continuous_cols_train = ["area", "energy_value", "foto_amount"]
continuous_cols_price = ["price"]

scaler_features.fit(train[continuous_cols_train])
train[continuous_cols_train] = scaler_features.transform(train[continuous_cols_train])
test[continuous_cols_train] = scaler_features.transform(test[continuous_cols_train])

train["price"] = power_transformer.fit_transform(train[["price"]])

In [61]:
# Drop unnecessary columns
train = train.drop(columns=["id", "advertiser", "subtype", "sticker", "price_drop_date"])
test = test.drop(columns=["advertiser", "subtype", "sticker", "price_drop_date"])

# Model Training

In [62]:
# Hyperparameter Tuning with Optuna
def objective(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        "lambda": trial.suggest_float("lambda", 1, 10),
        "alpha": trial.suggest_float("alpha", 1, 10),
        "objective": "reg:pseudohubererror",
        "random_state": 42,
    }
    model = xgb.XGBRegressor(**params)
    model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=0)
    y_pred = model.predict(X_valid)
    return mean_absolute_error(y_valid, y_pred)

X = train.drop(columns=["price"])
y = train["price"]
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=100)
best_params = study.best_params
print("Best parameters:", best_params)


[I 2025-03-16 16:36:30,229] A new study created in memory with name: no-name-8e51e5f1-de4f-42a1-9d9b-b2db1f4fdd96


[I 2025-03-16 16:36:33,681] Trial 0 finished with value: 0.4157024536959632 and parameters: {'n_estimators': 676, 'max_depth': 7, 'learning_rate': 0.2549782268318953, 'subsample': 0.6549769713878331, 'colsample_bytree': 0.6541525956839637, 'lambda': 6.689515472092158, 'alpha': 9.305701779989873}. Best is trial 0 with value: 0.4157024536959632.
[I 2025-03-16 16:36:35,187] Trial 1 finished with value: 0.4068706069975338 and parameters: {'n_estimators': 273, 'max_depth': 7, 'learning_rate': 0.10932371633506967, 'subsample': 0.7385722579472314, 'colsample_bytree': 0.9464676955053899, 'lambda': 6.880211015956981, 'alpha': 4.569902859714276}. Best is trial 1 with value: 0.4068706069975338.
[I 2025-03-16 16:36:38,030] Trial 2 finished with value: 0.41745901569387234 and parameters: {'n_estimators': 495, 'max_depth': 8, 'learning_rate': 0.2215895205261166, 'subsample': 0.689076111943962, 'colsample_bytree': 0.7693836833541697, 'lambda': 4.707556048583692, 'alpha': 5.786328491139965}. Best is t

Best parameters: {'n_estimators': 940, 'max_depth': 11, 'learning_rate': 0.021575363472601142, 'subsample': 0.8346087076429625, 'colsample_bytree': 0.8404657857920002, 'lambda': 3.3033253150594644, 'alpha': 1.0064624232138517}


In [63]:
# Train final model with best hyperparameters
best_model = xgb.XGBRegressor(**best_params)
best_model.fit(X, y)

# Compute Mean Absolute Error on validation set
y_valid_pred = best_model.predict(X_valid)
mae_valid = mean_absolute_error(y_valid, y_valid_pred)
print("Validation MAE:", mae_valid)

Validation MAE: 0.15679908694228814


In [64]:
# Predict on test set
test_features = test.drop(columns=["id", "prediction"], errors="ignore")
test_pred = best_model.predict(test_features)

test_pred = power_transformer.inverse_transform(test_pred.reshape(-1, 1)).flatten() # Reverse Power Transformation on target
test["prediction"] = test_pred
print(test[["id", "prediction"]].head(10))

    id     prediction
0  te0  364547.500000
1  te1  410530.343750
2  te2  240151.046875
3  te3  175177.015625
4  te4  331729.250000
5  te5  576378.250000
6  te6  331449.312500
7  te7  744185.187500
8  te8  424458.218750
9  te9  465328.093750




In [66]:
# Compute Prediction Intervals
lower_bound = test_pred * 0.90 # 90% of the prediction
upper_bound = test_pred * 1.10 # 110% of the prediction

# Submission
submission = pd.DataFrame({
    "ID": test["id"],
    "LOWER": lower_bound,
    "UPPER": upper_bound,
    "PRED": test_pred
})
submission.to_csv("submission_xgboost.csv", index=False)