# XGBoost regressor model

In [None]:
#import sys
#!brew install libomp && {sys.executable} -m pip install --no-cache-dir xgboost


## 1) X selection and train-test splitting

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np

# Step 1: Load CSV
df = pd.read_csv("../data/raw/filtered_final_cleaned_data.csv")

def prepare_features_and_target(df):
    """
    Drops unwanted columns and returns X_train, X_test, y_train, y_test.
    """
    # Columns to drop
    columns_to_drop = [
        "price (€)",
        "property_ID",
        "locality_name",
        "postal_code",
        # Note: "price_per_square_meter" is not included in current CSV
    ]
    
    # 1. Create X and y
    X = df.drop(columns=columns_to_drop)
    y = df["price (€)"]
    
    # 2. Train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.2,
        random_state=777
    )
    
    return X_train, X_test, y_train, y_test

import numpy as np

def log_transform_target(y_train, y_test):
    """
    Apply log1p transform to target and return transformed versions.
    Also returns the inverse_transform function needed after prediction.
    """
    
    # transform (log1p handles zero safely)
    y_train_log = np.log1p(y_train)
    y_test_log = np.log1p(y_test)

    # define inverse transform
    def inverse_log_transform(y_pred_log):
        return np.expm1(y_pred_log)

    return y_train_log, y_test_log, inverse_log_transform


## 2) Imputation for numerical and categorical columns to handle NaN

In [10]:
from sklearn.impute import SimpleImputer
import pandas as pd

# Function to impute numerical columns
def impute_numeric_columns(X_train, X_test):
    """
    Imputes numerical columns in X_train and X_test using the mean of X_train.
    Returns the updated X_train, X_test, and the list of numerical columns.
    """

    # Select numerical columns automatically
    num_cols = X_train.select_dtypes(include=["int64", "float64"]).columns

    # Create imputer (mean strategy)
    imputer = SimpleImputer(strategy="mean")

    # Fit on training data
    X_train[num_cols] = imputer.fit_transform(X_train[num_cols])

    # Transform test data using SAME means
    X_test[num_cols] = imputer.transform(X_test[num_cols])

    return X_train, X_test, num_cols

# Function to impute categorical column "state of building"
def impute_categorical_state(X_train, X_test, column="state_of_building"):
    """
    Fills missing values in the state_of_building column using the 'unknown' category.
    """
    X_train[column] = X_train[column].fillna("unknown")
    X_test[column] = X_test[column].fillna("unknown")
    
    return X_train, X_test



## 3) Encoding

In [11]:
from sklearn.preprocessing import LabelEncoder

def label_encode_columns(X_train, X_test, columns):
    """
    Label-encodes multiple categorical columns using separate LabelEncoders.
    No data leakage: fits only on X_train.
    Returns transformed X_train, X_test, and a dictionary of encoders.
    """
    encoders = {}
    
    for col in columns:
        le = LabelEncoder()
        
        X_train[col + "_le"] = le.fit_transform(X_train[col])
        X_test[col + "_le"] = le.transform(X_test[col])
        
        encoders[col] = le  # store encoder
        
    return X_train, X_test, encoders

from sklearn.preprocessing import OrdinalEncoder

def ordinal_encode_state(X_train, X_test, column="state_of_building"):
    """
    Applies Ordinal Encoding to the state_of_building column using a fixed custom order.
    Fits only on X_train (no leakage).
    """
    
    state_order = [[
        "unknown",
        "To demolish",
        "Under construction",
        "To restore",
        "To renovate",
        "To be renovated",
        "Normal",
        "Fully renovated",
        "Excellent",
        "New"
    ]]
    
    oe = OrdinalEncoder(categories=state_order)
    
    train_encoded = oe.fit_transform(X_train[[column]])
    test_encoded = oe.transform(X_test[[column]])
    
    X_train[column + "_oe"] = train_encoded.flatten()
    X_test[column + "_oe"] = test_encoded.flatten()
    
    return X_train, X_test, oe



## 4) Final assembly

In [12]:
# Step 1: Load CSV
#df = pd.read_csv("../data/raw/filtered_final_cleaned_data.csv")

# Step 2: Prepare X and y
X_train, X_test, y_train, y_test = prepare_features_and_target(df)

# Step 3: Apply log transform 
y_train_log, y_test_log, inverse = log_transform_target(y_train, y_test)

# Step 4: Impute numerical values
X_train, X_test, num_cols = impute_numeric_columns(X_train, X_test)

# Step 5: Impute categorical missing values
X_train, X_test = impute_categorical_state(X_train, X_test)

# Step 6: LabelEncoding for "type", "subtype" and "province" columns
X_train, X_test, le_encoders = label_encode_columns(
    X_train,
    X_test,
    columns=["type", "province", "subtype"]
)

# Step 6: OrdinalEncoding for "state_of_building" column
X_train, X_test, state_encoder = ordinal_encode_state(X_train, X_test, column="state_of_building")

# Step 7: Assemble everything together and drop the original columns from before encoding
X_train_final = X_train.drop(columns=["type", "subtype", "state_of_building", "province"])
X_test_final  = X_test.drop(columns=["type", "subtype", "state_of_building", "province"])

# y_train & y_test remain unchanged
y_train = y_train
y_test = y_test

# Optional: quick safety check
print("X_train_final shape:", X_train_final.shape)
print("X_test_final shape: ", X_test_final.shape)


X_train_final shape: (11636, 14)
X_test_final shape:  (2909, 14)


## 5) Training and evalution

In [13]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Train XGBoost model
def train_xgboost(X_train, y_train):
    model = XGBRegressor(
        n_estimators=600,
        learning_rate=0.05,
        max_depth=8,
        subsample=1.0,
        colsample_bytree=0.6,
        random_state=888,
        n_jobs=-1,
        min_child_weight=5
    )
    model.fit(X_train, y_train)
    return model



def evaluate_model(model, 
                   X_train, y_train_log, y_train_real,
                   X_test,  y_test_log,  y_test_real, inverse_):
    
    # Predict in log space
    y_pred_train_log = model.predict(X_train)
    y_pred_test_log = model.predict(X_test)
    
    # Convert back to real price scale
    y_pred_train = inverse_(y_pred_train_log)
    y_pred_test = inverse_(y_pred_test_log)

    metrics = {
        "train": {
            "MAE": mean_absolute_error(y_train_real, y_pred_train),
            "RMSE": np.sqrt(mean_squared_error(y_train_real, y_pred_train)),
            "R2": r2_score(y_train_real, y_pred_train)
        },
        "test": {
            "MAE": mean_absolute_error(y_test_real, y_pred_test),
            "RMSE": np.sqrt(mean_squared_error(y_test_real, y_pred_test)),
            "R2": r2_score(y_test_real, y_pred_test)
        }
    }
    return metrics

# Train XGBoost
model_xgb = train_xgboost(X_train_final, y_train_log)

# Evaluate
results_xgb = evaluate_model(
    model_xgb,
    X_train_final, y_train_log, y_train,
    X_test_final, y_test_log,y_test,
    inverse
)

results_xgb



{'train': {'MAE': 42328.98828125,
  'RMSE': np.float64(72831.99648506142),
  'R2': 0.9167090654373169},
 'test': {'MAE': 74293.2734375,
  'RMSE': np.float64(218914.8510266035),
  'R2': 0.5255386829376221}}

## Parameter tuning XGBoost with RandomizedSearchCV

In [None]:
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

# Parameter space
param_dist = {
    "n_estimators": [200, 400, 600, 800],
    "max_depth": [3, 4, 5, 6, 8, 10],
    "learning_rate": [0.01, 0.05, 0.1, 0.2],
    "subsample": [0.6, 0.8, 1.0],
    "colsample_bytree": [0.6, 0.8, 1.0],
    "min_child_weight": [1, 3, 5, 7]
}

xgb = XGBRegressor(
    objective="reg:squarederror",
    random_state=42,
    n_jobs=-1
)

search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_dist,
    n_iter=25,
    cv=3,
    scoring="r2",
    verbose=1,
    n_jobs=-1
)

search.fit(X_train_final, y_train)

print("Best R2:", search.best_score_)
print("Best params:", search.best_params_)
best_xgb = search.best_estimator_


Fitting 3 folds for each of 25 candidates, totalling 75 fits
Best R2: 0.69801926612854
Best params: {'subsample': 1.0, 'n_estimators': 200, 'min_child_weight': 5, 'max_depth': 4, 'learning_rate': 0.1, 'colsample_bytree': 0.8}


Conclusion: the XGBoost model is overfitting: it's underperforming on the test set.   
Ways to make improvements to the model:   
* Regularization techniques are needed to get better results.
* Adding the locality and/or postal code parameter could help.
* Use different encoders for province, subtype and type (use OneHotEncoder and TargetEncoder instead).

Article to further explore regularization:
https://medium.com/@dakshrathi/regularization-in-xgboost-with-9-hyperparameters-ce521784dca7