In [1]:
import pandas as pd
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import make_scorer, mean_absolute_error, accuracy_score, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import numpy as np

### XGBoost Model for rental prices

Probamos a entrenar el modelo primero sin escalar los datos y luego lo repetidos con los datos escalados para ver si de verdad mejoran los resultados

In [2]:
# Cogemos los datos que estan limpios y con las con todas las columnas numericas
df = pd.read_csv("../data/working_data/idealista_data_rental_numerical_20250802.csv")
df

Unnamed: 0,price,propertyType,operation,size,rooms,bathrooms,municipality,district,neighborhood,latitude,longitude,status,newDevelopment,priceByArea,floor,hasLift,hasParkingSpace,isParkingSpaceIncludedInPrice
0,1500,0,0,95.0,2.0,2.0,0,0,0,36.721475,-4.430273,0,0,16.0,3.0,1.0,0,0
1,1000,0,0,30.0,1.0,1.0,0,0,1,36.720219,-4.422324,0,0,33.0,4.0,1.0,0,0
2,1200,0,0,100.0,3.0,1.0,0,0,2,36.722701,-4.431447,0,0,12.0,2.0,1.0,0,0
3,2000,0,0,140.0,3.0,2.0,0,0,1,36.724211,-4.417712,0,0,14.0,2.0,0.0,0,0
4,1300,0,0,75.0,2.0,1.0,0,0,3,36.729643,-4.415604,0,0,17.0,3.0,1.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2525,1100,6,0,114.0,0.0,2.0,0,1,49,36.741982,-4.483157,0,0,10.0,0.0,0.0,0,0
2526,5000,6,0,125.0,0.0,2.0,0,0,1,36.720683,-4.420209,0,0,40.0,0.0,0.0,0,0
2527,4000,6,0,110.0,0.0,1.0,3,24,247,36.510893,-4.892511,0,0,36.0,0.0,0.0,0,0
2528,4500,6,0,198.0,0.0,2.0,0,0,1,36.720616,-4.421397,0,0,23.0,0.0,0.0,0,0


In [3]:
X = df.drop(columns=["price"])
Y = df["price"]
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [7]:
mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)
r2_scorer = make_scorer(r2_score)

In [8]:
# Parámetros a probar (rango medio, no extremo)
param_grid = {
    'n_estimators': [100, 200, 300],          # Número de árboles
    'max_depth': [3, 5, 7],                   # Profundidad del árbol
    'learning_rate': [0.05, 0.1, 0.2],        # Tamaño de paso
    'subsample': [0.8, 1.0],                  # Proporción de datos para cada árbol
    'colsample_bytree': [0.8, 1.0]            # Proporción de features para cada árbol
}

# Modelo base
model = XGBRegressor(random_state=42, objective='reg:squarederror', verbosity=0)

# Validación cruzada
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# GridSearchCV
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=cv,
    scoring={'MAE': mae_scorer, 'R2': r2_scorer},
    refit='MAE',   # Ajusta el mejor modelo según el menor MAE
    verbose=2,
    n_jobs=-1
)

# Entrenamos
grid_search.fit(X_train, Y_train)

# Resultados
print("\nMejores parámetros encontrados:")
print(grid_search.best_params_)
print("Mejor MAE (negativo):", grid_search.best_score_)

# También vemos el R2 correspondiente a esos parámetros
best_index = grid_search.best_index_
print("R2 en validación cruzada:", grid_search.cv_results_['mean_test_R2'][best_index])

# Evaluamos en test
best_model = grid_search.best_estimator_
Y_pred = best_model.predict(X_test)

test_mae = mean_absolute_error(Y_test, Y_pred)
test_r2 = r2_score(Y_test, Y_pred)

print("\nResultados en conjunto de test:")
print(f"MAE test: {test_mae:.2f}")
print(f"R² test: {test_r2:.2f}")


Fitting 5 folds for each of 108 candidates, totalling 540 fits

Mejores parámetros encontrados:
{'colsample_bytree': 1.0, 'learning_rate': 0.05, 'max_depth': 7, 'n_estimators': 300, 'subsample': 0.8}
Mejor MAE (negativo): -65.09673767089843
R2 en validación cruzada: 0.9831717729568481

Resultados en conjunto de test:
MAE test: 54.12
R² test: 0.99


In [9]:
import joblib

joblib.dump(best_model, "../models/XGB_rent_model_20250802.pkl")

['../models/XGB_rent_model_20250802.pkl']

Ahora vemos el modelo con los datos escalados

In [10]:
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)
X_train_scaled, X_test_scaled, Y_train_scaled, Y_test_scaled = train_test_split(X_scaled, Y, test_size=0.2, random_state=42)

In [11]:
# Parámetros a probar (rango medio, no extremo)
param_grid = {
    'n_estimators': [100, 200, 300],          # Número de árboles
    'max_depth': [3, 5, 7],                   # Profundidad del árbol
    'learning_rate': [0.05, 0.1, 0.2],        # Tamaño de paso
    'subsample': [0.8, 1.0],                  # Proporción de datos para cada árbol
    'colsample_bytree': [0.8, 1.0]            # Proporción de features para cada árbol
}

# Modelo base
model = XGBRegressor(random_state=42, objective='reg:squarederror', verbosity=0)

# Validación cruzada
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# GridSearchCV
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=cv,
    scoring={'MAE': mae_scorer, 'R2': r2_scorer},
    refit='MAE',   # Ajusta el mejor modelo según el menor MAE
    verbose=2,
    n_jobs=-1
)

# Entrenamos
grid_search.fit(X_train_scaled, Y_train_scaled)

# Resultados
print("\nMejores parámetros encontrados:")
print(grid_search.best_params_)
print("Mejor MAE (negativo):", grid_search.best_score_)

# También vemos el R2 correspondiente a esos parámetros
best_index = grid_search.best_index_
print("R2 en validación cruzada:", grid_search.cv_results_['mean_test_R2'][best_index])

# Evaluamos en test
best_model = grid_search.best_estimator_
Y_pred = best_model.predict(X_test_scaled)

test_mae = mean_absolute_error(Y_test_scaled, Y_pred)
test_r2 = r2_score(Y_test_scaled, Y_pred)

print("\nResultados en conjunto de test:")
print(f"MAE test: {test_mae:.2f}")
print(f"R² test: {test_r2:.2f}")


Fitting 5 folds for each of 108 candidates, totalling 540 fits

Mejores parámetros encontrados:
{'colsample_bytree': 1.0, 'learning_rate': 0.05, 'max_depth': 7, 'n_estimators': 300, 'subsample': 0.8}
Mejor MAE (negativo): -65.13280410766602
R2 en validación cruzada: 0.9831630945205688

Resultados en conjunto de test:
MAE test: 53.97
R² test: 0.99


In [12]:
joblib.dump(best_model, "../models/XGB_rent_model_scaled_20250802.pkl")

['../models/XGB_rent_model_scaled_20250802.pkl']

### Linear Regression Model for sales prices

In [13]:
df = pd.read_csv("../data/working_data/idealista_data_sales_numerical_20250802.csv")
df

Unnamed: 0,price,propertyType,operation,size,rooms,bathrooms,municipality,district,neighborhood,latitude,longitude,status,newDevelopment,priceByArea,floor,hasLift,hasParkingSpace,isParkingSpaceIncludedInPrice
0,985000,3,1,245.0,4.0,3.0,0,2,47,36.726046,-4.392859,0,0,4020.0,0.0,0.0,1,1
1,265000,0,1,66.0,2.0,1.0,0,3,20,36.703176,-4.438555,0,0,4015.0,8.0,1.0,0,0
2,275000,0,1,82.0,2.0,2.0,0,0,39,36.731530,-4.414621,0,0,3354.0,1.0,1.0,0,0
3,1600000,0,1,174.0,3.0,2.0,0,2,7,36.721393,-4.405125,0,0,9195.0,1.0,1.0,1,1
4,550000,0,1,143.0,3.0,2.0,0,0,1,36.720695,-4.421476,0,0,3846.0,3.0,0.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2654,9000,8,1,12.5,0.0,0.0,12,43,103,36.508218,-4.725346,3,0,300.0,-1.0,1.0,0,0
2655,22000,8,1,12.5,0.0,0.0,13,32,86,36.590990,-4.535055,3,0,815.0,0.0,0.0,0,0
2656,8000,8,1,12.5,0.0,0.0,14,0,88,36.631383,-4.502652,3,0,276.0,-1.0,0.0,0,0
2657,9900,8,1,12.5,0.0,0.0,13,34,156,36.581868,-4.551639,3,0,381.0,-1.0,1.0,0,0


In [14]:
X = df.drop(columns=["price"])
Y = df["price"]
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [15]:
# Scorers
mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)
r2_scorer = make_scorer(r2_score)

# Parámetros a probar (rango medio, no extremo)
param_grid = {
    'n_estimators': [100, 200, 300],          # Número de árboles
    'max_depth': [3, 5, 7],                   # Profundidad del árbol
    'learning_rate': [0.05, 0.1, 0.2],        # Tamaño de paso
    'subsample': [0.8, 1.0],                  # Proporción de datos para cada árbol
    'colsample_bytree': [0.8, 1.0]            # Proporción de features para cada árbol
}

# Modelo base
model = XGBRegressor(random_state=42, objective='reg:squarederror', verbosity=0)

# Validación cruzada
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# GridSearchCV
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=cv,
    scoring={'MAE': mae_scorer, 'R2': r2_scorer},
    refit='MAE',   # Ajusta el mejor modelo según el menor MAE
    verbose=2,
    n_jobs=-1
)

# Entrenamos
grid_search.fit(X_train, Y_train)

# Resultados
print("\nMejores parámetros encontrados:")
print(grid_search.best_params_)
print("Mejor MAE (negativo):", grid_search.best_score_)

# También vemos el R2 correspondiente a esos parámetros
best_index = grid_search.best_index_
print("R2 en validación cruzada:", grid_search.cv_results_['mean_test_R2'][best_index])

# Evaluamos en test
best_model = grid_search.best_estimator_
Y_pred = best_model.predict(X_test)

test_mae = mean_absolute_error(Y_test, Y_pred)
test_r2 = r2_score(Y_test, Y_pred)

print("\nResultados en conjunto de test:")
print(f"MAE test: {test_mae:.2f}")
print(f"R² test: {test_r2:.2f}")


Fitting 5 folds for each of 108 candidates, totalling 540 fits



Mejores parámetros encontrados:
{'colsample_bytree': 1.0, 'learning_rate': 0.05, 'max_depth': 7, 'n_estimators': 300, 'subsample': 0.8}
Mejor MAE (negativo): -15741.211328125
R2 en validación cruzada: 0.9917310833930969

Resultados en conjunto de test:
MAE test: 15764.19
R² test: 0.99


In [16]:
joblib.dump(best_model, "../models/XGB_sales_model_20250802.pkl")

['../models/XGB_sales_model_20250802.pkl']