# Predicción del precio de la vivienda mediante aprendizaje automático

## Trabajo Final De Master

## Universidad de Valladolid

#### Bungisa Beto Bibeyi

### Desarrollo de un Modelo de Predicción para el Precio de Viviendas

### Librerias

In [75]:
import os
import json
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

In [76]:
# Modulos
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [77]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [78]:
import re

In [79]:
from sklearn.preprocessing import LabelEncoder

In [80]:
from sklearn.base import BaseEstimator, TransformerMixin

In [81]:
import joblib
from datetime import datetime

In [82]:
from sklearn.model_selection import train_test_split, cross_val_score,StratifiedKFold

In [83]:
from sklearn.linear_model import LinearRegression

In [84]:
from sklearn.ensemble import RandomForestRegressor

In [85]:
from sklearn.tree import DecisionTreeRegressor

In [86]:
from sklearn.model_selection import GridSearchCV

In [None]:
pip install xgboost

In [None]:
from xgboost import XGBRegressor

### FUNCIONES

In [90]:
def crear_modelo(X, y,classifiers,preprocessor):
    """
    Evaluar varios clasificadores en un conjunto de datos.
    """
    results = []
    for name, model in classifiers.items():

        print(f"Iniciar entrenamiento y evaluacion modelo '{name}'")
        # Crear el pipeline
        pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('classifiers', model)
        ])

        try:

            # Evaluar el modelo usando validación cruzada
            scores_MSE = cross_val_score(pipeline, X, y, cv=5, scoring='neg_mean_squared_error')
            scores_RMSE = cross_val_score(pipeline, X, y, cv=5, scoring='neg_mean_squared_error')
            scores_R2 = cross_val_score(pipeline, X, y, cv=5, scoring='r2')

            # cambiar el signo
            scores_model_MSE = np.sqrt(-scores_MSE)
            mean_score_MSE = scores_model_MSE.mean()
            std_MSE = scores_model_MSE.std()
            
            scores_model_RMSE = np.sqrt(-scores_RMSE)
            mean_score_RMSE = scores_model_RMSE.mean()
            std_RMSE = scores_model_RMSE.std()
            
            
            
       
            mean_score_R2 = np.mean(scores_R2)
            std_R2 =np.std(scores_R2)

            print(f'{name} - Mean score MSE: {mean_score_MSE:.4f}')
            print(f'{name} - score MSE: {scores_model_RMSE}')
            print(f'{name} - Standard deviation MSE: {std_MSE}')
            
            print(f'{name} - Mean score RMSE: {mean_score_RMSE:.4f}')
            print(f'{name} - score RMSE: {scores_model_RMSE}')
            print(f'{name} - Standard deviation RMSE: {std_RMSE}')
            
            
            print(f'{name} - Mean score R2: {mean_score_R2:.4f}')
            print(f'{name} - score R2: {scores_R2}')
            print(f'{name} - Standard deviation R2: {std_R2}')
            
            pipeline.fit(X,y)

            timestamp = datetime.now().strftime('%Y%m%d%H%M%S')
            filename = f'modelos/{name}-{timestamp}.pkl'
            #filename = f'modelos/pruebaReg_1.pkl'

            # Guardar el pipeline entrenado
            joblib.dump(pipeline, filename)
            print(f"Modelo guardado con éxito en '{filename}'")

            results.append({
                'Modelo': name,
                'ruta':filename,
                'score MSE':scores_MSE,
                'Mean Negative MSE': mean_score_MSE,
                'Standard Deviation MSE': std_MSE,
                'score RMSE':scores_RMSE,
                'Mean Negative RMSE': mean_score_RMSE,
                'Standard Deviation RMSE': std_RMSE,
                'score R2':scores_R2,
                'Mean Negative R2': mean_score_R2,
                'Standard Deviation R2': std_R2
            })
            
            results_df = pd.DataFrame(results)
            results_df.to_csv(f'resultados/resultados_modelos_{timestamp}.csv', index=False)

            print(f"Fin entrenamiento y evaluacion modelo '{name}'")

        except ValueError as e:
            print(f'Error durante la validación cruzada: {e}')
        except Exception as e:
            print(f'Error inesperado: {e}')


### Formato conjunto de datos (CSV)

### Descripción de los campos

### Lectura de los datos

In [91]:
viviendas_df = pd.read_csv('datos_procesados/viviendas_procesadas_df.csv')

### Exploracion inicial de los datos

La estructura del fichero json leido es heterogeneos o variable por lo que los objetos pueden tener diferentes conjuntos
claves o variar en los tipos de valores asociados a estas claves.  
A la hora de cargar los datos se rellenan con valores nulos(NaN) los campos que no estan presentes en todos los objetos.  

En esta fase se realizar una observacion inicial sobre el conjunto de datos donde se puede determinar lo siguiente:

 - El numero de registros y columnas que tiene el conjunto de datos
 - Como se presentan los datos y si se han cargado correctamente
 - Renombrar el nombre de las columnas si se considera necesario para mejorar el analisis del conjunto de datos

In [92]:
viviendas_df.shape

(3256, 17)

##### Observaciones:
 - Podemos ver que el conjunto de datos cargado consta de 4517 registros y 23 columnas

#### Información del conjunto de datos

In [93]:
viviendas_df[viviendas_df.columns[0:18]].head()

Unnamed: 0,price,propertyType,size,rooms,bathrooms,municipality,latitude,longitude,status,priceByArea,hasParkingSpace,isParkingSpaceIncludedInPrice,typology,subtitle,floor,exterior,hasLift
0,795000.0,flat,201.0,2,2,madrid,40.444223,-3.642674,newdevelopment,3955.0,1,1,flat,concepción,intermedio,1,1
1,330000.0,flat,65.0,1,1,madrid,40.419601,-3.704022,good,5077.0,0,0,flat,sol,intermedio,0,1
2,170000.0,chalet,110.0,3,1,campo real,40.339953,-3.382293,good,1545.0,0,0,chalet,campo real,alto,1,0
3,945000.0,penthouse,185.0,3,3,madrid,40.436027,-3.695607,good,5108.0,0,0,flat,almagro,mediano,1,1
4,450000.0,flat,110.0,3,2,villaviciosa de odón,40.362803,-3.914905,renew,4091.0,1,1,flat,centro,intermedio,1,1


#### Visión general de la estructura de datos

In [94]:
viviendas_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3256 entries, 0 to 3255
Data columns (total 17 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   price                          3256 non-null   float64
 1   propertyType                   3256 non-null   object 
 2   size                           3256 non-null   float64
 3   rooms                          3256 non-null   int64  
 4   bathrooms                      3256 non-null   int64  
 5   municipality                   3256 non-null   object 
 6   latitude                       3256 non-null   float64
 7   longitude                      3256 non-null   float64
 8   status                         3256 non-null   object 
 9   priceByArea                    3256 non-null   float64
 10  hasParkingSpace                3256 non-null   int64  
 11  isParkingSpaceIncludedInPrice  3256 non-null   int64  
 12  typology                       3256 non-null   o

Se procede a separar el conjuntos de datos en variables categorico y numericos

Los modelos de aprendizaje automatico solo pueden procesar valores numericos por lo que cuando en conjunto de datos se tiene variables categoricas es necesario realizar un proceso de transformacion de esos valores textuales en numeros.
Embedins , label encoding(puede ser problematico con cierto modelo de machine learning)

### Separar variables categoricas y numericas

In [96]:
columnas_numericas_df = viviendas_df.select_dtypes(include=[np.number])
columnas_numericas = columnas_numericas_df.drop(['price'],axis=1).columns

columnas_categoricas_df = viviendas_df.select_dtypes(exclude=[np.number])
columnas_categoricas = columnas_categoricas_df.columns

### Procesamiento

In [97]:
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), columnas_categoricas),
        ('num', StandardScaler(), columnas_numericas)
    ]
)

### Clasificadores

In [98]:
## Clasificadores
classifiersLinearRegression = {
    'LinearRegression': LinearRegression()
}
classifiersRandomForestRegressor = {
    'RandomForestRegressor':RandomForestRegressor(n_estimators=10, random_state=42)
}

classifiersDecisionTreeRegressor = {
   'DecisionTreeRegressor':DecisionTreeRegressor(random_state=42)
}

classifiersXGBRegressor = {
   'XGBRegressor':XGBRegressor(objective='reg:squarederror', use_label_encoder=False)
}

### Separar en variables predictoras y variable objetivo

In [100]:
X=viviendas_df.drop(['price'],axis=1)

In [101]:
y= viviendas_df['price']

### Entrenamiento y validación

In [105]:
crear_modelo(X, y,classifiersLinearRegression,preprocessor)

Iniciar entrenamiento y evaluacion modelo 'LinearRegression'
LinearRegression - Mean score MSE: 74485.9330
LinearRegression - score MSE: [ 64867.003387    64592.24609207  62483.95031023  63279.13266551
 117207.33256341]
LinearRegression - Standard deviation MSE: 21378.33852483209
LinearRegression - Mean score RMSE: 74485.9330
LinearRegression - score RMSE: [ 64867.003387    64592.24609207  62483.95031023  63279.13266551
 117207.33256341]
LinearRegression - Standard deviation RMSE: 21378.33852483209
LinearRegression - Mean score R2: 0.7876
LinearRegression - score R2: [0.78124231 0.81470225 0.79304999 0.75209854 0.79686991]
LinearRegression - Standard deviation R2: 0.020740970416607604
Modelo guardado con éxito en 'modelos/LinearRegression-20240825170625.pkl'
Fin entrenamiento y evaluacion modelo 'LinearRegression'


In [106]:
crear_modelo(X, y,classifiersRandomForestRegressor,preprocessor)

Iniciar entrenamiento y evaluacion modelo 'RandomForestRegressor'
RandomForestRegressor - Mean score MSE: 20987.3024
RandomForestRegressor - score MSE: [10763.46183464  9754.72153054  9690.21084311  9182.55867433
 65545.55903587]
RandomForestRegressor - Standard deviation MSE: 22285.02759880139
RandomForestRegressor - Mean score RMSE: 20987.3024
RandomForestRegressor - score RMSE: [10763.46183464  9754.72153054  9690.21084311  9182.55867433
 65545.55903587]
RandomForestRegressor - Standard deviation RMSE: 22285.02759880139
RandomForestRegressor - Mean score R2: 0.9832
RandomForestRegressor - score R2: [0.99397691 0.99577391 0.99502269 0.99477982 0.93647404]
RandomForestRegressor - Standard deviation R2: 0.023372754263656193
Modelo guardado con éxito en 'modelos/RandomForestRegressor-20240825170646.pkl'
Fin entrenamiento y evaluacion modelo 'RandomForestRegressor'


In [107]:
crear_modelo(X, y,classifiersDecisionTreeRegressor,preprocessor)

Iniciar entrenamiento y evaluacion modelo 'DecisionTreeRegressor'
DecisionTreeRegressor - Mean score MSE: 26325.8654
DecisionTreeRegressor - score MSE: [16334.16472332 14415.08618898 15846.36226215 16753.12285418
 68280.59101614]
DecisionTreeRegressor - Standard deviation MSE: 20992.175357577355
DecisionTreeRegressor - Mean score RMSE: 26325.8654
DecisionTreeRegressor - score RMSE: [16334.16472332 14415.08618898 15846.36226215 16753.12285418
 68280.59101614]
DecisionTreeRegressor - Standard deviation RMSE: 20992.175357577355
DecisionTreeRegressor - Mean score R2: 0.9755
DecisionTreeRegressor - score R2: [0.98612894 0.99077123 0.98668971 0.98262401 0.93106191]
DecisionTreeRegressor - Standard deviation R2: 0.02234672048276261
Modelo guardado con éxito en 'modelos/DecisionTreeRegressor-20240825170652.pkl'
Fin entrenamiento y evaluacion modelo 'DecisionTreeRegressor'


In [108]:
crear_modelo(X, y,classifiersXGBRegressor,preprocessor)

Iniciar entrenamiento y evaluacion modelo 'XGBRegressor'


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



XGBRegressor - Mean score MSE: 20167.8083
XGBRegressor - score MSE: [10910.26391947  9982.3402112  10273.18809374 10806.26274167
 58866.98653417]
XGBRegressor - Standard deviation MSE: 19352.593254717263
XGBRegressor - Mean score RMSE: 20167.8083
XGBRegressor - score RMSE: [10910.26391947  9982.3402112  10273.18809374 10806.26274167
 58866.98653417]
XGBRegressor - Standard deviation RMSE: 19352.593254717263
XGBRegressor - Mean score R2: 0.9851
XGBRegressor - score R2: [0.99381149 0.99557438 0.99440579 0.99277049 0.9487601 ]
XGBRegressor - Standard deviation R2: 0.01817481216646054
Modelo guardado con éxito en 'modelos/XGBRegressor-20240825170701.pkl'
Fin entrenamiento y evaluacion modelo 'XGBRegressor'


### Optimización

In [111]:
param_grid = {
    'xgb__n_estimators': [100, 200, 300],
    'xgb__learning_rate': [0.01, 0.1, 0.2],
    'xgb__max_depth': [3, 6, 9],
    'xgb__subsample': [0.8, 1.0],
    'xgb__colsample_bytree': [0.8, 1.0]
}

pipelineXGBReg = Pipeline([
    ('preprocessor', preprocessor),
    ('xgb', XGBRegressor(objective='reg:squarederror', use_label_encoder=False))
])


grid_search = GridSearchCV(estimator=pipelineXGBReg,
                           param_grid=param_grid,
                           scoring='neg_mean_squared_error',
                           cv=5, 
                           n_jobs=-1,  
                           verbose=1)

In [110]:
grid_search.fit(X, y)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


AttributeError: 'dict' object has no attribute 'set_params'

In [363]:
grid_search.best_params_

{'xgb__colsample_bytree': 0.8,
 'xgb__learning_rate': 0.1,
 'xgb__max_depth': 6,
 'xgb__n_estimators': 300,
 'xgb__subsample': 0.8}

In [364]:
classifiersXGBRegressorRend = {
    'XGBRegressor':XGBRegressor(objective='reg:squarederror', use_label_encoder=False,colsample_bytree=0.8,learning_rate=0.1,max_depth=6, n_estimators=300,subsample=0.8  )
}

In [375]:
crear_modelo(X, y,classifiersXGBRegressorRend,preprocessor)

Iniciar entrenamiento y evaluacion modelo 'XGBRegressor'


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



XGBRegressor - Mean score MSE: 65724.9737
XGBRegressor - score MSE: [ 51712.39162113  55442.88796286  52384.13648702  54658.29609591
 114427.15641195]
XGBRegressor - Standard deviation MSE: 24390.26705580355
XGBRegressor - Mean score RMSE: 65724.9737
XGBRegressor - score RMSE: [ 51712.39162113  55442.88796286  52384.13648702  54658.29609591
 114427.15641195]
XGBRegressor - Standard deviation RMSE: 24390.26705580355
XGBRegressor - Mean score R2: 0.8477
XGBRegressor - score R2: [0.86861926 0.86592342 0.85439515 0.82355057 0.82589554]
XGBRegressor - Standard deviation R2: 0.019355416250685393
Modelo guardado con éxito en 'modelos/XGBRegressor-20240825042128.pkl'
Fin entrenamiento y evaluacion modelo 'XGBRegressor'


### Prueba

In [71]:
# Ruta al archivo del modelo
modelo_path = 'modelos/XGBRegressor-20240825135344.pkl'

# Cargar el modelo
modelo = joblib.load(modelo_path)

In [72]:
y_pred = modelo.predict(X_test)

In [73]:
results_df = X_test
results_df['price'] = y_test.values
results_df['price_predict'] = y_pred

In [74]:
results_df.head()

Unnamed: 0,propertyType,size,rooms,bathrooms,municipality,latitude,longitude,status,priceByArea,hasParkingSpace,isParkingSpaceIncludedInPrice,typology,subtitle,floor,hasLift,price,price_predict
332,flat,95.0,4,2,getafe,40.303934,-3.728678,good,2684.0,0,0,flat,getafe centro,bajo,1,255000.0,251993.453125
2764,flat,45.0,2,1,madrid,40.454427,-3.700498,good,5222.0,0,0,flat,cuatro caminos,intermedio,0,235000.0,235509.078125
1659,chalet,111.0,3,2,fuente el saz de jarama,40.631483,-3.518362,good,3318.0,1,1,chalet,fuente el saz de jarama,alto,0,368245.0,368662.375
1161,penthouse,55.0,1,1,paracuellos de jarama,40.504679,-3.531509,good,3073.0,0,0,flat,casco urbano,intermedio,0,169000.0,169130.40625
3000,flat,139.0,3,1,estremera,40.181652,-3.107211,good,281.0,0,0,flat,estremera,flat,0,39000.0,40341.140625
