# Datos inmobiliarios de California

In [2]:
# Cargamos librerías a utilisar
import numpy as np # librería numérica
import pandas as pd # librería para marcos de datos

In [1]:
# Cargamos datos inmobiliarios de Caliornia
from sklearn.datasets import fetch_california_housing
housing = fetch_california_housing()

In [3]:
# Imprimir descripción de los datos
print(housing.DESCR)

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

:Number of Instances: 20640

:Number of Attributes: 8 numeric, predictive attributes and the target

:Attribute Information:
    - MedInc        median income in block group
    - HouseAge      median house age in block group
    - AveRooms      average number of rooms per household
    - AveBedrms     average number of bedrooms per household
    - Population    block group population
    - AveOccup      average number of household members
    - Latitude      block group latitude
    - Longitude     block group longitude

:Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html

The target variable is the median house value for California districts,
expressed in hundreds of thousands of dollars ($100,000).

This dataset was derived from the 1990 U.S. census, using one row per ce

In [4]:
# Generamos marcos de datos para el problema de regresión
X = pd.DataFrame(housing.data,columns = housing.feature_names)
y = np.ravel(pd.DataFrame(housing.target,columns = housing.target_names))

In [5]:
# Observamos las variables predictoras / features
X.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [6]:
# Hay 20640 observaciones
X.shape

(20640, 8)

In [7]:
# Observamos las variables respuesta
y

array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894])

In [8]:
# Dividimos los datos para entrenamiento y prueba
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.25,random_state = 111)

In [9]:
# Hay 15480 observaciones para entrenamiento
X_train.shape

(15480, 8)

In [10]:
# Cargamos librería para estandarizar los datos
from sklearn.preprocessing import StandardScaler

In [11]:
# Estandarizamos los features
feat_scaler = StandardScaler()
feat_scaler.fit_transform(X_train) # evita data leakage

array([[ 0.16043772, -0.91622667, -0.16397832, ..., -0.11667895,
        -1.32936886,  1.15894041],
       [-0.02140754,  0.59132331,  0.07117651, ..., -0.01990041,
        -0.78712171,  0.7607623 ],
       [ 0.26855049, -1.70967403,  0.45955115, ...,  0.01869872,
         0.97050699, -0.92154022],
       ...,
       [-1.36636426,  0.82935751, -1.37856055, ...,  0.06370241,
        -0.74037627,  0.63135442],
       [ 0.44140398,  0.51197857,  0.07654414, ...,  0.07906183,
        -0.82919261,  0.81053457],
       [ 0.735805  , -1.8683635 ,  0.69647728, ...,  0.03979641,
        -0.78244717,  1.16391764]])

In [12]:
X_test_transf = pd.DataFrame( feat_scaler.transform(X_test), columns = housing.feature_names)

## Árbol de decisión

In [13]:
from sklearn import tree
from sklearn.tree import DecisionTreeRegressor

In [14]:
dectreereg = DecisionTreeRegressor()
dectreereg.fit(X_train,y_train)

In [15]:
from sklearn.metrics import  root_mean_squared_error

In [16]:
# Error en datos de entrenamiento
root_mean_squared_error(y_train,dectreereg.predict(X_train))

2.7314435506056696e-16

In [17]:
# Error en datos de validación
root_mean_squared_error(y_test,dectreereg.predict( X_test_transf ))

1.2537221689140965

# Bosque aleatorio

In [18]:
from sklearn.ensemble import RandomForestRegressor

In [19]:
randfrst_reg = RandomForestRegressor()
randfrst_reg.fit(X_train, y_train)

In [21]:
# Error en datos de entrenamiento
from sklearn.metrics import mean_squared_error
mean_squared_error(y_train,randfrst_reg.predict(X_train))

0.03572238137994124

In [22]:
# Error en datos de validación
root_mean_squared_error(y_test,randfrst_reg.predict(X_test_transf ))

1.1615688167815317

In [23]:
# Malla de hiper-parámteros
parameters_rf = {
    'criterion': [ 'squared_error', 'absolute_error', 'friedman_mse', 'poisson'],
    'n_estimators': [100, 150, 200]
}

In [24]:
from sklearn.model_selection import GridSearchCV

In [30]:
randfrst_grid_search = GridSearchCV(randfrst_reg, parameters_rf, cv = 2, n_jobs = -1, verbose = 10)
# La validación cruzada es tardada, descomentar siguiente línea para correrla
randfrst_grid_search .fit(X_train, y_train)

Fitting 2 folds for each of 12 candidates, totalling 24 fits


## Boosting

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
boost_reg = GradientBoostingRegressor()
boost_reg.fit(X_train, y_train)

In [None]:
# Error en datos de entrenamiento
root_mean_squared_error(y_train,boost_reg.predict(X_train))

0.5052569639016067

In [None]:
# Error en datos de validación
root_mean_squared_error(y_test,boost_reg.predict(X_test))

0.5399981082486782

In [None]:
boost_reg = GradientBoostingRegressor()
parameters = {
    'loss': [ 'absolute_error', 'squared_error', 'huber'],
    'learning_rate': [ 0.1, 1.0, 5.0],
    'n_estimators': [100, 150, 200]
}

In [None]:
boost_grid_search = GridSearchCV(boost_reg, parameters, cv = 10, n_jobs = -1, verbose = 1)
# La validación cruzada es tardada, descomentar siguiente línea para correrla
boost_grid_search.fit(X_train, y_train)

Fitting 10 folds for each of 27 candidates, totalling 270 fits


## AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostRegressor

In [None]:
adaboost_reg = AdaBoostRegressor()
adaboost_reg.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


In [None]:
# Error en datos de entrenamiento
mean_squared_error(y_train,adaboost_reg.predict(X_train))

0.5016060469379447

In [None]:
# Error en datos de prueba
mean_squared_error(y_test,adaboost_reg.predict(X_test))

0.5162751783409123


## XGBoost

In [None]:
import xgboost as xgb

In [None]:
y_train

array([[ 0.30196145],
       [-0.24573294],
       [-0.31332813],
       ...,
       [-0.73623139],
       [-0.20586911],
       [-0.37745691]])

In [None]:
xgbclf = xgb.XGBRegressor()
xgbclf.fit(X_train, y_train)

In [None]:
mean_squared_error(y_train,xgbclf.predict(X_train))

0.052746081492487946

In [None]:
mean_squared_error(y_test,xgbclf.predict(X_test))

0.17011720399112662