# Datos inmobiliarios de California

In [1]:
# Cargamos librerías a utilisar
import numpy as np # librería numérica
import pandas as pd # librería para marcos de datos

In [2]:
# Cargamos datos inmobiliarios de Caliornia
from sklearn.datasets import fetch_california_housing
housing = fetch_california_housing()

In [3]:
# Generamos marcos de datos para el problema de regresión
X = pd.DataFrame(housing.data,columns = housing.feature_names)
y = pd.DataFrame(housing.target,columns = housing.target_names)

In [4]:
# Observamos las variables predictoras / features
X.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [None]:
# Hay 20640 observaciones
X.shape

(20640, 8)

In [None]:
# Observamos las variables respuesta
y.head()

Unnamed: 0,MedHouseVal
0,4.526
1,3.585
2,3.521
3,3.413
4,3.422


In [5]:
# Cargamos librería para estandarizar los datos
from sklearn.preprocessing import StandardScaler

In [6]:
# Estandarizamos los features
feat_scaler = StandardScaler()# Resta la media y divide por la desviación estándar
feat_scaler.fit(X)

In [7]:
X = feat_scaler.transform(X)

In [8]:
# Estandarizamos las responses
resp_scaler = StandardScaler()
resp_scaler.fit(y)

In [9]:
y = resp_scaler.transform(y)

In [10]:
# Dividimos los datos para entrenamiento y prueba
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.25,random_state = 111)

In [11]:
# Hay 15480 observaciones para entrenamiento
X_train.shape

(15480, 8)

# Modelos lineales regularizados/penalizados

In [12]:
# Corremos regresión lineal
from sklearn.linear_model import LinearRegression
lin_reg =LinearRegression()
lin_reg.fit(X_train,y_train)

In [13]:
# Evaluamos el error cuadrático medio en los datos de prueba
from sklearn.metrics import mean_squared_error

In [14]:
y_lin_pred = lin_reg.predict(X_test)
loss_lin = mean_squared_error(y_test,y_lin_pred)
loss_lin

0.40192750313229725

In [18]:
# Corremos rgresión ridge con distintos valores de penalización
from sklearn.linear_model import Ridge
rid_reg = Ridge()
alphas = {'alpha':[1e-15,1e-10,1e-8,1e-3,1e-2,1,5,10,20,30,35,40,45,50,55,100]} # Malla para grid search

In [16]:
# y elegimos la penalizacion mediante validación cruzada en los datos de prueba
from sklearn.model_selection import GridSearchCV

In [17]:
rid_reg_cv = GridSearchCV(rid_reg,alphas,scoring='neg_mean_squared_error',cv=10)
rid_reg_cv.fit(X_train,y_train)

In [None]:
# Let's check out the best parameter and the score
print(rid_reg_cv.best_params_) #Checamos cual es el mejor modelo
print(rid_reg_cv.best_score_)

{'alpha': 45}
-0.39606446784925325


In [20]:
rid_reg = rid_reg_cv.best_estimator_

In [21]:
# Evaluamos el error cuadrático medio en los datos de prueba
y_rid_pred = rid_reg.predict(X_test)
loss_rid = mean_squared_error(y_test,y_rid_pred)
loss_rid

0.40271523250867686

In [22]:
# Corremos rgresión Lasso con distintos valores de penalización
# y elegimos la penalización con validación cruzada
from sklearn.linear_model import Lasso
lasso_reg = Lasso()
lasso_reg_cv = GridSearchCV(lasso_reg,alphas,scoring='neg_mean_squared_error',cv=10)
lasso_reg_cv.fit(X_train,y_train)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


In [23]:
# Let's check out the best parameter and the score
print(lasso_reg_cv.best_params_)
print(lasso_reg_cv.best_score_)

{'alpha': 0.001}
-0.3958420529296844


In [24]:
lasso_reg = rid_reg_cv.best_estimator_

In [25]:
# Evaluamos el error cuadrático medio en los datos de prueba
y_lasso_pred = lasso_reg.predict(X_test)
loss_lasso = mean_squared_error(y_test,y_lasso_pred)
loss_lasso

0.40271523250867686

In [26]:
# El modelo lineal en este caso tiene el mejor error cuadrático medio
loss_lin < np.minimum(loss_lasso,loss_rid)

True

# Modelos polinomiales regularizados/penalizados

In [None]:
# Añadimos predictores para regresión polinomial
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(2)
X = poly.fit_transform(X) # Me añade los terminos polinomiales de orden 2

In [None]:
# Ahoara utilizamos 45 features
X.shape  #Tengo 1081 features

(20640, 1081)

In [31]:
# Dividimos nuevamente los datos para entrenamiento y prueba
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.25,random_state = 11)

In [32]:
# Corremos regresión polinomial
from sklearn.linear_model import LinearRegression
pol_reg =LinearRegression()
pol_reg.fit(X_train,y_train)

In [33]:
# Evaluamos el error cuadrático medio en los datos de prueba
y_pol_pred = pol_reg.predict(X_test)
loss_pol = mean_squared_error(y_test,y_pol_pred)
loss_pol

4156094.787402679

In [34]:
# Corremos regresión polinomial ridge
# y elegimos la penalizacion mediante validación cruzada en los datos de prueba
rid_reg = Ridge()
rid_pol_reg_cv = GridSearchCV(rid_reg,alphas,scoring='neg_mean_squared_error',cv=10)
rid_pol_reg_cv.fit(X_train,y_train)
rid_pol_reg = rid_pol_reg_cv.best_estimator_

  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


KeyboardInterrupt: 

In [None]:
# Evaluamos el error cuadrático medio en los datos de prueba
y_rid_pol_pred = rid_pol_reg.predict(X_test)
loss_rid_pol = mean_squared_error(y_test,y_rid_pol_pred)
loss_rid_pol

0.3590362444357679

In [None]:
# Corremos regresión polinomial Lasso
# y elegimos la penalizacion mediante validación cruzada en los datos de prueba
lasso_reg = Lasso()
lasso_pol_reg_cv = GridSearchCV(lasso_reg,alphas,scoring='neg_mean_squared_error',cv=10)
lasso_pol_reg_cv.fit(X_train,y_train)
lasso_pol_reg = lasso_pol_reg_cv.best_estimator_

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

In [None]:
# Evaluamos el error cuadrático medio en los datos de prueba
y_lasso_pol_pred = lasso_pol_reg.predict(X_test)
loss_lasso_pol = mean_squared_error(y_test,y_lasso_pol_pred)
loss_lasso_pol

0.40428115333570996

In [None]:
loss_rid_pol < loss_lasso_pol < loss_pol

True

In [None]:
loss_rid_pol < loss_lin

True