#**Conjunto de datos**

In [None]:
from sklearn.datasets import fetch_california_housing

#**Librerías**

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.model_selection import RandomizedSearchCV
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the California housing dataset; the returned object exposes
# .data, .feature_names and .target, which the next cell uses.
california = fetch_california_housing()

In [None]:
# Assemble the feature matrix into a DataFrame and append the regression
# target as the last column, 'MedHouseVal'.
df_california = pd.DataFrame(
    california.data,
    columns=california.feature_names,
).assign(MedHouseVal=california.target)

#**EDA**

In [None]:
# Preview the first rows to check that features and target were assembled correctly.
df_california.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [None]:
# (rows, columns) of the assembled frame, target column included.
df_california.shape

(20640, 9)

In [None]:
# Pairwise correlation matrix. AveBedrms and AveRooms are strongly correlated
# (0.847), so one of the two will be dropped from the feature set.
df_california.corr()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
MedInc,1.0,-0.119034,0.326895,-0.06204,0.004834,0.018766,-0.079809,-0.015176,0.688075
HouseAge,-0.119034,1.0,-0.153277,-0.077747,-0.296244,0.013191,0.011173,-0.108197,0.105623
AveRooms,0.326895,-0.153277,1.0,0.847621,-0.072213,-0.004852,0.106389,-0.02754,0.151948
AveBedrms,-0.06204,-0.077747,0.847621,1.0,-0.066197,-0.006181,0.069721,0.013344,-0.046701
Population,0.004834,-0.296244,-0.072213,-0.066197,1.0,0.069863,-0.108785,0.099773,-0.02465
AveOccup,0.018766,0.013191,-0.004852,-0.006181,0.069863,1.0,0.002366,0.002476,-0.023737
Latitude,-0.079809,0.011173,0.106389,0.069721,-0.108785,0.002366,1.0,-0.924664,-0.14416
Longitude,-0.015176,-0.108197,-0.02754,0.013344,0.099773,0.002476,-0.924664,1.0,-0.045967
MedHouseVal,0.688075,0.105623,0.151948,-0.046701,-0.02465,-0.023737,-0.14416,-0.045967,1.0


In [None]:
# Count missing values per column.
df_california.isnull().sum()

Unnamed: 0,0
MedInc,0
HouseAge,0
AveRooms,0
AveBedrms,0
Population,0
AveOccup,0
Latitude,0
Longitude,0
MedHouseVal,0


In [None]:
# Count rows that are exact duplicates of an earlier row.
df_california.duplicated().sum()

0

#**División de los datos**

In [None]:
# Predictors kept for modelling. AveBedrms, Latitude and Longitude are
# left out of the feature set; MedHouseVal is the target.
X = df_california.loc[:, ['MedInc', 'HouseAge', 'AveRooms', 'Population', 'AveOccup']]
y = df_california['MedHouseVal']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

#**Escalado de los datos**

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted transformation to both splits so no test information leaks
# into the scaler.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

#**Generación del modelo**

In [None]:
# Baseline: ordinary least squares fitted on the scaled training split and
# evaluated on the held-out test split.
regresion = LinearRegression()
regresion.fit(X_train_scaled,y_train)
predicciones = regresion.predict(X_test_scaled)
# The `squared=False` argument of mean_squared_error was deprecated in
# scikit-learn 1.4 and later removed, so take the square root explicitly
# (same approach as the Ridge and Lasso cells).
rmse = np.sqrt(mean_squared_error(y_test,predicciones))
print(f"Root Mean Squared error sin validación cruzada: {rmse}")


Root Mean Squared error sin validación cruzada: 0.8100228004441534


#**Comprobación para ver si hay overfitting**

In [None]:
# Overfitting check: compare RMSE on the training split against the test RMSE
# (`rmse`, computed in the model cell). Similar values suggest the linear
# model is not overfitting.
prediccion_train = regresion.predict(X_train_scaled)
# The `squared=False` argument of mean_squared_error was deprecated in
# scikit-learn 1.4 and later removed, so take the square root explicitly.
rmse_train = np.sqrt(mean_squared_error(y_train,prediccion_train))
print(f'Root Mean Squared error train: {rmse_train}')
print(f"Root Mean Squared error test: {rmse}")

Root Mean Squared error train: 0.8026470696876299
Root Mean Squared error test: 0.8100228004441534


#**Métrica con validación cruzada**

In [None]:
# 10-fold cross-validation on the training split. The scorer returns the
# negated MSE, so flip the sign to obtain one positive MSE per fold.
cv_regresion = -cross_val_score(
    regresion,
    X_train_scaled,
    y_train,
    scoring='neg_mean_squared_error',
    cv=10,
)
# RMSE reported as the square root of the mean fold MSE.
print(f'Root Mean Squared Error con validación cruzada: {cv_regresion.mean() ** 0.5}')

Root Mean Squared Error con validación cruzada: 0.8050205634149324


In [None]:
# Diagnostic: number of CPUs reported by the runtime. Relevant because the
# hyperparameter searches below run with n_jobs = -1.
import os
print(os.cpu_count())


2


#**Regresión Ridge**

In [None]:
# Shuffled 10-fold splitter with a fixed seed; reused by the Lasso search below.
kf = KFold(n_splits = 10, shuffle = True, random_state = 42)
# 30 candidate regularisation strengths, log-spaced between 1e-6 and 1e4.
params = {'alpha': np.logspace(-6,4,30)}

ridge = Ridge()
# Randomised search over alpha (n_iter = 25 of the 30 candidates).
# random_state pins which candidates are sampled so the selected alpha and
# the reported score are reproducible across runs.
ridge_rs_cv = RandomizedSearchCV(ridge, param_distributions = params, cv = kf, n_iter = 25,
                                 n_jobs = -1, scoring = 'neg_mean_squared_error', refit = True,
                                 random_state = 42)

# With refit = True, predict() uses the best estimator found by the search.
ridge_rs_cv.fit(X_train_scaled,y_train)
predicciones_ridge = ridge_rs_cv.predict(X_test_scaled)

# NOTE: alpha is chosen by cross-validation, but the value printed here is
# the RMSE on the held-out test split, not a cross-validated score.
mse_ridge = mean_squared_error(y_test,predicciones_ridge)
rmse_ridge = np.sqrt(mse_ridge)
print(f'Root Mean Squared Error con validación cruzada para Ridge: {rmse_ridge}')


Root Mean Squared Error con validación cruzada para Ridge: 0.8098681686923181


#**Regresión Lasso**

In [None]:
# 30 candidate regularisation strengths, log-spaced between 1e-6 and 1e4.
params_lasso = {'alpha':np.logspace(-6,4,30)}

lasso = Lasso()
# Randomised search over alpha (n_iter = 25 of the 30 candidates), using the
# `kf` splitter defined in the Ridge cell. random_state pins which candidates
# are sampled so the selected alpha and the reported score are reproducible.
lasso_rs_cv = RandomizedSearchCV(lasso, param_distributions = params_lasso, cv = kf, n_iter = 25,
                                 n_jobs = -1, scoring = 'neg_mean_squared_error', refit = True,
                                 random_state = 42)

# With refit = True, predict() uses the best estimator found by the search.
lasso_rs_cv.fit(X_train_scaled,y_train)
predicciones_lasso = lasso_rs_cv.predict(X_test_scaled)

# NOTE: alpha is chosen by cross-validation, but the value printed here is
# the RMSE on the held-out test split, not a cross-validated score.
mse_lasso = mean_squared_error(y_test,predicciones_lasso)
rmse_lasso = np.sqrt(mse_lasso)
print(f'Root Mean Squared Error con validación cruzada para Lasso: {rmse_lasso}')

Root Mean Squared Error con validación cruzada para Lasso: 0.8101227869714598
