#**Conjunto de datos**

In [None]:
from sklearn.datasets import fetch_california_housing

#**Librerías**

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
import xgboost as xgb
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): a blanket 'ignore' may also hide diagnostics emitted by the
# hyper-parameter searches further down (e.g. about failed fits) — consider
# narrowing the filter to specific warning categories.
warnings.filterwarnings('ignore')

In [None]:
# Fetch the California housing dataset through scikit-learn. The returned
# object exposes `.data`, `.feature_names` and `.target`, which are used to
# build the DataFrame.
california = fetch_california_housing()

In [None]:
# Assemble one DataFrame: a column per feature, with the target values
# appended as the last column, 'MedHouseVal'.
df_california = pd.DataFrame(
    california.data,
    columns=california.feature_names,
).assign(MedHouseVal=california.target)

#**EDA**

In [None]:
# Preview the first rows to check the column names and the kind of values they hold.
df_california.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [None]:
# (number of rows, number of columns) of the assembled frame.
df_california.shape

(20640, 9)

In [None]:
# Count the missing values in each column.
df_california.isna().sum()

Unnamed: 0,0
MedInc,0
HouseAge,0
AveRooms,0
AveBedrms,0
Population,0
AveOccup,0
Latitude,0
Longitude,0
MedHouseVal,0


In [None]:
# Number of rows that exactly repeat an earlier row.
df_california.duplicated().sum()

0

#**Modelo con Decision Tree**

#**División de los datos**

In [None]:
# Predictors: six of the feature columns (Latitude and Longitude are not
# included). Target: the 'MedHouseVal' column.
feature_cols = ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup']
target_col = 'MedHouseVal'

X = df_california[feature_cols]
y = df_california[target_col]

# Hold out 20% of the rows as the test set; the fixed seed keeps the split
# identical across re-runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


#**Generación del modelo**

In [None]:
# Search space for the tree: depth 2..10 and a minimum leaf size expressed as
# a fraction of the training samples (float values of min_samples_leaf).
params_tree = {'max_depth': range(2,11),
               'min_samples_leaf': np.arange(0.05,0.4,0.05)}
dt = DecisionTreeRegressor(random_state = 42)

# random_state on the search pins which candidates are sampled, so a
# Restart & Run All reproduces the same best model (the estimator and the
# train/test split were already seeded; the search itself was not).
# NOTE(review): n_iter = 70 may exceed the number of distinct combinations in
# params_tree; in that case scikit-learn evaluates every combination instead
# and only emits a warning — confirm whether a smaller n_iter was intended.
decision_tree_rs_cv = RandomizedSearchCV(dt,
                                         param_distributions = params_tree,
                                         cv = 10,
                                         n_iter = 70,
                                         n_jobs = -1,
                                         refit = True,
                                         scoring = 'neg_mean_squared_error',
                                         random_state = 42)
decision_tree_rs_cv.fit(X_train,y_train)

# With refit = True, predict() uses the best candidate refitted on the full
# training set; the RMSE below is therefore measured on the held-out test set.
predicciones_tree = decision_tree_rs_cv.predict(X_test)
mse_tree = mean_squared_error(y_test,predicciones_tree)
rmse_tree = np.sqrt(mse_tree)
print(f'Root Mean Squared error con validación cruzada para decision tree: {rmse_tree}')

Root Mean Squared error con validación cruzada para decision tree: 0.7783638502518164


#**Random Forest Regressor**

#**Debo medir error de validación, error de entrenamiento y error de test**

In [None]:
# Search space for the forest: number of trees, shallow depths (2..4) and a
# minimum leaf size expressed as a fraction of the training samples.
params_rf = {'n_estimators': np.arange(100,1000,100),
             'max_depth': range(2,5),
             'min_samples_leaf': np.arange(0.05,0.2,0.02)}

rf = RandomForestRegressor(random_state = 42)

# random_state on the search makes the 20 sampled candidates reproducible
# across re-runs (previously only the estimator was seeded).
rf_rs_cv = RandomizedSearchCV(rf,
                              param_distributions = params_rf,
                              cv = 4,
                              n_iter = 20,
                              n_jobs = -1,
                              refit = True,
                              scoring = 'neg_mean_squared_error',
                              random_state = 42)
rf_rs_cv.fit(X_train,y_train)

# Test-set error of the best candidate refitted on the whole training set.
predicciones_rf = rf_rs_cv.predict(X_test)
mse_rf = mean_squared_error(y_test,predicciones_rf)
rmse_rf = np.sqrt(mse_rf)
# The printed value is the square root of the MSE, so the label says RMSE
# (it previously read "Mean Squared Error", which mislabelled the metric).
print(f'Root Mean Squared Error de Random Forest: {rmse_rf}')

Mean Squared Error de Random Forest: 0.8000937686549234


#**GradientBoosting**

In [None]:
# Search space for stochastic gradient boosting: shallow trees, row
# subsampling below 1.0, the feature-subset rule and the number of stages.
# 'auto' is intentionally not listed for max_features: it was removed from
# GradientBoostingRegressor in scikit-learn 1.3 (candidates using it fail to
# fit and score NaN), and for regressors it meant "all features", which the
# None entry already covers.
params_sgbt = {'max_depth': range(1,4),
               'subsample': np.arange(0.6,0.9,0.1),
               'max_features': ['sqrt','log2',None],
               'n_estimators': np.arange(100,1000,100)}

sgbt = GradientBoostingRegressor(random_state = 42)

# random_state on the search makes the 25 sampled candidates reproducible
# across re-runs (previously only the estimator was seeded).
sgbt_rs_cv = RandomizedSearchCV(sgbt,
                                param_distributions = params_sgbt,
                                cv = 4,
                                n_iter = 25,
                                n_jobs = -1,
                                refit = True,
                                scoring = 'neg_mean_squared_error',
                                random_state = 42)
sgbt_rs_cv.fit(X_train,y_train)

# Test-set error of the best candidate refitted on the whole training set.
predicciones_sgbt = sgbt_rs_cv.predict(X_test)
mse_sgbt = mean_squared_error(y_test,predicciones_sgbt)
rmse_sgbt = np.sqrt(mse_sgbt)
print(f'Root Mean Squared Error para Gradient Boosting: {rmse_sgbt}')

Root Mean Squared Error para Gradient Boosting: 0.654335294728731


#**ExtremeGradientBoosting**

In [None]:
# NOTE(review): these DMatrix objects are not used anywhere in this cell — the
# scikit-learn wrapper below is fitted on X_train / y_train directly. They are
# kept in case another cell relies on them; drop them if nothing does.
Dm_train = xgb.DMatrix(data = X_train, label = y_train)
Dm_test = xgb.DMatrix(data = X_test,label = y_test)

# Candidate values for each hyper-parameter, generated as evenly spaced grids.
param_grid = {'learning_rate':np.arange(0.05,1.05,0.05),
              'n_estimators': np.arange(100,1000,100),
              'subsample': np.arange(0.05,1.05,0.05),
              'alpha':     np.arange(0,1.1,0.1),
              'gamma':     np.arange(0,1.1,0.1),
              'lambda':    np.arange(0.1,1.1,0.1),
              'max_depth': np.arange(1,7,1),
              'colsample_bytree': np.arange(0.1,1.1,0.1)
              }

gbm = xgb.XGBRegressor(random_state = 42,objective = 'reg:squarederror')

# random_state on the search makes the 100 sampled candidates reproducible
# across re-runs (previously only the estimator was seeded).
xgb_rs_cv = RandomizedSearchCV(estimator = gbm,
                               param_distributions = param_grid,
                               n_iter = 100,
                               cv = 4,
                               n_jobs = -1,
                               refit = True,
                               scoring = 'neg_mean_squared_error',
                               verbose = 1,
                               random_state = 42)
xgb_rs_cv.fit(X_train,y_train)
print("Mejores parámetros encontrados: ",xgb_rs_cv.best_params_)
# best_score_ is a negated MSE (scoring = 'neg_mean_squared_error'), hence the
# abs() before taking the square root to report an RMSE.
print("RMSE Más bajo encontrado en el conjunto de validación: ",np.sqrt(np.abs(xgb_rs_cv.best_score_)))

# Test-set error of the best candidate refitted on the whole training set.
predicciones_xgb = xgb_rs_cv.predict(X_test)
mse_xgb = mean_squared_error(y_test,predicciones_xgb)
rmse_xgb = np.sqrt(mse_xgb)

print(f'Root Mean Squared Error para Xtreme Gradient Boosting: {rmse_xgb}')

Fitting 4 folds for each of 100 candidates, totalling 400 fits
Mejores parámetros encontrados:  {'subsample': 0.25, 'n_estimators': 600, 'max_depth': 3, 'learning_rate': 0.05, 'lambda': 0.4, 'gamma': 0.0, 'colsample_bytree': 1.0, 'alpha': 0.6000000000000001}
RMSE Más bajo encontrado en el conjunto de validación:  0.6389235329607558
Root Mean Squared Error para Xtreme Gradient Boosting: 0.6463068011555323
