In [1]:
import csv
import pandas as pd
import numpy as np
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.metrics import mean_squared_log_error
# Importamos utilidades y modelos de sklearn
from sklearn.preprocessing import Imputer
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold


# Se levanta el set de train generado

In [2]:
data_set = pd.read_csv("train_set_xgb.csv")
#X = data_set.loc[:, (data_set.columns != 'precio') & (data_set.columns != 'factor_pm')]
X = data_set.loc[:, (data_set.columns != 'precio')]
y = data_set.loc[:, ['precio']]

In [3]:
data_set.columns

Index(['antiguedad', 'habitaciones', 'garages', 'banos', 'metroscubiertos',
       'metrostotales', 'idzona', 'lat', 'lng', 'gimnasio', 'usosmultiples',
       'piscina', 'escuelascercanas', 'centroscomercialescercanos', 'precio',
       'anio_publ'],
      dtype='object')

In [4]:
X.columns

Index(['antiguedad', 'habitaciones', 'garages', 'banos', 'metroscubiertos',
       'metrostotales', 'idzona', 'lat', 'lng', 'gimnasio', 'usosmultiples',
       'piscina', 'escuelascercanas', 'centroscomercialescercanos',
       'anio_publ'],
      dtype='object')

In [5]:
y.columns

Index(['precio'], dtype='object')

# Se separa en train y en test

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=1)

In [7]:
len(X_train)

168000

In [8]:
len(X_test)

72000

# Se define el objeto Bagging

In [9]:
defined_xgb = xgb.XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=0.8, gamma=0.5,
       importance_type='gain', learning_rate=0.1, max_delta_step=0,
       max_depth=5, min_child_weight=10, missing=None, n_estimators=200,
       n_jobs=-1, nthread=None, objective='reg:squarederror', reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=None, subsample=1.0, verbosity=1)

In [10]:
bag = BaggingRegressor(base_estimator = defined_xgb, bootstrap_features = False, n_jobs = 8, random_state = 0, verbose = 2)

# Busqueda de hiperparametros

In [11]:
params = {
        'n_estimators': [100, 200, 300],
        'max_samples': [0.5, 0.8, 1],
        "max_features" : [0.5, 0.8, 1],
        "bootstrap" : [True, False],
        }


In [12]:
randomized = RandomizedSearchCV(bag, params, n_iter = 10, verbose = 2, n_jobs = 8)
randomized.fit(X_train, y_train.values.ravel())

[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=8)]: Done  30 out of  30 | elapsed: 121.8min finished
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   3 out of   8 | elapsed:  4.7min remaining:  7.8min
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:  5.8min remaining:    0.0s
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:  5.8min finished


RandomizedSearchCV(cv='warn', error_score='raise-deprecating',
          estimator=BaggingRegressor(base_estimator=XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=0.8, gamma=0.5,
       importance_type='gain', learning_rate=0.1, max_delta_step=0,
       max_depth=5, min_child_weight=10, missing=None, n_estim...0, n_estimators=10, n_jobs=8, oob_score=False,
         random_state=0, verbose=2, warm_start=False),
          fit_params=None, iid='warn', n_iter=10, n_jobs=8,
          param_distributions={'n_estimators': [100, 200, 300], 'max_samples': [0.5, 0.8, 1], 'max_features': [0.5, 0.8, 1], 'bootstrap': [True, False]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring=None, verbose=2)

In [13]:
randomized.best_estimator_

BaggingRegressor(base_estimator=XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=0.8, gamma=0.5,
       importance_type='gain', learning_rate=0.1, max_delta_step=0,
       max_depth=5, min_child_weight=10, missing=None, n_estimators=200,
       n_jobs=-1, nthread=None, objective='reg:squarederror',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=None, subsample=1.0, verbosity=1),
         bootstrap=False, bootstrap_features=False, max_features=0.8,
         max_samples=0.8, n_estimators=100, n_jobs=8, oob_score=False,
         random_state=0, verbose=2, warm_start=False)

In [14]:
randomized.best_params_

{'n_estimators': 100,
 'max_samples': 0.8,
 'max_features': 0.8,
 'bootstrap': False}

## Se hacen las dos predicciones necesarias

In [15]:
pred_train = randomized.best_estimator_.predict(X_train)
pred_test = randomized.best_estimator_.predict(X_test)

[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   3 out of   8 | elapsed:   15.7s remaining:   26.2s
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:   18.1s remaining:    0.0s
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:   18.1s finished
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   3 out of   8 | elapsed:    7.0s remaining:   11.8s
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    8.1s remaining:    0.0s
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    8.1s finished


### Metrica de Mean Absolute Error (La de Kaggle) ?? (Hay que preguntar)

In [16]:
mae_train = np.sqrt(sklearn.metrics.mean_absolute_error(y_train, pred_train))
mae_test = np.sqrt(sklearn.metrics.mean_absolute_error(y_test, pred_test))
#print(f"MAE train: {mea_train:.5f}")
#print(f"MAE test: {mea_test:.5f}")
print("MAE train: "+str(mae_train))
print("MAE test: "+str(mae_test))

MAE train: 811.9785536726197
MAE test: 816.7335153951409
