In [1]:
import csv
import pandas as pd
import numpy as np
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_log_error
# Importamos utilidades y modelos de sklearn
from sklearn.preprocessing import Imputer
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold


# Se levanta el set de train generado

In [2]:
# Load the pre-built training set from disk.
data_set = pd.read_csv("train_set_xgb.csv")

# Features: every column except the 'precio' target.
# (An earlier variant of this cell also excluded a 'factor_pm' column.)
X = data_set.drop(columns='precio')

# Target, kept as a one-column DataFrame rather than a Series.
y = data_set[['precio']]

In [3]:
# Show every column of the loaded set, including the 'precio' target.
data_set.columns

Index(['antiguedad', 'habitaciones', 'garages', 'banos', 'metroscubiertos',
       'metrostotales', 'idzona', 'lat', 'lng', 'gimnasio', 'usosmultiples',
       'piscina', 'escuelascercanas', 'centroscomercialescercanos', 'precio',
       'anio_publ'],
      dtype='object')

In [4]:
# Sanity check: the feature frame must not contain the 'precio' target.
X.columns

Index(['antiguedad', 'habitaciones', 'garages', 'banos', 'metroscubiertos',
       'metrostotales', 'idzona', 'lat', 'lng', 'gimnasio', 'usosmultiples',
       'piscina', 'escuelascercanas', 'centroscomercialescercanos',
       'anio_publ'],
      dtype='object')

In [5]:
# Sanity check: the target frame holds only the 'precio' column.
y.columns

Index(['precio'], dtype='object')

# Se separa en train y en test

In [6]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=1,
)

In [7]:
# Number of rows in the training partition.
X_train.shape[0]

168000

In [8]:
# Number of rows in the held-out test partition.
X_test.shape[0]

72000

## Armamos el dataset sobre el cual cada XGBoost va a subsamplear con reposición.

### Esto se hace sobre el X_train con su respectivo label.

In [9]:
# Put the training features and their 'precio' label side by side so each
# bootstrap draw keeps every row paired with its target.
data_for_xgb = pd.concat((X_train, y_train), axis="columns")
data_for_xgb.head()

Unnamed: 0,antiguedad,habitaciones,garages,banos,metroscubiertos,metrostotales,idzona,lat,lng,gimnasio,usosmultiples,piscina,escuelascercanas,centroscomercialescercanos,anio_publ,precio
239101,0.0,3.0,2.0,3.0,196.0,160.0,,,,0.0,0.0,0.0,0.0,0.0,2014,1800000.0
96221,20.0,4.0,2.0,2.0,245.0,196.0,104734.0,,,0.0,0.0,0.0,1.0,0.0,2012,1500000.0
144453,5.0,3.0,1.0,2.0,108.0,120.0,58952.0,,,0.0,0.0,0.0,1.0,1.0,2015,850000.0
15620,10.0,3.0,1.0,3.0,200.0,147.0,49066.0,,,0.0,0.0,0.0,0.0,0.0,2016,1550000.0
73157,,,0.0,,,250.0,24892.0,19.23339,-99.166992,0.0,0.0,0.0,0.0,0.0,2016,700000.0


# Se entrenan n XGB

Se define la cantidad de xgb a entrenar

In [10]:
# Number of XGBoost models to train, one per bootstrap resample.
n = 100

Se utilizan los valores encontrados en el Grid Search

In [11]:
# Hyperparameters shared by every model in the bag (per the notebook's note,
# these come from the earlier grid search). Only the seed varies per model.
xgb_params = dict(
    objective='reg:squarederror',
    colsample_bytree=0.8,
    gamma=0.5,
    learning_rate=0.1,
    max_depth=5,
    min_child_weight=10,
    subsample=1.0,
    alpha=10,
    n_estimators=200,
    n_jobs=-1,
)

for i in range(n):
    # Bootstrap resample: same size as the training data, drawn with
    # replacement, seeded by the model index so each draw is reproducible.
    d_for_current_xgb = data_for_xgb.sample(frac=1, replace=True, random_state=i)

    # Split the resample back into features and the 'precio' target.
    X_for_current_xgb = d_for_current_xgb.drop(columns='precio')
    y_for_current_xgb = d_for_current_xgb[['precio']]

    # The model is seeded with the same index as its resample.
    current_xgb = xgb.XGBRegressor(random_state=i, **xgb_params)
    current_xgb.fit(X_for_current_xgb, y_for_current_xgb)

    # Persist each model under "xgb_<index>" for later loading.
    current_xgb.save_model("xgb_{}".format(i))
    print("XGB number {} created".format(i))

XGB number 0 created
XGB number 1 created
XGB number 2 created
XGB number 3 created
XGB number 4 created
XGB number 5 created
XGB number 6 created
XGB number 7 created
XGB number 8 created
XGB number 9 created
XGB number 10 created
XGB number 11 created
XGB number 12 created
XGB number 13 created
XGB number 14 created
XGB number 15 created
XGB number 16 created
XGB number 17 created
XGB number 18 created
XGB number 19 created
XGB number 20 created
XGB number 21 created
XGB number 22 created
XGB number 23 created
XGB number 24 created
XGB number 25 created
XGB number 26 created
XGB number 27 created
XGB number 28 created
XGB number 29 created
XGB number 30 created
XGB number 31 created
XGB number 32 created
XGB number 33 created
XGB number 34 created
XGB number 35 created
XGB number 36 created
XGB number 37 created
XGB number 38 created
XGB number 39 created
XGB number 40 created
XGB number 41 created
XGB number 42 created
XGB number 43 created
XGB number 44 created
XGB number 45 create

#### Se usa un subsample de igual tamaño que X_train

### Disclaimer: No se testea el armado de cada XGBoost porque se asume que funciona: es el mismo modelo, con los mismos hiperparámetros, que se usó anteriormente. Lo único que cambia es que cada uno se entrena sobre una muestra con reposición (bootstrap) de los datos originales.

# Se guarda el split para que el ensamble levante el mismo

In [12]:
# Save the training features (row index omitted) so the ensemble step can
# reload exactly this split.
X_train.to_csv("X_train", index = False)

In [13]:
# Save the held-out test features (row index omitted) for the ensemble step.
X_test.to_csv("X_test", index = False)

In [14]:
# Save the training target (row index omitted); rows stay in the same order
# as the saved training features.
y_train.to_csv("y_train", index = False)

In [15]:
# Save the held-out test target (row index omitted); rows stay in the same
# order as the saved test features.
y_test.to_csv("y_test", index = False)