In [1]:
import csv
import pandas as pd
import numpy as np
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_log_error
# Importamos utilidades y modelos de sklearn
from sklearn.preprocessing import Imputer
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold


# Se levanta el set de train generado

In [2]:
# Load the generated training set and separate the 'precio' target from the features.
# NOTE(review): an earlier variant of this cell also excluded 'factor_pm' from the
# features; confirm that keeping it as a feature is intended.
data_set = pd.read_csv("train_set_xgb.csv")
X = data_set.drop(columns=['precio'])
y = data_set[['precio']]  # double brackets keep the target as a one-column DataFrame

In [3]:
# Display the column labels of the loaded training set (features plus 'precio').
data_set.columns

Index(['antiguedad', 'habitaciones', 'garages', 'banos', 'metroscubiertos',
       'metrostotales', 'idzona', 'lat', 'lng', 'gimnasio', 'usosmultiples',
       'piscina', 'escuelascercanas', 'centroscomercialescercanos', 'precio',
       'tipodepropiedad_1', 'tipodepropiedad_2', 'tipodepropiedad_3',
       'tipodepropiedad_4', 'tipodepropiedad_5', 'tipodepropiedad_6',
       'tipodepropiedad_7', 'tipodepropiedad_8', 'tipodepropiedad_9',
       'tipodepropiedad_10', 'tipodepropiedad_11', 'tipodepropiedad_12',
       'tipodepropiedad_13', 'tipodepropiedad_14', 'tipodepropiedad_15',
       'tipodepropiedad_16', 'tipodepropiedad_17', 'tipodepropiedad_18',
       'tipodepropiedad_19', 'tipodepropiedad_20', 'tipodepropiedad_21',
       'tipodepropiedad_22', 'tipodepropiedad_23', 'tipodepropiedad_24',
       'tipodepropiedad_25', 'provincia_1', 'provincia_2', 'provincia_3',
       'provincia_4', 'provincia_5', 'provincia_6', 'provincia_7',
       'provincia_8', 'provincia_9', 'provincia_10', 

In [4]:
# Display the feature column labels (every column of data_set except 'precio').
X.columns

Index(['antiguedad', 'habitaciones', 'garages', 'banos', 'metroscubiertos',
       'metrostotales', 'idzona', 'lat', 'lng', 'gimnasio', 'usosmultiples',
       'piscina', 'escuelascercanas', 'centroscomercialescercanos',
       'tipodepropiedad_1', 'tipodepropiedad_2', 'tipodepropiedad_3',
       'tipodepropiedad_4', 'tipodepropiedad_5', 'tipodepropiedad_6',
       'tipodepropiedad_7', 'tipodepropiedad_8', 'tipodepropiedad_9',
       'tipodepropiedad_10', 'tipodepropiedad_11', 'tipodepropiedad_12',
       'tipodepropiedad_13', 'tipodepropiedad_14', 'tipodepropiedad_15',
       'tipodepropiedad_16', 'tipodepropiedad_17', 'tipodepropiedad_18',
       'tipodepropiedad_19', 'tipodepropiedad_20', 'tipodepropiedad_21',
       'tipodepropiedad_22', 'tipodepropiedad_23', 'tipodepropiedad_24',
       'tipodepropiedad_25', 'provincia_1', 'provincia_2', 'provincia_3',
       'provincia_4', 'provincia_5', 'provincia_6', 'provincia_7',
       'provincia_8', 'provincia_9', 'provincia_10', 'provincia

In [5]:
# Display the target column labels; only 'precio' is expected here.
y.columns

Index(['precio'], dtype='object')

# Se separa en train y en test

In [6]:
# Hold out 30% of the rows as a test set; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=1,
)

In [7]:
# Number of rows in the training split.
len(X_train)

168000

In [8]:
# Number of rows in the held-out test split.
len(X_test)

72000

## Armamos el dataset sobre el cual cada XGBoost va a subsamplear con reposición.

### Esto se hace sobre el X_train con su respectivo label.

In [9]:
# Put the training features and their 'precio' label back into a single frame so that
# a row-level resample keeps every feature row paired with its own label.
data_for_xgb = pd.concat(
    (X_train, y_train),
    axis="columns",
)
data_for_xgb.head()

Unnamed: 0,antiguedad,habitaciones,garages,banos,metroscubiertos,metrostotales,idzona,lat,lng,gimnasio,...,anio_publ,mes_publ,cuenta,planta,cuarto,tiene,baño,departamento,excelente,precio
239101,0.0,3.0,2.0,3.0,196.0,160.0,,,,0.0,...,2014,8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1800000.0
96221,20.0,4.0,2.0,2.0,245.0,196.0,104734.0,,,0.0,...,2012,8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1500000.0
144453,5.0,3.0,1.0,2.0,108.0,120.0,58952.0,,,0.0,...,2015,7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,850000.0
15620,10.0,3.0,1.0,3.0,200.0,147.0,49066.0,,,0.0,...,2016,10,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1550000.0
73157,,,0.0,,,250.0,24892.0,19.23339,-99.166992,0.0,...,2016,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,700000.0


# Se entrenan n XGB

Se define la cantidad de xgb a entrenar

In [10]:
# Number of XGBoost models to train; the later loops iterate over range(n).
n = 100

Se utilizan los valores encontrados en el Grid Search

In [11]:
# Holds the fitted XGBoost models, in training order (index i <-> model i).
xgb_created = []

In [12]:
# Train `n` XGBoost regressors, each on its own bootstrap resample of data_for_xgb.
#
# The list is reset inside this cell so the cell is idempotent. Without the reset,
# re-running it (for example after an interrupted run) appends a second batch of
# models, and xgb_created[i] no longer holds the model that was trained with seed i.
xgb_created = []

# Hyperparameters shared by every model (the notebook text states they come from the
# grid search). They do not depend on the iteration, so they are built once.
shared_xgb_params = dict(
    objective='reg:squarederror',
    colsample_bytree=0.8,
    gamma=0.5,
    learning_rate=0.1,
    max_depth=5,
    min_child_weight=10,
    subsample=1.0,
    n_estimators=2400,
    n_jobs=-1,
)

for i in range(n):
    # Bootstrap sample: as many rows as data_for_xgb, drawn with replacement.
    # Seeding with i makes every resample (and model) reproducible.
    d_for_current_xgb = data_for_xgb.sample(frac=1, replace=True, random_state=i)
    X_for_current_xgb = d_for_current_xgb.drop(columns=['precio'])
    y_for_current_xgb = d_for_current_xgb[['precio']]

    current_xgb = xgb.XGBRegressor(random_state=i, **shared_xgb_params)
    current_xgb.fit(X_for_current_xgb, y_for_current_xgb)

    # Save each model right after fitting so finished models survive a later failure.
    current_xgb.save_model("xgb_{}".format(i))
    xgb_created.append(current_xgb)
    print("XGB number {} created".format(i))

XGB number 0 created
XGB number 1 created
XGB number 2 created
XGB number 3 created
XGB number 4 created
XGB number 5 created
XGB number 6 created
XGB number 7 created
XGB number 8 created
XGB number 9 created
XGB number 10 created
XGB number 11 created
XGB number 12 created
XGB number 13 created
XGB number 14 created
XGB number 15 created
XGB number 16 created
XGB number 17 created
XGB number 18 created
XGB number 19 created
XGB number 20 created
XGB number 21 created
XGB number 22 created
XGB number 23 created
XGB number 24 created
XGB number 25 created
XGB number 26 created
XGB number 27 created
XGB number 28 created
XGB number 29 created
XGB number 30 created
XGB number 31 created
XGB number 32 created
XGB number 33 created
XGB number 34 created
XGB number 35 created
XGB number 36 created
XGB number 37 created
XGB number 38 created
XGB number 39 created
XGB number 40 created
XGB number 41 created
XGB number 42 created
XGB number 43 created
XGB number 44 created
XGB number 45 create

#### Se usa un subsample de igual tamaño que X_train

### Disclaimer: No se testea el armado de cada XGBoost porque asumo que funciona debido a que es el mismo modelo con los mismos hiperparametros que se usó anteriormente y lo único que cambia es que se está entrenando sobre un subsample de los datos originales.

# Se arma el DF de precios para entrenar el RF

In [13]:
# Start an empty frame that carries a copy of X_train's row index; the per-model
# prediction columns are added to it afterwards.
RF_train = pd.DataFrame(index = X_train.index.copy())

In [14]:
# Build the random-forest training features: column 'pred_<k>' holds model k's
# predicted price for every row of X_train.
# NOTE(review): these are in-sample predictions (each model was fit on a resample of
# this same training data), so they may be more accurate than what the models produce
# on unseen rows - confirm this is acceptable for the downstream model.
for model_idx in range(n):
    column_name = 'pred_{}'.format(model_idx)
    RF_train[column_name] = xgb_created[model_idx].predict(X_train)
    print("xgb_{} finished pred".format(model_idx))
RF_train.head()

xgb_0 finished pred
xgb_1 finished pred
xgb_2 finished pred
xgb_3 finished pred
xgb_4 finished pred
xgb_5 finished pred
xgb_6 finished pred
xgb_7 finished pred
xgb_8 finished pred
xgb_9 finished pred
xgb_10 finished pred
xgb_11 finished pred
xgb_12 finished pred
xgb_13 finished pred
xgb_14 finished pred
xgb_15 finished pred
xgb_16 finished pred
xgb_17 finished pred
xgb_18 finished pred
xgb_19 finished pred
xgb_20 finished pred
xgb_21 finished pred
xgb_22 finished pred
xgb_23 finished pred
xgb_24 finished pred
xgb_25 finished pred
xgb_26 finished pred
xgb_27 finished pred
xgb_28 finished pred
xgb_29 finished pred
xgb_30 finished pred
xgb_31 finished pred
xgb_32 finished pred
xgb_33 finished pred
xgb_34 finished pred
xgb_35 finished pred
xgb_36 finished pred
xgb_37 finished pred
xgb_38 finished pred
xgb_39 finished pred
xgb_40 finished pred
xgb_41 finished pred
xgb_42 finished pred
xgb_43 finished pred
xgb_44 finished pred
xgb_45 finished pred
xgb_46 finished pred
xgb_47 finished pred
xg

Unnamed: 0,pred_0,pred_1,pred_2,pred_3,pred_4,pred_5,pred_6,pred_7,pred_8,pred_9,...,pred_90,pred_91,pred_92,pred_93,pred_94,pred_95,pred_96,pred_97,pred_98,pred_99
239101,1689986.0,1766534.0,1735717.0,1715772.0,1662306.0,1766623.0,1804056.0,1617844.125,1637010.0,1738870.0,...,1821229.25,1684417.0,1665304.0,1676015.0,1838827.0,1794825.625,1827841.75,1842697.375,1644829.0,1688090.0
96221,1749084.0,1760283.125,1743024.0,1864231.875,1783093.0,1743951.0,1847932.0,1731897.25,1732194.0,1771472.0,...,1652129.75,1862086.0,1778841.0,1852972.625,1562068.375,1770447.375,1751814.625,1864356.0,1671443.0,1761365.0
144453,949523.3,936526.25,875078.9,848156.75,926120.4,923559.9,834851.7,775759.75,907512.4,968468.25,...,884079.875,846384.7,815470.7,886198.25,912867.75,974787.75,866842.625,920774.5,719732.3,865822.0
15620,1907250.0,2018018.375,2129554.0,2069715.0,1899689.0,2037284.0,1928701.0,1980490.875,1874637.0,2000020.375,...,1974532.25,1885160.0,1860198.0,1839548.125,2049893.25,2074825.875,2055814.0,2042658.875,2016976.0,1990134.0
73157,1250160.0,1706558.125,676788.7,1090993.375,1500844.0,915099.8,1703472.0,1115108.625,1221259.0,2409765.75,...,1905811.25,1132271.0,1800622.0,1839616.625,1771405.125,1746415.875,845541.5,897758.25,1912185.0,842761.9


In [15]:
len(RF_train)

168000

In [16]:
len(y_train)

168000

## Exporto el RF_train y los labels (y_train, y_test)

In [17]:
# Write the per-model training predictions to the file "RF_train" as CSV.
# index=False omits the row index, so rows can only be matched to labels by position.
RF_train.to_csv("RF_train", index = False)

In [9]:
# Write the training labels to the file "y_train" as CSV, without the row index.
y_train.to_csv("y_train", index = False)

In [10]:
# Write the held-out labels to the file "y_test" as CSV, without the row index.
y_test.to_csv("y_test", index = False)

## Armo el DF para RF a partir de X_test

In [18]:
# Start an empty frame that carries a copy of X_test's row index; the per-model
# prediction columns are added to it afterwards.
RF_test = pd.DataFrame(index = X_test.index.copy())


In [19]:
# Build the random-forest test features: column 'pred_<k>' holds model k's predicted
# price for every held-out row of X_test.
for model_idx in range(n):
    test_predictions = xgb_created[model_idx].predict(X_test)
    RF_test['pred_{}'.format(model_idx)] = test_predictions
RF_test.head()

Unnamed: 0,pred_0,pred_1,pred_2,pred_3,pred_4,pred_5,pred_6,pred_7,pred_8,pred_9,...,pred_90,pred_91,pred_92,pred_93,pred_94,pred_95,pred_96,pred_97,pred_98,pred_99
132827,1659478.0,1471060.0,1455868.0,1862524.0,1625050.75,1505616.0,1762531.0,1662114.0,1517567.25,1840291.0,...,1619646.0,1637417.0,1688696.0,1692034.0,1576439.0,1560470.0,1457542.0,1384460.25,2058913.0,1528536.0
210008,2852040.0,2427407.0,2534238.0,2790467.0,2607273.5,2561150.0,2428484.0,2616929.0,2462442.25,2694016.0,...,2424282.0,2593619.0,2473534.0,2548829.0,2225817.0,2976241.0,2532902.0,2539388.75,2578002.0,2440599.0
189056,2181965.0,2135074.0,1959478.0,2361723.0,2639389.0,2744442.0,2361738.0,2178900.0,2361966.0,2730512.0,...,2203554.0,1979871.0,1857047.0,2089209.0,2828616.0,2180816.0,2402476.0,1661171.5,2629078.0,2707134.0
131466,1195996.0,1269788.0,1232923.0,1250264.0,1192167.25,1328625.0,1333840.0,1120925.0,1624995.375,1462925.0,...,1291122.0,1256644.0,1389990.0,1281326.0,1554499.0,1304531.0,1271230.0,2119825.0,1201555.0,1222957.0
207302,118161.0,344474.2,231211.8,311381.2,268830.0,204046.9,205472.3,205138.6,358404.5,192358.2,...,423946.6,248348.4,206364.9,258496.4,413319.0,232620.5,340319.8,376704.375,294726.8,359990.4


# Se exporta RF_test

In [20]:
# Write the per-model test predictions to the file "RF_test" as CSV.
# index=False omits the row index, so rows can only be matched to labels by position.
RF_test.to_csv("RF_test", index = False)