In [1]:
import csv
import pandas as pd
import numpy as np
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.metrics import mean_squared_log_error
# Importamos utilidades y modelos de sklearn
from sklearn.preprocessing import Imputer
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold


# Se levanta el set de train generado

In [2]:
data_set = pd.read_csv("train_set_xgb.csv")
#X = data_set.loc[:, (data_set.columns != 'precio') & (data_set.columns != 'factor_pm')]
X = data_set.loc[:, (data_set.columns != 'precio')]
y = data_set.loc[:, ['precio']]

In [3]:
data_set.columns

Index(['antiguedad', 'habitaciones', 'garages', 'banos', 'metroscubiertos',
       'metrostotales', 'idzona', 'lat', 'lng', 'gimnasio', 'usosmultiples',
       'piscina', 'escuelascercanas', 'centroscomercialescercanos', 'precio',
       'anio_publ'],
      dtype='object')

In [4]:
X.columns

Index(['antiguedad', 'habitaciones', 'garages', 'banos', 'metroscubiertos',
       'metrostotales', 'idzona', 'lat', 'lng', 'gimnasio', 'usosmultiples',
       'piscina', 'escuelascercanas', 'centroscomercialescercanos',
       'anio_publ'],
      dtype='object')

In [5]:
y.columns

Index(['precio'], dtype='object')

# Se separa en train y en test

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=1)

In [7]:
len(X_train)

168000

In [8]:
len(X_test)

72000

# Se define el objeto Bagging

In [9]:
defined_xgb = xgb.XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=0.8, gamma=0.5,
       importance_type='gain', learning_rate=0.1, max_delta_step=0,
       max_depth=5, min_child_weight=10, missing=None, n_estimators=200,
       n_jobs=-1, nthread=None, objective='reg:squarederror', reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=None, subsample=1.0, verbosity=1)

In [11]:
bag = BaggingRegressor(base_estimator = defined_xgb, n_estimators = 100,
 max_samples = 0.8,
 max_features = 0.8,
 bootstrap = False, bootstrap_features = False, n_jobs = -1, random_state = 0, verbose = 2)

# Se entrena el modelo

In [12]:
bag.fit(X_train, y_train.values.ravel())

[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   3 out of  12 | elapsed:  5.7min remaining: 17.0min
[Parallel(n_jobs=12)]: Done  10 out of  12 | elapsed:  6.0min remaining:  1.2min
[Parallel(n_jobs=12)]: Done  12 out of  12 | elapsed:  6.0min finished


BaggingRegressor(base_estimator=XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=0.8, gamma=0.5,
       importance_type='gain', learning_rate=0.1, max_delta_step=0,
       max_depth=5, min_child_weight=10, missing=None, n_estimators=200,
       n_jobs=-1, nthread=None, objective='reg:squarederror',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=None, subsample=1.0, verbosity=1),
         bootstrap=False, bootstrap_features=False, max_features=0.8,
         max_samples=0.8, n_estimators=100, n_jobs=-1, oob_score=False,
         random_state=0, verbose=2, warm_start=False)

## Se hacen las dos predicciones necesarias

In [13]:
pred_train = bag.predict(X_train)
pred_test = bag.predict(X_test)

[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   3 out of  12 | elapsed:   16.9s remaining:   50.8s
[Parallel(n_jobs=12)]: Done  10 out of  12 | elapsed:   17.6s remaining:    3.4s
[Parallel(n_jobs=12)]: Done  12 out of  12 | elapsed:   17.9s finished
[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   3 out of  12 | elapsed:    6.9s remaining:   20.8s
[Parallel(n_jobs=12)]: Done  10 out of  12 | elapsed:    7.7s remaining:    1.5s
[Parallel(n_jobs=12)]: Done  12 out of  12 | elapsed:    7.8s finished


### Metrica de Mean Absolute Error (La de Kaggle) ?? (Hay que preguntar)

In [14]:
mae_train = np.sqrt(sklearn.metrics.mean_absolute_error(y_train, pred_train))
mae_test = np.sqrt(sklearn.metrics.mean_absolute_error(y_test, pred_test))
#print(f"MAE train: {mea_train:.5f}")
#print(f"MAE test: {mea_test:.5f}")
print("MAE train: "+str(mae_train))
print("MAE test: "+str(mae_test))

MAE train: 811.9785535723807
MAE test: 816.7335158133677


# Se genera el archivo para subir a Kaggle

In [15]:
test_set = pd.read_csv("test_set_xgb.csv")

In [16]:
test_set.columns

Index(['id', 'antiguedad', 'habitaciones', 'garages', 'banos',
       'metroscubiertos', 'metrostotales', 'idzona', 'lat', 'lng', 'gimnasio',
       'usosmultiples', 'piscina', 'escuelascercanas',
       'centroscomercialescercanos', 'anio_publ'],
      dtype='object')

In [17]:
X_test_set = test_set.loc[:, test_set.columns != 'id']

In [18]:
pred_a_kaggle = bag.predict(X_test_set)

[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   3 out of  12 | elapsed:    5.9s remaining:   18.0s
[Parallel(n_jobs=12)]: Done  10 out of  12 | elapsed:    6.3s remaining:    1.2s
[Parallel(n_jobs=12)]: Done  12 out of  12 | elapsed:    6.3s finished


In [19]:
df_a_kaggle = pd.DataFrame()
df_a_kaggle['id'] = test_set['id']
df_a_kaggle['target'] = pred_a_kaggle
df_a_kaggle.head()

Unnamed: 0,id,target
0,4941,5749591.0
1,51775,885256.625
2,115253,2462406.75
3,299321,1252463.375
4,173570,594197.125


In [20]:
df_a_kaggle.to_csv("subir_a_kaggle", header = True, index = False)