In [15]:
import csv
import pandas as pd
import numpy as np
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_log_error
# Importamos utilidades y modelos de sklearn
from sklearn.preprocessing import Imputer
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold


# Se levanta el set de train generado

In [16]:
data_set = pd.read_csv("train_set_xgb.csv")
#X = data_set.loc[:, (data_set.columns != 'precio') & (data_set.columns != 'factor_pm')]
X = data_set.loc[:, (data_set.columns != 'precio')]
y = data_set.loc[:, ['precio']]

In [17]:
data_set.columns

Index(['antiguedad', 'habitaciones', 'garages', 'banos', 'metroscubiertos',
       'metrostotales', 'idzona', 'lat', 'lng', 'gimnasio', 'usosmultiples',
       'piscina', 'escuelascercanas', 'centroscomercialescercanos', 'precio',
       'anio_publ'],
      dtype='object')

In [18]:
X.columns

Index(['antiguedad', 'habitaciones', 'garages', 'banos', 'metroscubiertos',
       'metrostotales', 'idzona', 'lat', 'lng', 'gimnasio', 'usosmultiples',
       'piscina', 'escuelascercanas', 'centroscomercialescercanos',
       'anio_publ'],
      dtype='object')

In [19]:
y.columns

Index(['precio'], dtype='object')

# Se convierten los datos a DMatrix 

In [20]:
data_DMatrix = xgb.DMatrix(data = X, label = y)

# Se separa en train y en test

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.40, random_state=1)

In [22]:
len(X_train)

144000

In [23]:
len(X_test)

96000

# Se define el modelo

In [24]:
xg_reg = xgb.XGBRegressor(n_jobs = -1, objective ='reg:squarederror', n_estimators = 200, random_state = 0)

# grid_searchCV. Para busqueda de hiperparametros (tarda MUCHO)

Por ahora esta todo comentado para no usarse cada vez que se corre

In [25]:
params = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5],
        'learning_rate': [1, 0.1, 0.01]
        }

grid = GridSearchCV(xg_reg, params)

grid.fit(X_train, y_train)




GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0,
       importance_type='gain', learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=200,
       n_jobs=-1, nthread=None, objective='reg:squarederror',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=None, subsample=1, verbosity=1),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'min_child_weight': [1, 5, 10], 'gamma': [0.5, 1, 1.5, 2, 5], 'subsample': [0.6, 0.8, 1.0], 'colsample_bytree': [0.6, 0.8, 1.0], 'max_depth': [3, 4, 5], 'learning_rate': [1, 0.1, 0.01]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [27]:
grid.best_estimator_

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=0.8, gamma=0.5,
       importance_type='gain', learning_rate=0.1, max_delta_step=0,
       max_depth=5, min_child_weight=10, missing=None, n_estimators=200,
       n_jobs=-1, nthread=None, objective='reg:squarederror',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=None, subsample=1.0, verbosity=1)

In [28]:
grid.best_params_

{'colsample_bytree': 0.8,
 'gamma': 0.5,
 'learning_rate': 0.1,
 'max_depth': 5,
 'min_child_weight': 10,
 'subsample': 1.0}

# Se calcula la prediccion del set de test y del set de entrenamiento tambien

In [29]:
pred_train = grid.best_estimator_.predict(X_train)

In [30]:
pred_test = grid.best_estimator_.predict(X_test)

# Se evaluan ambas predicciones

In [40]:
mae_train = np.sqrt(sklearn.metrics.mean_absolute_error(y_train, pred_train))
mae_test = np.sqrt(sklearn.metrics.mean_absolute_error(y_test, pred_test))
#print(f"MAE train: {mea_train:.5f}")
#print(f"MAE test: {mea_test:.5f}")
print("MAE train: "+str(mae_train))
print("MAE test: "+str(mae_test))

MAE train: 801.5843254067258
MAE test: 811.8768418644772


# Importancia de Features

In [None]:
plt.figure(figsize = (50,50))
xgb.plot_importance(random_search.best_estimator_.)
plt.rcParams['figure.figsize'] = [10, 10]
plt.show()

# Se genera el archivo con las predicciones a subir a Kaggle

In [32]:
test_set = pd.read_csv("test_set_xgb.csv")

In [33]:
test_set.columns

Index(['id', 'antiguedad', 'habitaciones', 'garages', 'banos',
       'metroscubiertos', 'metrostotales', 'idzona', 'lat', 'lng', 'gimnasio',
       'usosmultiples', 'piscina', 'escuelascercanas',
       'centroscomercialescercanos', 'anio_publ'],
      dtype='object')

Predigo sin el 'id' porque no es parte de los features

In [35]:
pred_a_kaggle = grid.best_estimator_.predict(test_set.loc[:, test_set.columns != 'id'])

In [36]:
pred_a_kaggle

array([5925591.5,  694310. , 2541561.2, ..., 1212985.4, 1867197.1,
       2327913.8], dtype=float32)

In [37]:
df_a_kaggle = pd.DataFrame()
df_a_kaggle['id'] = test_set['id']
df_a_kaggle['target'] = pred_a_kaggle
df_a_kaggle.head()

Unnamed: 0,id,target
0,4941,5925591.5
1,51775,694310.0
2,115253,2541561.25
3,299321,1272962.5
4,173570,596269.75


In [38]:
len(df_a_kaggle)

60000

In [39]:
df_a_kaggle.to_csv("subir_a_kaggle_xgb_individual", header = True, index = False)