In [31]:
import csv
import pandas as pd
import numpy as np
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_log_error

In [32]:
# Importamos utilidades y modelos de sklearn
from sklearn.preprocessing import Imputer
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor

In [6]:
dtypes = {'id': 'int32', 
'titulo': 'object', 
'descripcion': 'object', 
'tipodepropiedad': 'category', 
'direccion': 'object', 
'ciudad': 'object', 
'provincia': 'category', 
'antiguedad': 'float16', 
'habitaciones': 'float16', 
'garages': 'float16', 
'banos': 'float16', 
'metroscubiertos': 'float16', 
'metrostotales': 'float16', 
'idzona': 'object', 
'lat': 'float64', 
'lng': 'float64', 
'gimnasio': 'float16', 
'usosmultiples': 'float16', 
'piscina': 'float16', 
'escuelascercanas': 'float16', 
'centroscomercialescercanos': 'float16', 
'precio': 'float32', }

data = pd.read_csv("train.csv", dtype = dtypes)
data.head()

Unnamed: 0,id,titulo,descripcion,tipodepropiedad,direccion,ciudad,provincia,antiguedad,habitaciones,garages,...,idzona,lat,lng,fecha,gimnasio,usosmultiples,piscina,escuelascercanas,centroscomercialescercanos,precio
0,254099,depto. tipo a-402,"depto. interior de 80.15m2, consta de sala com...",Apartamento,Avenida Division del Norte 2005,Benito Juárez,Distrito Federal,,2.0,1.0,...,23533.0,,,2015-08-23 00:00:00,0.0,0.0,0.0,0.0,0.0,2273000.0
1,53461,condominio horizontal en venta,"<p>entre sonora y guerrero, atr&aacute;s del h...",Casa en condominio,AV. MEXICO,La Magdalena Contreras,Distrito Federal,10.0,3.0,2.0,...,24514.0,19.310205,-99.227655,2013-06-28 00:00:00,0.0,0.0,0.0,1.0,1.0,3600000.0
2,247984,casa en venta urbi 3 recamaras tonala,descripcion \nla mejor ubicacion residencial e...,Casa,Urbi Tonala,Tonalá,Jalisco,5.0,3.0,2.0,...,48551.0,,,2015-10-17 00:00:00,0.0,0.0,0.0,0.0,0.0,1200000.0
3,209067,casa sola en toluca zinacantepec con credito i...,casa en privada con caseta de vigilancia casas...,Casa,IGNACIO MANUEL ALTAMIRANO 128,Zinacantepec,Edo. de México,1.0,2.0,1.0,...,53666.0,19.30189,-99.688015,2012-03-09 00:00:00,0.0,0.0,0.0,1.0,1.0,650000.0
4,185997,paseos del sol,bonito departamento en excelentes condiciones ...,Apartamento,PASEOS DEL SOL,Zapopan,Jalisco,10.0,2.0,1.0,...,47835.0,,,2016-06-07 00:00:00,0.0,0.0,0.0,0.0,0.0,1150000.0


In [7]:
data['fecha'] = pd.to_datetime(data['fecha'])

El problema es de tipo regresión y poseemos datos estructurados así que vamos a probar algoritmos basados en árboles (Random Forests y XGBoost).

# Preparacion del set de entrenamiento y test

Me quedo solamente con las columnas que son numericas...

In [8]:
data.dtypes

id                                     int32
titulo                                object
descripcion                           object
tipodepropiedad                     category
direccion                             object
ciudad                                object
provincia                           category
antiguedad                           float16
habitaciones                         float16
garages                              float16
banos                                float16
metroscubiertos                      float16
metrostotales                        float16
idzona                                object
lat                                  float64
lng                                  float64
fecha                         datetime64[ns]
gimnasio                             float16
usosmultiples                        float16
piscina                              float16
escuelascercanas                     float16
centroscomercialescercanos           float16
precio    

In [9]:
data_num = data.loc[:,["id", "antiguedad", "habitaciones", "garages", 
                       "banos", "metroscubiertos", "lat", "lng", 
                       "usosmultiples", "piscina", "escuelascercanas",
                      "centroscomercialescercanos", "precio"]]

Se eliminan nulls de forma ciega... (Habria que verificar que esta cantidad no es significativa)

In [11]:
data_num = data_num.dropna()

Se separa en train y test

In [14]:
X = data_num.loc[:,["id", "antiguedad", "habitaciones", "garages", 
                       "banos", "metroscubiertos", "lat", "lng", 
                       "usosmultiples", "piscina", "escuelascercanas",
                      "centroscomercialescercanos"]]
y  = data_num.loc[:, ['precio']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)

Se tiene X_train, X_test, y_train, y_test. Donde X_* son los features y y_* son los labes.

In [18]:
y_train.head()

Unnamed: 0,precio
2208,9000000.0
17980,2500000.0
10506,1300000.0
135476,711550.0
95875,899000.0


# Metrica de evaluacion

Se utiliza la metrica propuesta por metaDataNavent:

In [15]:
# Métrica de evaluación
def RMSLE(actual, pred):
    return (np.mean((np.log(actual + 1) - np.log(pred + 1)) ** 2)) **.5

# Random Forests

Se genera un Random Forests de 20 estimadores y con todo en default y se estima:

In [34]:
model_RF = RandomForestRegressor(n_estimators = 20, max_features = "auto")
model_RF.fit(X_train, y_train.values.ravel())

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

Se calcula la prediccion:

In [35]:
pred_train = model_RF.predict(X_train)

In [36]:
pred = model_RF.predict(X_test)

In [37]:
linear_rmsle_train = np.sqrt(sklearn.metrics.mean_squared_log_error(y_train, pred_train))
linear_rmsle = np.sqrt(sklearn.metrics.mean_squared_log_error(y_test, pred))
print(f"RMSLE LinearRegression (train): {linear_rmsle_train:.5f}")
print(f"RMSLE LinearRegression: {linear_rmsle:.5f}")

RMSLE LinearRegression (train): 0.15358
RMSLE LinearRegression: 0.34709


A mas pequeño mejor