In [280]:
import csv
import pandas as pd
import numpy as np
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_log_error
# Importamos utilidades y modelos de sklearn
from sklearn.preprocessing import Imputer
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
import category_encoders as ce

In [281]:
dtypes = {'id': 'int32', 
'titulo': 'object', 
'descripcion': 'object', 
'tipodepropiedad': 'category', 
'direccion': 'object', 
'ciudad': 'object', 
'provincia': 'category', 
'antiguedad': 'float', 
'habitaciones': 'float', 
'garages': 'float', 
'banos': 'float', 
'metroscubiertos': 'float', 
'metrostotales': 'float', 
'idzona': 'object', 
'lat': 'float64', 
'lng': 'float64', 
'gimnasio': 'float', 
'usosmultiples': 'float', 
'piscina': 'float', 
'escuelascercanas': 'float', 
'centroscomercialescercanos': 'float', 
'precio': 'float', }

data = pd.read_csv("train.csv", dtype = dtypes)
data.head()

Unnamed: 0,id,titulo,descripcion,tipodepropiedad,direccion,ciudad,provincia,antiguedad,habitaciones,garages,...,idzona,lat,lng,fecha,gimnasio,usosmultiples,piscina,escuelascercanas,centroscomercialescercanos,precio
0,254099,depto. tipo a-402,"depto. interior de 80.15m2, consta de sala com...",Apartamento,Avenida Division del Norte 2005,Benito Juárez,Distrito Federal,,2.0,1.0,...,23533.0,,,2015-08-23 00:00:00,0.0,0.0,0.0,0.0,0.0,2273000.0
1,53461,condominio horizontal en venta,"<p>entre sonora y guerrero, atr&aacute;s del h...",Casa en condominio,AV. MEXICO,La Magdalena Contreras,Distrito Federal,10.0,3.0,2.0,...,24514.0,19.310205,-99.227655,2013-06-28 00:00:00,0.0,0.0,0.0,1.0,1.0,3600000.0
2,247984,casa en venta urbi 3 recamaras tonala,descripcion \nla mejor ubicacion residencial e...,Casa,Urbi Tonala,Tonalá,Jalisco,5.0,3.0,2.0,...,48551.0,,,2015-10-17 00:00:00,0.0,0.0,0.0,0.0,0.0,1200000.0
3,209067,casa sola en toluca zinacantepec con credito i...,casa en privada con caseta de vigilancia casas...,Casa,IGNACIO MANUEL ALTAMIRANO 128,Zinacantepec,Edo. de México,1.0,2.0,1.0,...,53666.0,19.30189,-99.688015,2012-03-09 00:00:00,0.0,0.0,0.0,1.0,1.0,650000.0
4,185997,paseos del sol,bonito departamento en excelentes condiciones ...,Apartamento,PASEOS DEL SOL,Zapopan,Jalisco,10.0,2.0,1.0,...,47835.0,,,2016-06-07 00:00:00,0.0,0.0,0.0,0.0,0.0,1150000.0


# Pre-procesamiento de data

In [282]:
data["ciudad"].nunique()

875

In [283]:
data.isnull().sum()

id                                 0
titulo                          5387
descripcion                     1619
tipodepropiedad                   46
direccion                      53072
ciudad                           372
provincia                        155
antiguedad                     43555
habitaciones                   22471
garages                        37765
banos                          26221
metroscubiertos                17400
metrostotales                  51467
idzona                         28621
lat                           123488
lng                           123488
fecha                              0
gimnasio                           0
usosmultiples                      0
piscina                            0
escuelascercanas                   0
centroscomercialescercanos         0
precio                             0
dtype: int64

In [284]:
data.isnull().sum().sum()

535127

In [285]:
data.size

5520000

In [286]:
(data.isnull().sum().sum()/(data.size))*100

9.694329710144928

Los Nulls representan el 10% de los datos.

# CON XGBOOST NO HACE FALTA PREOCUPARSE POR LOS NULLs

# Levanto el csv de test para calcularle los features en paralelo

In [287]:
test = pd.read_csv("test.csv", dtype = dtypes)
test.head()

Unnamed: 0,id,titulo,descripcion,tipodepropiedad,direccion,ciudad,provincia,antiguedad,habitaciones,garages,...,metrostotales,idzona,lat,lng,fecha,gimnasio,usosmultiples,piscina,escuelascercanas,centroscomercialescercanos
0,4941,"casa en venta en miguel hidalgo, distrito federal",<p>excelente casa estilo moderno.</p>,Casa,Bosque de Cedros,Miguel Hidalgo,Distrito Federal,29.0,3.0,,...,,,19.408668,-99.246767,2013-07-20 00:00:00,0.0,0.0,0.0,0.0,0.0
1,51775,departamentos en venta en montebello,<p>departamento una recamara:\n</p><p>departam...,Apartamento,,Mérida,Yucatán,,1.0,1.0,...,67.0,113851.0,21.03248,-89.592424,2015-10-24 00:00:00,0.0,0.0,0.0,0.0,0.0
2,115253,departamento nuevo delegación coyoacán de 87 m...,"departamento nuevo de 87.06 m2, 1 cajón de est...",Apartamento,"Pueblo de los Reyes, Coyoacán, Mexico D.F.",Coyoacán,Distrito Federal,0.0,2.0,1.0,...,100.0,23620.0,19.332829,-99.152913,2015-05-30 00:00:00,0.0,0.0,0.0,0.0,1.0
3,299321,departamento en venta en acapulco,<p> raíces dv001 precioso departamento tipo k...,Apartamento,,Acapulco de Juárez,Guerrero,2.0,2.0,2.0,...,86.0,129347.0,16.860487,-99.878383,2015-04-02 00:00:00,0.0,0.0,0.0,0.0,0.0
4,173570,bonita casa sola equipada de dos niveles en lo...,"<p>casa sola, bonita de dos rec&aacute;maras u...",Casa,CEDROS,Tultitlán,Edo. de México,10.0,2.0,1.0,...,76.0,57125.0,19.640482,-99.127273,2013-08-15 00:00:00,0.0,0.0,0.0,1.0,1.0


In [288]:
len(test)

60000

# Preparacion del set de datos de entrenamiento (features)

La idea es preparar el set con los datos para exportar y que despues el modelo simplemente levante, separe en set de entrenamiento y test, entrene y devuelva una prediccion.

Se procede a calcular features. Cada feature se agregara al DataFrame final que tendra que levantar despues el modelo. Ojo que dentro de este DataFrame tambien va a estar el precio (que es el label).

In [289]:
train_set = pd.DataFrame()

In [290]:
test_set = pd.DataFrame()

Primero de todo, se agrega el id y el precio asocidado. A partir de esto, se agregaran los features que se consideren necesarios.

In [291]:
train_set["id"] = data["id"]
train_set["precio"] = data["precio"]

In [292]:
train_set.head()

Unnamed: 0,id,precio
0,254099,2273000.0
1,53461,3600000.0
2,247984,1200000.0
3,209067,650000.0
4,185997,1150000.0


A partir de esto se generan features.

Igual pero para test_set. Solo que sin el precio.

In [293]:
test_set["id"] = test["id"]

In [294]:
test_set.head()

Unnamed: 0,id
0,4941
1,51775
2,115253
3,299321
4,173570


In [295]:
len(test_set)

60000

# Se codifican las variables categoricas

In [296]:
data['train'] = True
test['train'] = False
combined = pd.concat([data, test], sort = True)

In [297]:
train_set['train'] = True
test_set["train"] = False
combined_set = pd.concat([train_set, test_set], sort = False)

In [298]:
len(combined_set)

300000

In [299]:
combined['tipodepropiedad'].nunique()

24

In [300]:
combined['ciudad'].nunique()

921

Ciudad tiene demasiados valores posibles para OneHotEncoding

In [301]:
combined['provincia'].nunique()

32

In [302]:
combined_set['tipodepropiedad'] = combined['tipodepropiedad']
combined_set['provincia'] = combined['provincia']

In [303]:
len(combined_set)

300000

In [304]:
var_categoricas = ['tipodepropiedad', 'provincia']

In [305]:
one_hot_enc = ce.OneHotEncoder(handle_unknown = 'ignore')
one_hot_encoded = one_hot_enc.fit_transform(combined_set[var_categoricas])
one_hot_encoded.columns

Index(['tipodepropiedad_1', 'tipodepropiedad_2', 'tipodepropiedad_3',
       'tipodepropiedad_4', 'tipodepropiedad_5', 'tipodepropiedad_6',
       'tipodepropiedad_7', 'tipodepropiedad_8', 'tipodepropiedad_9',
       'tipodepropiedad_10', 'tipodepropiedad_11', 'tipodepropiedad_12',
       'tipodepropiedad_13', 'tipodepropiedad_14', 'tipodepropiedad_15',
       'tipodepropiedad_16', 'tipodepropiedad_17', 'tipodepropiedad_18',
       'tipodepropiedad_19', 'tipodepropiedad_20', 'tipodepropiedad_21',
       'tipodepropiedad_22', 'tipodepropiedad_23', 'tipodepropiedad_24',
       'tipodepropiedad_25', 'provincia_1', 'provincia_2', 'provincia_3',
       'provincia_4', 'provincia_5', 'provincia_6', 'provincia_7',
       'provincia_8', 'provincia_9', 'provincia_10', 'provincia_11',
       'provincia_12', 'provincia_13', 'provincia_14', 'provincia_15',
       'provincia_16', 'provincia_17', 'provincia_18', 'provincia_19',
       'provincia_20', 'provincia_21', 'provincia_22', 'provincia_23',
  

In [306]:
combined_set = combined_set.join(one_hot_encoded)
combined_set.columns

Index(['id', 'precio', 'train', 'tipodepropiedad', 'provincia',
       'tipodepropiedad_1', 'tipodepropiedad_2', 'tipodepropiedad_3',
       'tipodepropiedad_4', 'tipodepropiedad_5', 'tipodepropiedad_6',
       'tipodepropiedad_7', 'tipodepropiedad_8', 'tipodepropiedad_9',
       'tipodepropiedad_10', 'tipodepropiedad_11', 'tipodepropiedad_12',
       'tipodepropiedad_13', 'tipodepropiedad_14', 'tipodepropiedad_15',
       'tipodepropiedad_16', 'tipodepropiedad_17', 'tipodepropiedad_18',
       'tipodepropiedad_19', 'tipodepropiedad_20', 'tipodepropiedad_21',
       'tipodepropiedad_22', 'tipodepropiedad_23', 'tipodepropiedad_24',
       'tipodepropiedad_25', 'provincia_1', 'provincia_2', 'provincia_3',
       'provincia_4', 'provincia_5', 'provincia_6', 'provincia_7',
       'provincia_8', 'provincia_9', 'provincia_10', 'provincia_11',
       'provincia_12', 'provincia_13', 'provincia_14', 'provincia_15',
       'provincia_16', 'provincia_17', 'provincia_18', 'provincia_19',
       'p

In [307]:
len(combined_set)

420000

In [308]:
len(train_set)

240000

In [309]:
len(test_set)

60000

In [310]:
train_set = combined_set[combined_set["train"] == True]
test_set = combined_set[combined_set["train"] == False]

In [311]:
len(train_set)

300000

In [312]:
len(test_set)

120000

In [313]:
train_set = train_set.drop(columns = ['tipodepropiedad', 'provincia' , 'train'])

In [314]:
train_set.columns

Index(['id', 'precio', 'tipodepropiedad_1', 'tipodepropiedad_2',
       'tipodepropiedad_3', 'tipodepropiedad_4', 'tipodepropiedad_5',
       'tipodepropiedad_6', 'tipodepropiedad_7', 'tipodepropiedad_8',
       'tipodepropiedad_9', 'tipodepropiedad_10', 'tipodepropiedad_11',
       'tipodepropiedad_12', 'tipodepropiedad_13', 'tipodepropiedad_14',
       'tipodepropiedad_15', 'tipodepropiedad_16', 'tipodepropiedad_17',
       'tipodepropiedad_18', 'tipodepropiedad_19', 'tipodepropiedad_20',
       'tipodepropiedad_21', 'tipodepropiedad_22', 'tipodepropiedad_23',
       'tipodepropiedad_24', 'tipodepropiedad_25', 'provincia_1',
       'provincia_2', 'provincia_3', 'provincia_4', 'provincia_5',
       'provincia_6', 'provincia_7', 'provincia_8', 'provincia_9',
       'provincia_10', 'provincia_11', 'provincia_12', 'provincia_13',
       'provincia_14', 'provincia_15', 'provincia_16', 'provincia_17',
       'provincia_18', 'provincia_19', 'provincia_20', 'provincia_21',
       'provincia_

In [315]:
test_set = test_set.drop(columns = ['tipodepropiedad', 'precio' , 'provincia', 'train'])

In [316]:
test_set.columns

Index(['id', 'tipodepropiedad_1', 'tipodepropiedad_2', 'tipodepropiedad_3',
       'tipodepropiedad_4', 'tipodepropiedad_5', 'tipodepropiedad_6',
       'tipodepropiedad_7', 'tipodepropiedad_8', 'tipodepropiedad_9',
       'tipodepropiedad_10', 'tipodepropiedad_11', 'tipodepropiedad_12',
       'tipodepropiedad_13', 'tipodepropiedad_14', 'tipodepropiedad_15',
       'tipodepropiedad_16', 'tipodepropiedad_17', 'tipodepropiedad_18',
       'tipodepropiedad_19', 'tipodepropiedad_20', 'tipodepropiedad_21',
       'tipodepropiedad_22', 'tipodepropiedad_23', 'tipodepropiedad_24',
       'tipodepropiedad_25', 'provincia_1', 'provincia_2', 'provincia_3',
       'provincia_4', 'provincia_5', 'provincia_6', 'provincia_7',
       'provincia_8', 'provincia_9', 'provincia_10', 'provincia_11',
       'provincia_12', 'provincia_13', 'provincia_14', 'provincia_15',
       'provincia_16', 'provincia_17', 'provincia_18', 'provincia_19',
       'provincia_20', 'provincia_21', 'provincia_22', 'provincia_2

In [317]:
len(test)

60000

In [318]:
len(test_set)

120000

In [319]:
test_set = test_set.drop_duplicates("id")

In [320]:
len(test_set)

60000

In [321]:
train_set = train_set.drop_duplicates("id")

In [322]:
len(train_set)

240000

## Se agregan todas las columnas numericas que ya se tienen

In [323]:
data.dtypes

id                               int32
titulo                          object
descripcion                     object
tipodepropiedad               category
direccion                       object
ciudad                          object
provincia                     category
antiguedad                     float64
habitaciones                   float64
garages                        float64
banos                          float64
metroscubiertos                float64
metrostotales                  float64
idzona                          object
lat                            float64
lng                            float64
fecha                           object
gimnasio                       float64
usosmultiples                  float64
piscina                        float64
escuelascercanas               float64
centroscomercialescercanos     float64
precio                         float64
train                             bool
dtype: object

In [324]:
data_num = data.loc[:,["id", "usosmultiples", "piscina", "escuelascercanas",
                      "centroscomercialescercanos"]]

In [325]:
train_set = train_set.merge(data_num, on = ["id"], how = "inner")
train_set.head()

Unnamed: 0,id,precio,tipodepropiedad_1,tipodepropiedad_2,tipodepropiedad_3,tipodepropiedad_4,tipodepropiedad_5,tipodepropiedad_6,tipodepropiedad_7,tipodepropiedad_8,...,provincia_27,provincia_28,provincia_29,provincia_30,provincia_31,provincia_32,usosmultiples,piscina,escuelascercanas,centroscomercialescercanos
0,254099,2273000.0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0.0,0.0,0.0,0.0
1,53461,3600000.0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0.0,0.0,1.0,1.0
2,247984,1200000.0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0.0,0.0,0.0,0.0
3,209067,650000.0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0.0,0.0,1.0,1.0
4,185997,1150000.0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0.0,0.0,0.0,0.0


In [326]:
test_num = test.loc[:,["id", "usosmultiples", "piscina", "escuelascercanas",
                      "centroscomercialescercanos"]]

In [327]:
test_set = test_set.merge(test_num, on = ["id"], how = "inner")
test_set.head()

Unnamed: 0,id,tipodepropiedad_1,tipodepropiedad_2,tipodepropiedad_3,tipodepropiedad_4,tipodepropiedad_5,tipodepropiedad_6,tipodepropiedad_7,tipodepropiedad_8,tipodepropiedad_9,...,provincia_27,provincia_28,provincia_29,provincia_30,provincia_31,provincia_32,usosmultiples,piscina,escuelascercanas,centroscomercialescercanos
0,4941,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0.0,0.0,0.0,0.0
1,51775,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0.0,0.0,0.0,0.0
2,115253,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0.0,0.0,0.0,1.0
3,299321,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0.0,0.0,0.0,0.0
4,173570,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0.0,0.0,1.0,1.0


## Se agregan el resto de las variables

In [328]:
data_num_fill = data.loc[:, ["id", "antiguedad", "metrostotales", "metroscubiertos", 'garages', 'banos', 'lat', 'lng']]

In [329]:
train_set = train_set.merge(data_num_fill, on = 'id', how = 'inner')
train_set.head()

Unnamed: 0,id,precio,tipodepropiedad_1,tipodepropiedad_2,tipodepropiedad_3,tipodepropiedad_4,tipodepropiedad_5,tipodepropiedad_6,tipodepropiedad_7,tipodepropiedad_8,...,piscina,escuelascercanas,centroscomercialescercanos,antiguedad,metrostotales,metroscubiertos,garages,banos,lat,lng
0,254099,2273000.0,1,0,0,0,0,0,0,0,...,0.0,0.0,0.0,,80.0,80.0,1.0,2.0,,
1,53461,3600000.0,0,1,0,0,0,0,0,0,...,0.0,1.0,1.0,10.0,180.0,268.0,2.0,2.0,19.310205,-99.227655
2,247984,1200000.0,0,0,1,0,0,0,0,0,...,0.0,0.0,0.0,5.0,166.0,144.0,2.0,2.0,,
3,209067,650000.0,0,0,1,0,0,0,0,0,...,0.0,1.0,1.0,1.0,67.0,63.0,1.0,1.0,19.30189,-99.688015
4,185997,1150000.0,1,0,0,0,0,0,0,0,...,0.0,0.0,0.0,10.0,95.0,95.0,1.0,1.0,,


In [330]:
test_num_fill = test.loc[:, ["id", "antiguedad", "metrostotales", "metroscubiertos", 'garages', 'banos', 'lat', 'lng']]

In [331]:
test_set = test_set.merge(test_num_fill, on = 'id', how = 'inner')
test_set.head()

Unnamed: 0,id,tipodepropiedad_1,tipodepropiedad_2,tipodepropiedad_3,tipodepropiedad_4,tipodepropiedad_5,tipodepropiedad_6,tipodepropiedad_7,tipodepropiedad_8,tipodepropiedad_9,...,piscina,escuelascercanas,centroscomercialescercanos,antiguedad,metrostotales,metroscubiertos,garages,banos,lat,lng
0,4941,1,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,29.0,,300.0,,4.0,19.408668,-99.246767
1,51775,0,1,0,0,0,0,0,0,0,...,0.0,0.0,0.0,,67.0,67.0,1.0,1.0,21.03248,-89.592424
2,115253,0,0,1,0,0,0,0,0,0,...,0.0,0.0,1.0,0.0,100.0,87.0,1.0,2.0,19.332829,-99.152913
3,299321,0,0,1,0,0,0,0,0,0,...,0.0,0.0,0.0,2.0,86.0,86.0,2.0,2.0,16.860487,-99.878383
4,173570,1,0,0,0,0,0,0,0,0,...,0.0,1.0,1.0,10.0,76.0,80.0,1.0,1.0,19.640482,-99.127273


In [332]:
len(test_set)

60000

In [333]:
train_set.isnull().sum().sum()

423384

# Nuevos features

# Promedio de metroscubiertos por tipo de propiedad

In [334]:
metros_mean = data.groupby(by = 'tipodepropiedad').agg({"metroscubiertos" : "mean"}).reset_index()

In [335]:
metros_mean.head()

Unnamed: 0,tipodepropiedad,metroscubiertos
0,Apartamento,117.048657
1,Bodega comercial,251.153639
2,Casa,194.026911
3,Casa en condominio,187.697231
4,Casa uso de suelo,254.068966


In [336]:
metros_mean.rename(columns = {"metroscubiertos" : "metroscubiertos_mean_tipodeprop"}, inplace = True)
metros_mean.head()

Unnamed: 0,tipodepropiedad,metroscubiertos_mean_tipodeprop
0,Apartamento,117.048657
1,Bodega comercial,251.153639
2,Casa,194.026911
3,Casa en condominio,187.697231
4,Casa uso de suelo,254.068966


In [337]:
data = data.merge(metros_mean, on = 'tipodepropiedad')

In [338]:
train_set = train_set.merge(data[['id', 'metroscubiertos_mean_tipodeprop']], on = "id")

### Lo mismo pero para test

In [339]:
metros_mean = test.groupby(by = 'tipodepropiedad').agg({"metroscubiertos" : "mean"}).reset_index()

In [340]:
metros_mean.head()

Unnamed: 0,tipodepropiedad,metroscubiertos
0,Apartamento,116.138123
1,Bodega comercial,250.683849
2,Casa,193.814888
3,Casa en condominio,186.547166
4,Casa uso de suelo,241.857895


In [341]:
metros_mean.rename(columns = {"metroscubiertos" : "metroscubiertos_mean_tipodeprop"}, inplace = True)
metros_mean.head()

Unnamed: 0,tipodepropiedad,metroscubiertos_mean_tipodeprop
0,Apartamento,116.138123
1,Bodega comercial,250.683849
2,Casa,193.814888
3,Casa en condominio,186.547166
4,Casa uso de suelo,241.857895


In [342]:
test = test.merge(metros_mean, on = 'tipodepropiedad')

In [343]:
test_set = test_set.merge(test[['id', 'metroscubiertos_mean_tipodeprop']], on = "id", how = "left")

In [344]:
len(test_set)

60000

# Varianza de metroscubiertos por tipo de propiedad

In [345]:
metros_var = data.groupby(by = 'tipodepropiedad').agg({"metroscubiertos" : "var"}).reset_index()

In [346]:
metros_var.rename(columns = {"metroscubiertos" : "metroscubiertos_var_tipodeprop"}, inplace = True)

In [347]:
data = data.merge(metros_var, on = 'tipodepropiedad')

In [348]:
train_set = train_set.merge(data[['id', 'metroscubiertos_var_tipodeprop']], on = "id")

### Lo mismo pero para test

In [349]:
metros_var = test.groupby(by = 'tipodepropiedad').agg({"metroscubiertos" : "var"}).reset_index()

In [350]:
metros_var.rename(columns = {"metroscubiertos" : "metroscubiertos_var_tipodeprop"}, inplace = True)

In [351]:
test = test.merge(metros_var, on = 'tipodepropiedad')

In [352]:
test_set = test_set.merge(test[['id', 'metroscubiertos_var_tipodeprop']], on = "id", how = "left")

In [353]:
len(test_set)

60000

# Precio promedio según tipo de propiedad

In [354]:
precio_mean = data.groupby(by = 'tipodepropiedad').agg({"precio" : "mean"}).reset_index()

In [355]:
precio_mean.head()

Unnamed: 0,tipodepropiedad,precio
0,Apartamento,2763769.0
1,Bodega comercial,2693758.0
2,Casa,2398158.0
3,Casa en condominio,2898927.0
4,Casa uso de suelo,3732469.0


In [356]:
precio_mean.rename(columns = {"precio" : "precio_mean_tipodeprop"}, inplace = True)
precio_mean.head()

Unnamed: 0,tipodepropiedad,precio_mean_tipodeprop
0,Apartamento,2763769.0
1,Bodega comercial,2693758.0
2,Casa,2398158.0
3,Casa en condominio,2898927.0
4,Casa uso de suelo,3732469.0


In [357]:
data = data.merge(precio_mean, on = 'tipodepropiedad')

In [358]:
train_set = train_set.merge(data[['id', 'precio_mean_tipodeprop']], on = "id")

### Lo mismo pero para test

In [359]:
test = test.merge(precio_mean, on = 'tipodepropiedad')

In [360]:
test_set = test_set.merge(test[['id', 'precio_mean_tipodeprop']], on = "id", how = "left")

In [361]:
len(test_set)

60000

# Varianza del precio segun tipo de propiedad

In [362]:
precio_var = data.groupby(by = 'tipodepropiedad').agg({"precio" : "var"}).reset_index()

In [363]:
precio_var.head()

Unnamed: 0,tipodepropiedad,precio
0,Apartamento,5488593000000.0
1,Bodega comercial,4009818000000.0
2,Casa,3997227000000.0
3,Casa en condominio,5359409000000.0
4,Casa uso de suelo,6400712000000.0


In [364]:
precio_var.rename(columns = {"precio" : "precio_var_tipodeprop"}, inplace = True)
precio_var.head()

Unnamed: 0,tipodepropiedad,precio_var_tipodeprop
0,Apartamento,5488593000000.0
1,Bodega comercial,4009818000000.0
2,Casa,3997227000000.0
3,Casa en condominio,5359409000000.0
4,Casa uso de suelo,6400712000000.0


In [365]:
data = data.merge(precio_var, on = 'tipodepropiedad')

In [366]:
train_set = train_set.merge(data[['id', 'precio_var_tipodeprop']], on = "id")

### Lo mismo pero para test

In [367]:
test = test.merge(precio_var, on = 'tipodepropiedad')

In [368]:
test_set = test_set.merge(test[['id', 'precio_var_tipodeprop']], on = "id", how = "left")

In [369]:
len(test_set)

60000

# Precio promedio  para intervalos de metros

Da muy mal este feature

In [370]:
# bins = 10
# precio_mean_mts = data.groupby(by = pd.cut(data['metroscubiertos'], bins)).agg({"precio" : "mean"}).reset_index()
# data['metroscubiertos_bin'] = pd.cut(data["metroscubiertos"], bins)
# precio_mean_mts.head()

In [371]:
# precio_mean_mts.rename(columns = {"metroscubiertos":"metroscubiertos_bin", "precio":"precio_mean_mts"}, inplace = True)
# precio_mean_mts.head()

In [372]:
# data = data.merge(precio_mean_mts, on = 'metroscubiertos_bin')

In [373]:
# train_set = train_set.merge(data[['id', 'precio_mean_mts']], on = "id")

In [374]:
# precio_mean_mts['metroscubiertos_bin']

In [375]:
# bins = pd.IntervalIndex.from_tuples([(14.576, 57.4),
#       (57.4, 99.8),
#      (99.8, 142.2),
#     (142.2, 184.6),
#     (184.6, 227.0),
#     (227.0, 269.4),
#     (269.4, 311.8),
#     (311.8, 354.2),
#     (354.2, 396.6),
#     (396.6, 439.0)], closed = 'right' )
# test_set['metroscubiertos_bin'] = pd.cut(test_set['metroscubiertos'], bins)

In [376]:
# test_set['metroscubiertos_bin'].head()

In [377]:
# precio_mean_mts.head()

In [378]:
# test_set = test_set.merge(precio_mean_mts, on = "metroscubiertos_bin", how = 'left')

In [379]:
# test_set.head()

# Varianza de precio para intervalos de metros

Da muy mal este feature

In [380]:
# bins = 10
# precio_var_mts = data.groupby(by = pd.cut(data['metroscubiertos'], bins)).agg({"precio" : "var"}).reset_index()
# data['metroscubiertos_bin'] = pd.cut(data["metroscubiertos"], bins)
# precio_var_mts.head()

In [381]:
# precio_var_mts.rename(columns = {"metroscubiertos":"metroscubiertos_bin", "precio":"precio_var_mts"}, inplace = True)
# precio_var_mts.head()

In [382]:
# data = data.merge(precio_var_mts, on = 'metroscubiertos_bin')

In [383]:
# train_set = train_set.merge(data[['id', 'precio_var_mts']], on = "id")

In [384]:
# precio_var_mts['metroscubiertos_bin']

In [385]:
# bins = pd.IntervalIndex.from_tuples([(14.576, 57.4),
#       (57.4, 99.8),
#      (99.8, 142.2),
#     (142.2, 184.6),
#     (184.6, 227.0),
#     (227.0, 269.4),
#     (269.4, 311.8),
#     (311.8, 354.2),
#     (354.2, 396.6),
#     (396.6, 439.0)], closed = 'right' )
# test_set['metroscubiertos_bin'] = pd.cut(test_set['metroscubiertos'], bins)

In [386]:
# test_set['metroscubiertos_bin'].head()

In [387]:
# precio_var_mts.head()

In [388]:
# test_set = test_set.merge(precio_var_mts, on = "metroscubiertos_bin", how = 'left')

In [389]:
# test_set.head()

In [390]:
# del test_set['metroscubiertos_bin']

# Precio promedio por latitud y longitud

In [391]:
# bins = 10
# data['lat_bin'] = pd.cut(data["lat"], bins)
# data['lng_bin'] = pd.cut(data['lng'], bins)
# precio_mean_lat_lng = data.groupby(by = ['lat_bin', 'lng_bin']).agg({"precio" : "mean"}).reset_index()
# precio_mean_lat_lng.head()

In [392]:
# precio_mean_lat_lng.rename(columns = {"precio":"precio_mean_lat_lng_bin"}, inplace = True)
# precio_mean_lat_lng.head()S

In [393]:
# data = data.merge(precio_mean_lat_lng, on = ['lat_bin', 'lng_bin'])
# data['precio_mean_lat_lng_bin']

In [394]:
# train_set = train_set.merge(data[['id', 'precio_mean_lat_lng_bin']], on = "id")

In [395]:
# precio_mean_lat_lng['lat_bin'].unique()

In [396]:
# precio_mean_lat_lng['lng_bin'].unique()

In [397]:
# bins = pd.IntervalIndex.from_tuples([(-80.79, -64.261), (-64.261, -47.896), 
# (-47.896, -31.531), (-31.531, -15.165), 
# (-15.165, 1.2), (1.2, 17.565), 
# (17.565, 33.93), (33.93, 50.296), 
# (50.296, 66.661), (66.661, 83.026)], closed = 'right' )
# test_set['lat_bin'] = pd.cut(test_set['lat'], bins)

In [398]:
# bins = pd.IntervalIndex.from_tuples([(-126.106, -101.17), (-101.17, -76.48), 
# (-76.48, -51.791), (-51.791, -27.101), 
# (-27.101, -2.412), (-2.412, 22.278), 
# (22.278, 46.967), (46.967, 71.657), 
# (71.657, 96.346), (96.346, 121.036)], closed = 'right' )
# test_set['lng_bin'] = pd.cut(test_set['lng'], bins)

In [399]:
# test_set['lng_bin']

In [400]:
# test_set['lat_bin']

In [401]:
# precio_mean_lat_lng['lat_bin']

In [402]:
# test_set = test_set.merge(precio_mean_lat_lng, on = ["lat_bin"], how = 'left')

Por que lat_bin no anda pero lng_bin si? Si son exactamente el mismo tipo

In [403]:
# test_set.head()

In [404]:
# del test['lat_bin']
# del test['lng_bin']

# Antiguedad promedio

In [405]:
data.transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,239944,239945,239946,239947,239948,239949,239950,239951,239952,239953
id,254099,185997,126147,44962,103293,73348,257302,127666,212847,224685,...,204578,10447,189648,257049,70279,169168,104078,136796,16605,245563
titulo,depto. tipo a-402,paseos del sol,departamento en venta taxqueña,pre- venta preciosos depas 2 recamaras con sub...,"departamento en venta, san pedro garza garcia,...","oportunidad, departamento col del valle, 3 rec...",,hermoso departamento!!!,departamento en arcos de zapopan,renta en acapulco con alberca prvada vista a b...,...,granja en venta granjas montenegro,00477rmvsu campo agricola san miguel de horcas...,"hermosa granja de 10,000 m2 en el sauz, a 40 m...","terreno / lote en venta, alvarado, veracruz",terreno/lote - cerrada dr. duran,terreno/lote - paseo ex hacienda barbosa,terreno/lote - privada de rio nasas,terreno/lote - 3er. cerrada de juan escutia,hospedaje en venta en 23 de julio,
descripcion,"depto. interior de 80.15m2, consta de sala com...",bonito departamento en excelentes condiciones ...,"amplio departamento, estancia de sala y comedo...",<p>pre-venta de preciosos departamento ecologi...,"departamento nuevo ,256 m2 de construccion,un ...",magnifico departamento con excelente distribuc...,-- hav1407e-285 -- excelente departamento e...,hermoso departamento con acabados de muy buen ...,bonito departamento muy bien ubicado en planta...,bonito departamento cerca de sam´s club farall...,...,"bonita granja con gran ubicacion,ideal para de...","<p>excelente campo agricola 20 has, rumbo a sa...","10,000 m² de terreno, 60 m² de construcción, v...",chatea con nosotros para conocer mas sobre est...,"excelente predio para demoler, ideal para cons...","oportunidad de terreno, plano, con servicio de...",excelente opción de terreno para las familias ...,bonito terreno ubicado en una zona de crecimie...,venta edificio en ciudad del carmen. excelente...,"4 lugares de estacionamiento (40 m2 en total, ..."
tipodepropiedad,Apartamento,Apartamento,Apartamento,Apartamento,Apartamento,Apartamento,Apartamento,Apartamento,Apartamento,Apartamento,...,Huerta,Huerta,Huerta,Lote,Lote,Lote,Lote,Lote,Hospedaje,Garage
direccion,Avenida Division del Norte 2005,PASEOS DEL SOL,Condominio Tlalpan 2B,BUENAVISTA DEPTOS CON SUBSIDIO,,Pazaje Santa Cruz,Rio Mante .,LAS COLONIAS,ARCO PERTINAX,RANCHO ACAPULCO,...,Camino a la Presa # 5600,CARRETERA A SAN MIGUEL DE HORCASITAS KM 19,El Sauz,,,,,,"Machiche, 23 de Julio, Ciudad del Carmen",Mineria
ciudad,Benito Juárez,Zapopan,Coyoacán,Villa de Alvarez,San Pedro Garza García,Benito Juárez,Ciudad Madero,Atizapán de Zaragoza,Zapopan,Acapulco de Juárez,...,El Salto,San Miguel de Horcasitas,Chihuahua,Alvarado,Cuauhtémoc,Zinacantepec,Metepec,Toluca,Carmen,Miguel Hidalgo
provincia,Distrito Federal,Jalisco,Distrito Federal,Colima,Nuevo León,Distrito Federal,Tamaulipas,Edo. de México,Jalisco,Guerrero,...,Jalisco,Sonora,Chihuahua,Veracruz,Distrito Federal,Edo. de México,Edo. de México,Edo. de México,Campeche,Distrito Federal
antiguedad,,10,5,1,,5,0,20,10,3,...,10,3,10,0,,,,,,
habitaciones,2,2,2,2,3,3,2,2,2,2,...,4,,2,,,,,,7,
garages,1,1,1,1,2,2,1,1,1,1,...,0,0,0,0,,,,,0,0


In [406]:
ant = data
ant['count'] = 1
ant = ant.groupby(by='antiguedad').sum()[['count','habitaciones','id']]
#ant.sort_values(ascending=False).tail(5)
ant = ant.reset_index()
ant.head()

Unnamed: 0,antiguedad,count,habitaciones,id
0,0.0,50330,126555.0,7520635000.0
1,1.0,12353,32263.0,1857008000.0
2,2.0,5059,13537.0,769743600.0
3,3.0,5616,15068.0,843488600.0
4,4.0,7944,21356.0,1196237000.0


In [407]:
def hab_ant(row):
    #if(row['antiguedad'] <= 10 ):
        return row['count'] * 1
    #return row['habitaciones'] * 0.5

In [408]:
ant2 = ant
ant2['c_hab_ant'] = ant2.apply(hab_ant,axis=1)
ant2.head(15)

Unnamed: 0,antiguedad,count,habitaciones,id,c_hab_ant
0,0.0,50330,126555.0,7520635000.0,50330.0
1,1.0,12353,32263.0,1857008000.0,12353.0
2,2.0,5059,13537.0,769743600.0,5059.0
3,3.0,5616,15068.0,843488600.0,5616.0
4,4.0,7944,21356.0,1196237000.0,7944.0
5,5.0,33265,89649.0,4970438000.0,33265.0
6,6.0,3169,8520.0,476129100.0,3169.0
7,7.0,2046,5513.0,313093900.0,2046.0
8,8.0,2418,6760.0,371206700.0,2418.0
9,9.0,1065,2905.0,162469100.0,1065.0


In [409]:
data = data.drop(columns='count')

In [410]:
ant3 = ant2[['c_hab_ant','antiguedad']]
ant3.head()

train_set = train_set.merge(ant3,on = 'antiguedad',how = 'left')
train_set.transpose()

test_set = test_set.merge(ant3,on = 'antiguedad',how = 'left')
test_set.transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,59990,59991,59992,59993,59994,59995,59996,59997,59998,59999
id,4.941000e+03,5.177500e+04,1.152530e+05,2.993210e+05,1.735700e+05,3.086200e+04,2.444710e+05,1.277940e+05,7.155800e+04,2.180110e+05,...,2.056250e+05,2.842660e+05,7.024400e+04,5.977600e+04,7.910000e+04,7.509400e+04,1.718470e+05,1.383130e+05,2.712680e+05,7.261200e+04
tipodepropiedad_1,1.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,1.000000e+00,1.000000e+00,0.000000e+00,0.000000e+00,1.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,1.000000e+00,0.000000e+00,0.000000e+00,1.000000e+00,0.000000e+00,0.000000e+00,1.000000e+00
tipodepropiedad_2,0.000000e+00,1.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,1.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
tipodepropiedad_3,0.000000e+00,0.000000e+00,1.000000e+00,1.000000e+00,0.000000e+00,0.000000e+00,1.000000e+00,1.000000e+00,0.000000e+00,0.000000e+00,...,1.000000e+00,1.000000e+00,1.000000e+00,0.000000e+00,0.000000e+00,1.000000e+00,0.000000e+00,1.000000e+00,0.000000e+00,0.000000e+00
tipodepropiedad_4,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,1.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
tipodepropiedad_5,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
tipodepropiedad_6,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,1.000000e+00,0.000000e+00
tipodepropiedad_7,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
tipodepropiedad_8,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
tipodepropiedad_9,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00


# Se exporta para entrenar

## NO hace falta eliminar NULLs

In [411]:
len(train_set)

239954

In [412]:
train_set.to_csv("train_set_xgb.csv", index = False)

In [413]:
test_set.head()

Unnamed: 0,id,tipodepropiedad_1,tipodepropiedad_2,tipodepropiedad_3,tipodepropiedad_4,tipodepropiedad_5,tipodepropiedad_6,tipodepropiedad_7,tipodepropiedad_8,tipodepropiedad_9,...,metroscubiertos,garages,banos,lat,lng,metroscubiertos_mean_tipodeprop,metroscubiertos_var_tipodeprop,precio_mean_tipodeprop,precio_var_tipodeprop,c_hab_ant
0,4941,1,0,0,0,0,0,0,0,0,...,300.0,,4.0,19.408668,-99.246767,193.814888,9797.989886,2398158.0,3997227000000.0,196.0
1,51775,0,1,0,0,0,0,0,0,0,...,67.0,1.0,1.0,21.03248,-89.592424,116.138123,4425.701226,2763769.0,5488593000000.0,
2,115253,0,0,1,0,0,0,0,0,0,...,87.0,1.0,2.0,19.332829,-99.152913,116.138123,4425.701226,2763769.0,5488593000000.0,50330.0
3,299321,0,0,1,0,0,0,0,0,0,...,86.0,2.0,2.0,16.860487,-99.878383,116.138123,4425.701226,2763769.0,5488593000000.0,5059.0
4,173570,1,0,0,0,0,0,0,0,0,...,80.0,1.0,1.0,19.640482,-99.127273,193.814888,9797.989886,2398158.0,3997227000000.0,28844.0


In [414]:
len(test_set)

60000

In [415]:
test_set.to_csv("test_set_xgb.csv", index = False)