In [568]:
import csv
import pandas as pd
import numpy as np
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_log_error
# Importamos utilidades y modelos de sklearn
from sklearn.preprocessing import Imputer
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor

In [569]:
dtypes = {'id': 'int32', 
'titulo': 'object', 
'descripcion': 'object', 
'tipodepropiedad': 'category', 
'direccion': 'object', 
'ciudad': 'object', 
'provincia': 'category', 
'antiguedad': 'float', 
'habitaciones': 'float', 
'garages': 'float', 
'banos': 'float', 
'metroscubiertos': 'float', 
'metrostotales': 'float', 
'idzona': 'object', 
'lat': 'float64', 
'lng': 'float64', 
'gimnasio': 'float', 
'usosmultiples': 'float', 
'piscina': 'float', 
'escuelascercanas': 'float', 
'centroscomercialescercanos': 'float', 
'precio': 'float', }

data = pd.read_csv("train.csv", dtype = dtypes)
data.head()

Unnamed: 0,id,titulo,descripcion,tipodepropiedad,direccion,ciudad,provincia,antiguedad,habitaciones,garages,...,idzona,lat,lng,fecha,gimnasio,usosmultiples,piscina,escuelascercanas,centroscomercialescercanos,precio
0,254099,depto. tipo a-402,"depto. interior de 80.15m2, consta de sala com...",Apartamento,Avenida Division del Norte 2005,Benito Juárez,Distrito Federal,,2.0,1.0,...,23533.0,,,2015-08-23 00:00:00,0.0,0.0,0.0,0.0,0.0,2273000.0
1,53461,condominio horizontal en venta,"<p>entre sonora y guerrero, atr&aacute;s del h...",Casa en condominio,AV. MEXICO,La Magdalena Contreras,Distrito Federal,10.0,3.0,2.0,...,24514.0,19.310205,-99.227655,2013-06-28 00:00:00,0.0,0.0,0.0,1.0,1.0,3600000.0
2,247984,casa en venta urbi 3 recamaras tonala,descripcion \nla mejor ubicacion residencial e...,Casa,Urbi Tonala,Tonalá,Jalisco,5.0,3.0,2.0,...,48551.0,,,2015-10-17 00:00:00,0.0,0.0,0.0,0.0,0.0,1200000.0
3,209067,casa sola en toluca zinacantepec con credito i...,casa en privada con caseta de vigilancia casas...,Casa,IGNACIO MANUEL ALTAMIRANO 128,Zinacantepec,Edo. de México,1.0,2.0,1.0,...,53666.0,19.30189,-99.688015,2012-03-09 00:00:00,0.0,0.0,0.0,1.0,1.0,650000.0
4,185997,paseos del sol,bonito departamento en excelentes condiciones ...,Apartamento,PASEOS DEL SOL,Zapopan,Jalisco,10.0,2.0,1.0,...,47835.0,,,2016-06-07 00:00:00,0.0,0.0,0.0,0.0,0.0,1150000.0


# Pre-procesamiento de data

In [570]:
data["ciudad"].nunique()

875

In [571]:
data.isnull().sum()

id                                 0
titulo                          5387
descripcion                     1619
tipodepropiedad                   46
direccion                      53072
ciudad                           372
provincia                        155
antiguedad                     43555
habitaciones                   22471
garages                        37765
banos                          26221
metroscubiertos                17400
metrostotales                  51467
idzona                         28621
lat                           123488
lng                           123488
fecha                              0
gimnasio                           0
usosmultiples                      0
piscina                            0
escuelascercanas                   0
centroscomercialescercanos         0
precio                             0
dtype: int64

In [572]:
data.isnull().sum().sum()

535127

In [573]:
data.size

5520000

In [574]:
(data.isnull().sum().sum()/(data.size))*100

9.694329710144928

Los Nulls representan el 10% de los datos.

# CON XGBOOST NO HACE FALTA PREOCUPARSE POR LOS NULLs

# Levanto el csv de test para calcularle los features en paralelo

In [575]:
test = pd.read_csv("test.csv", dtype = dtypes)
test.head()

Unnamed: 0,id,titulo,descripcion,tipodepropiedad,direccion,ciudad,provincia,antiguedad,habitaciones,garages,...,metrostotales,idzona,lat,lng,fecha,gimnasio,usosmultiples,piscina,escuelascercanas,centroscomercialescercanos
0,4941,"casa en venta en miguel hidalgo, distrito federal",<p>excelente casa estilo moderno.</p>,Casa,Bosque de Cedros,Miguel Hidalgo,Distrito Federal,29.0,3.0,,...,,,19.408668,-99.246767,2013-07-20 00:00:00,0.0,0.0,0.0,0.0,0.0
1,51775,departamentos en venta en montebello,<p>departamento una recamara:\n</p><p>departam...,Apartamento,,Mérida,Yucatán,,1.0,1.0,...,67.0,113851.0,21.03248,-89.592424,2015-10-24 00:00:00,0.0,0.0,0.0,0.0,0.0
2,115253,departamento nuevo delegación coyoacán de 87 m...,"departamento nuevo de 87.06 m2, 1 cajón de est...",Apartamento,"Pueblo de los Reyes, Coyoacán, Mexico D.F.",Coyoacán,Distrito Federal,0.0,2.0,1.0,...,100.0,23620.0,19.332829,-99.152913,2015-05-30 00:00:00,0.0,0.0,0.0,0.0,1.0
3,299321,departamento en venta en acapulco,<p> raíces dv001 precioso departamento tipo k...,Apartamento,,Acapulco de Juárez,Guerrero,2.0,2.0,2.0,...,86.0,129347.0,16.860487,-99.878383,2015-04-02 00:00:00,0.0,0.0,0.0,0.0,0.0
4,173570,bonita casa sola equipada de dos niveles en lo...,"<p>casa sola, bonita de dos rec&aacute;maras u...",Casa,CEDROS,Tultitlán,Edo. de México,10.0,2.0,1.0,...,76.0,57125.0,19.640482,-99.127273,2013-08-15 00:00:00,0.0,0.0,0.0,1.0,1.0


In [576]:
len(test)

60000

# Preparacion del set de datos de entrenamiento (features)

La idea es preparar el set con los datos para exportar y que despues el modelo simplemente levante, separe en set de entrenamiento y test, entrene y devuelva una prediccion.

Se procede a calcular features. Cada feature se agregara al DataFrame final que tendra que levantar despues el modelo. Ojo que dentro de este DataFrame tambien va a estar el precio (que es el label).

In [577]:
train_set = pd.DataFrame()

In [578]:
test_set = pd.DataFrame()

# Agrego las columnas que ya se encuentran

In [579]:
data.columns

Index(['id', 'titulo', 'descripcion', 'tipodepropiedad', 'direccion', 'ciudad',
       'provincia', 'antiguedad', 'habitaciones', 'garages', 'banos',
       'metroscubiertos', 'metrostotales', 'idzona', 'lat', 'lng', 'fecha',
       'gimnasio', 'usosmultiples', 'piscina', 'escuelascercanas',
       'centroscomercialescercanos', 'precio'],
      dtype='object')

In [580]:
train_set = data.loc[:, ['id','antiguedad', 'habitaciones', 'garages', 'banos',
       'metroscubiertos', 'metrostotales', 'idzona', 'lat', 'lng', 'gimnasio', 'usosmultiples', 'piscina', 'escuelascercanas',
       'centroscomercialescercanos', 'precio']]

In [581]:
train_set.head()

Unnamed: 0,id,antiguedad,habitaciones,garages,banos,metroscubiertos,metrostotales,idzona,lat,lng,gimnasio,usosmultiples,piscina,escuelascercanas,centroscomercialescercanos,precio
0,254099,,2.0,1.0,2.0,80.0,80.0,23533.0,,,0.0,0.0,0.0,0.0,0.0,2273000.0
1,53461,10.0,3.0,2.0,2.0,268.0,180.0,24514.0,19.310205,-99.227655,0.0,0.0,0.0,1.0,1.0,3600000.0
2,247984,5.0,3.0,2.0,2.0,144.0,166.0,48551.0,,,0.0,0.0,0.0,0.0,0.0,1200000.0
3,209067,1.0,2.0,1.0,1.0,63.0,67.0,53666.0,19.30189,-99.688015,0.0,0.0,0.0,1.0,1.0,650000.0
4,185997,10.0,2.0,1.0,1.0,95.0,95.0,47835.0,,,0.0,0.0,0.0,0.0,0.0,1150000.0


### Lo mismo para test

In [582]:
# Se mete en test_set la columna 'id'; recordar de luego sacarla!
test_set = test.loc[:, ['id', 'antiguedad', 'habitaciones', 'garages', 'banos',
       'metroscubiertos', 'metrostotales', 'idzona', 'lat', 'lng', 'gimnasio', 'usosmultiples', 'piscina', 'escuelascercanas',
       'centroscomercialescercanos']]

In [583]:
test_set.head()

Unnamed: 0,id,antiguedad,habitaciones,garages,banos,metroscubiertos,metrostotales,idzona,lat,lng,gimnasio,usosmultiples,piscina,escuelascercanas,centroscomercialescercanos
0,4941,29.0,3.0,,4.0,300.0,,,19.408668,-99.246767,0.0,0.0,0.0,0.0,0.0
1,51775,,1.0,1.0,1.0,67.0,67.0,113851.0,21.03248,-89.592424,0.0,0.0,0.0,0.0,0.0
2,115253,0.0,2.0,1.0,2.0,87.0,100.0,23620.0,19.332829,-99.152913,0.0,0.0,0.0,0.0,1.0
3,299321,2.0,2.0,2.0,2.0,86.0,86.0,129347.0,16.860487,-99.878383,0.0,0.0,0.0,0.0,0.0
4,173570,10.0,2.0,1.0,1.0,80.0,76.0,57125.0,19.640482,-99.127273,0.0,0.0,0.0,1.0,1.0


# Nuevos features

## OJO!!! Pensar muy bien si tiene sentido agregar ese feature o no... Porque agregar de más puede ser muy malo a priori. Lo más simple suele ser lo mejor!!!

## Fecha de publicación

In [584]:
data['fecha'] = pd.to_datetime(data['fecha'])
data['anio'] = data['fecha'].dt.year

In [585]:
data['anio'].value_counts()

2016    94038
2015    51470
2014    40572
2013    30386
2012    23534
Name: anio, dtype: int64

In [586]:
train_set['anio_publ'] = data['anio']

In [587]:
test_set['anio_publ'] = pd.to_datetime(test['fecha']).dt.year

### Feature total de antiguedad por anio

In [588]:
data.transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,239990,239991,239992,239993,239994,239995,239996,239997,239998,239999
id,254099,53461,247984,209067,185997,126147,139233,5013,44962,134537,...,87498,137337,54886,207892,110268,119879,259178,131932,146867,121958
titulo,depto. tipo a-402,condominio horizontal en venta,casa en venta urbi 3 recamaras tonala,casa sola en toluca zinacantepec con credito i...,paseos del sol,departamento en venta taxqueña,de oportunidad casa en san lorenzo,casa emilia en venta en selvamar playa del carmen,pre- venta preciosos depas 2 recamaras con sub...,terreno,...,casa en venta: bosques del contry,departamento residencial coyuya,casa en venta,bugambilias (ciudad),hermosa casa en villa de los belenes,bonita casas de 2 recamaras a 10 minutos del c...,casa en condominio a 10 min. del centro de toluca,nicolas san juan,casa sola. javier rojo gomez.,departamento en bosques de las lomas / av. st...
descripcion,"depto. interior de 80.15m2, consta de sala com...","<p>entre sonora y guerrero, atr&aacute;s del h...",descripcion \nla mejor ubicacion residencial e...,casa en privada con caseta de vigilancia casas...,bonito departamento en excelentes condiciones ...,"amplio departamento, estancia de sala y comedo...","ubicada en esquina, pertenece san lorenzo agen...",casa emilia en venta playa del carmenfracciona...,<p>pre-venta de preciosos departamento ecologi...,"terreno de 5.500m2 bardeado, uso de suelo h-20...",...,"<p>casa en venta, en magníficas condiciones; e...","departamento ubicado en planta baja, con excel...",bonita casa para remodelar en una calle cerrad...,coto privado de tan solo 7 casas donde cada fa...,"<p>moderna casa 3 pisos, muro llor&oacute;n , ...",vendo casa en bosques de ica residencial a 10 ...,"casa con un jardin amplio, un cuarto de servic...","departamento con excelente ubicación, muy cerc...","casa sola, dividida en cuatro departamentos de...","id:19816, muy bonito e iluminado departamento,..."
tipodepropiedad,Apartamento,Casa en condominio,Casa,Casa,Apartamento,Apartamento,Casa,Casa,Apartamento,Terreno,...,Casa,Apartamento,Casa en condominio,Casa,Casa,Casa,Casa,Apartamento,Casa,Apartamento
direccion,Avenida Division del Norte 2005,AV. MEXICO,Urbi Tonala,IGNACIO MANUEL ALTAMIRANO 128,PASEOS DEL SOL,Condominio Tlalpan 2B,,condominio el trebol,BUENAVISTA DEPTOS CON SUBSIDIO,Av. Morelos,...,,Coyuya 200,Cerrada villa Picadilly,Paseo de la Cañada,"MANUEL AMAYA,ENTRE SEBASTIAN ALLENDE Y AMADO A...",BOSQUES,Filiberto Navas 325,Nicolas San Juan,Javier Rojo Gomez 120,AVE. STIM
ciudad,Benito Juárez,La Magdalena Contreras,Tonalá,Zinacantepec,Zapopan,Coyoacán,Oaxaca de Juárez,Playa del Carmen,Villa de Alvarez,Ixtapaluca,...,Guadalupe,Iztacalco,Huixquilucan,Zapopan,Zapopan,Zinacantepec,Toluca,Benito Juárez,Iztapalapa,Cuajimalpa de Morelos
provincia,Distrito Federal,Distrito Federal,Jalisco,Edo. de México,Jalisco,Distrito Federal,Oaxaca,Quintana Roo,Colima,Edo. de México,...,Nuevo León,Distrito Federal,Edo. de México,Jalisco,Jalisco,Edo. de México,Edo. de México,Distrito Federal,Distrito Federal,Distrito Federal
antiguedad,,10,5,1,10,5,,2,1,,...,20,20,10,1,3,0,0,20,20,1
habitaciones,2,3,3,2,2,2,3,4,2,,...,3,2,3,3,2,2,3,2,4,3
garages,1,2,2,1,1,1,1,2,1,,...,2,1,,2,2,2,3,1,0,2


In [589]:
ant = data
ant['count'] = 1
ant = ant.groupby(by='antiguedad').sum()[['count','habitaciones','id']]
#ant.sort_values(ascending=False).tail(5)
ant = ant.reset_index()
ant.head()

Unnamed: 0,antiguedad,count,habitaciones,id
0,0.0,50335,126567.0,7521382000.0
1,1.0,12353,32263.0,1857008000.0
2,2.0,5059,13537.0,769743600.0
3,3.0,5616,15068.0,843488600.0
4,4.0,7944,21356.0,1196237000.0


In [590]:
def hab_ant(row):
    if(row['antiguedad'] <= 10 ):
        return row['count'] * 1
    return row['habitaciones'] * 0.5

In [591]:
ant2 = ant
ant2['c_hab_ant'] = ant2.apply(hab_ant,axis=1)
ant2.head(15)

Unnamed: 0,antiguedad,count,habitaciones,id,c_hab_ant
0,0.0,50335,126567.0,7521382000.0,50335.0
1,1.0,12353,32263.0,1857008000.0,12353.0
2,2.0,5059,13537.0,769743600.0,5059.0
3,3.0,5616,15068.0,843488600.0,5616.0
4,4.0,7944,21356.0,1196237000.0,7944.0
5,5.0,33268,89658.0,4970856000.0,33268.0
6,6.0,3169,8520.0,476129100.0,3169.0
7,7.0,2046,5513.0,313093900.0,2046.0
8,8.0,2418,6760.0,371206700.0,2418.0
9,9.0,1065,2905.0,162469100.0,1065.0


In [592]:
data = data.drop(columns='count')

In [593]:
ant3 = ant2[['c_hab_ant','antiguedad']]
ant3.head()

Unnamed: 0,c_hab_ant,antiguedad
0,50335.0,0.0
1,12353.0,1.0
2,5059.0,2.0
3,5616.0,3.0
4,7944.0,4.0


In [594]:
train_set = train_set.merge(ant3,on = 'antiguedad',how = 'left')
train_set.transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,239990,239991,239992,239993,239994,239995,239996,239997,239998,239999
id,254099.0,53461.0,247984.0,209067.0,185997.0,126147.0,139233.0,5013.0,44962.0,134537.0,...,87498.0,137337.0,54886.0,207892.0,110268.0,119879.0,259178.0,131932.0,146867.0,121958.0
antiguedad,,10.0,5.0,1.0,10.0,5.0,,2.0,1.0,,...,20.0,20.0,10.0,1.0,3.0,0.0,0.0,20.0,20.0,1.0
habitaciones,2.0,3.0,3.0,2.0,2.0,2.0,3.0,4.0,2.0,,...,3.0,2.0,3.0,3.0,2.0,2.0,3.0,2.0,4.0,3.0
garages,1.0,2.0,2.0,1.0,1.0,1.0,1.0,2.0,1.0,,...,2.0,1.0,,2.0,2.0,2.0,3.0,1.0,0.0,2.0
banos,2.0,2.0,2.0,1.0,1.0,1.0,2.0,3.0,1.0,,...,1.0,1.0,3.0,3.0,2.0,1.0,3.0,2.0,4.0,2.0
metroscubiertos,80.0,268.0,144.0,63.0,95.0,75.0,140.0,293.0,58.0,250.0,...,118.0,68.0,270.0,150.0,160.0,67.0,200.0,138.0,235.0,149.0
metrostotales,80.0,180.0,166.0,67.0,95.0,90.0,160.0,293.0,,,...,124.0,68.0,293.0,150.0,90.0,,250.0,138.0,137.0,
idzona,23533.0,24514.0,48551.0,53666.0,47835.0,23650.0,73510.0,130510.0,9010.0,59171.0,...,72083.0,24118.0,55589.0,48046.0,48029.0,53666.0,51954.0,50003995.0,24162.0,23750.0
lat,,19.3102,,19.3019,,19.3006,17.1435,20.6726,,19.316,...,25.6612,19.4033,,20.6156,20.6045,,19.2947,,19.3667,
lng,,-99.2277,,-99.688,,-99.1485,-96.8035,-87.038,,-98.887,...,-100.251,-99.119,,-103.456,-103.45,,-99.6929,,-99.0822,


In [595]:
test_set = test_set.merge(ant3,on = 'antiguedad',how = 'left')
test_set.transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,59990,59991,59992,59993,59994,59995,59996,59997,59998,59999
id,4941.0,51775.0,115253.0,299321.0,173570.0,30862.0,244471.0,127794.0,71558.0,218011.0,...,205625.0,284266.0,70244.0,59776.0,79100.0,75094.0,171847.0,138313.0,271268.0,72612.0
antiguedad,29.0,,0.0,2.0,10.0,10.0,20.0,0.0,2.0,20.0,...,,5.0,6.0,,,20.0,10.0,5.0,0.0,0.0
habitaciones,3.0,1.0,2.0,2.0,2.0,3.0,3.0,2.0,3.0,3.0,...,3.0,2.0,3.0,,,4.0,3.0,3.0,2.0,3.0
garages,,1.0,1.0,2.0,1.0,1.0,,1.0,2.0,1.0,...,,1.0,3.0,0.0,,3.0,1.0,2.0,1.0,2.0
banos,4.0,1.0,2.0,2.0,1.0,2.0,2.0,1.0,2.0,2.0,...,3.0,2.0,2.0,,,3.0,2.0,2.0,2.0,2.0
metroscubiertos,300.0,67.0,87.0,86.0,80.0,165.0,220.0,62.0,260.0,123.0,...,316.0,68.0,270.0,,199.0,291.0,71.0,102.0,130.0,211.0
metrostotales,,67.0,100.0,86.0,76.0,138.0,190.0,62.0,200.0,160.0,...,,68.0,170.0,200.0,199.0,,87.0,,144.0,130.0
idzona,,113851.0,23620.0,129347.0,57125.0,48216.0,323485.0,54688.0,107969.0,50002836.0,...,55552.0,50003995.0,24940.0,49037.0,24597.0,275741.0,57474.0,72224.0,83960.0,47747.0
lat,19.4087,21.0325,19.3328,16.8605,19.6405,,,,19.17,21.158,...,,,19.2812,20.5312,19.4409,19.4348,,,20.5918,20.656
lng,-99.2468,-89.5924,-99.1529,-99.8784,-99.1273,,,,-96.1527,-86.8385,...,,,-99.1422,-103.487,-99.1885,-99.0925,,,-100.328,-103.427


### Feature: antiguedad por provincia

In [596]:
data.transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,239990,239991,239992,239993,239994,239995,239996,239997,239998,239999
id,254099,53461,247984,209067,185997,126147,139233,5013,44962,134537,...,87498,137337,54886,207892,110268,119879,259178,131932,146867,121958
titulo,depto. tipo a-402,condominio horizontal en venta,casa en venta urbi 3 recamaras tonala,casa sola en toluca zinacantepec con credito i...,paseos del sol,departamento en venta taxqueña,de oportunidad casa en san lorenzo,casa emilia en venta en selvamar playa del carmen,pre- venta preciosos depas 2 recamaras con sub...,terreno,...,casa en venta: bosques del contry,departamento residencial coyuya,casa en venta,bugambilias (ciudad),hermosa casa en villa de los belenes,bonita casas de 2 recamaras a 10 minutos del c...,casa en condominio a 10 min. del centro de toluca,nicolas san juan,casa sola. javier rojo gomez.,departamento en bosques de las lomas / av. st...
descripcion,"depto. interior de 80.15m2, consta de sala com...","<p>entre sonora y guerrero, atr&aacute;s del h...",descripcion \nla mejor ubicacion residencial e...,casa en privada con caseta de vigilancia casas...,bonito departamento en excelentes condiciones ...,"amplio departamento, estancia de sala y comedo...","ubicada en esquina, pertenece san lorenzo agen...",casa emilia en venta playa del carmenfracciona...,<p>pre-venta de preciosos departamento ecologi...,"terreno de 5.500m2 bardeado, uso de suelo h-20...",...,"<p>casa en venta, en magníficas condiciones; e...","departamento ubicado en planta baja, con excel...",bonita casa para remodelar en una calle cerrad...,coto privado de tan solo 7 casas donde cada fa...,"<p>moderna casa 3 pisos, muro llor&oacute;n , ...",vendo casa en bosques de ica residencial a 10 ...,"casa con un jardin amplio, un cuarto de servic...","departamento con excelente ubicación, muy cerc...","casa sola, dividida en cuatro departamentos de...","id:19816, muy bonito e iluminado departamento,..."
tipodepropiedad,Apartamento,Casa en condominio,Casa,Casa,Apartamento,Apartamento,Casa,Casa,Apartamento,Terreno,...,Casa,Apartamento,Casa en condominio,Casa,Casa,Casa,Casa,Apartamento,Casa,Apartamento
direccion,Avenida Division del Norte 2005,AV. MEXICO,Urbi Tonala,IGNACIO MANUEL ALTAMIRANO 128,PASEOS DEL SOL,Condominio Tlalpan 2B,,condominio el trebol,BUENAVISTA DEPTOS CON SUBSIDIO,Av. Morelos,...,,Coyuya 200,Cerrada villa Picadilly,Paseo de la Cañada,"MANUEL AMAYA,ENTRE SEBASTIAN ALLENDE Y AMADO A...",BOSQUES,Filiberto Navas 325,Nicolas San Juan,Javier Rojo Gomez 120,AVE. STIM
ciudad,Benito Juárez,La Magdalena Contreras,Tonalá,Zinacantepec,Zapopan,Coyoacán,Oaxaca de Juárez,Playa del Carmen,Villa de Alvarez,Ixtapaluca,...,Guadalupe,Iztacalco,Huixquilucan,Zapopan,Zapopan,Zinacantepec,Toluca,Benito Juárez,Iztapalapa,Cuajimalpa de Morelos
provincia,Distrito Federal,Distrito Federal,Jalisco,Edo. de México,Jalisco,Distrito Federal,Oaxaca,Quintana Roo,Colima,Edo. de México,...,Nuevo León,Distrito Federal,Edo. de México,Jalisco,Jalisco,Edo. de México,Edo. de México,Distrito Federal,Distrito Federal,Distrito Federal
antiguedad,,10,5,1,10,5,,2,1,,...,20,20,10,1,3,0,0,20,20,1
habitaciones,2,3,3,2,2,2,3,4,2,,...,3,2,3,3,2,2,3,2,4,3
garages,1,2,2,1,1,1,1,2,1,,...,2,1,,2,2,2,3,1,0,2


In [597]:
#Mergea el la columna indicada en 'on', tomando el df pasado como parámetro(df_to_use)
#how_feature indica que tipo de join se hace.
#El merge lo hace sobre 'train_set' y 'test_set'
def merge_df(df_to_use, on_feature,how_feature):
    print("dataframe a usar:"+str(df_to_use))
    print("feature a usar:"+on_feature)
   # print("[test_set] Cantidad init: "+str(len(test_set.columns)))
    #print("[train_set] Cantidad init: "+str(len(train_set.columns)))
    test_set.merge(df_to_use, on = on_feature, how = how_feature)
    train_set.merge(df_to_use, on = on_feature, how = how_feature)
    #print("[test_set] Cantidad init: "+str(len(test_set.columns)))
    #print("[train_set] Cantidad init: "+str(len(train_set.columns)))

In [598]:
df = data.groupby(by = 'provincia').agg({'antiguedad':'mean'}).reset_index()
df.sort_values(['antiguedad'], ascending = False, inplace = True)
df=df.rename(columns={"antiguedad":"antiguedad_prov"})
df.head()

data = data.merge(df,on='provincia',how='inner')
data.transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,239835,239836,239837,239838,239839,239840,239841,239842,239843,239844
id,254099,53461,126147,130216,73348,177031,202245,92610,116928,244353,...,163941,226592,275038,201744,194644,3654,190742,191824,57022,132311
titulo,depto. tipo a-402,condominio horizontal en venta,departamento en venta taxqueña,casa matias romero - sión bancaria,"oportunidad, departamento col del valle, 3 rec...",,casa en venta en tlalpan,"casa en venta en agrícola oriental, iztacalco",excelentes departamentos con exclente ubicación,,...,"casa habitacion en guadalupe, zac","edificio en residencial boulevares, zac. (ven...","excelente casa seminueva en fracc. la cañada, ...",casa en zacatecas seminueva fraccionamiento lo...,se vende hermosa casa en lomas del pedregal,casa en venta en calera,terreno en venta en zacatecas,casa en condominio en venta en guadalupe,"local y departamento en venta en sombrerete, zac.","casa en venta en san cosme, guadalupe"
descripcion,"depto. interior de 80.15m2, consta de sala com...","<p>entre sonora y guerrero, atr&aacute;s del h...","amplio departamento, estancia de sala y comedo...",flamante casa habitación en condominio horizon...,magnifico departamento con excelente distribuc...,"hermosa casa remodelada, con acabados de lujo,...","hermosa residencia con acabados de lujo, ¡opor...",<p>casa duplex (el primer nivel es el que se e...,"1 pb, 2 primer piso, 2 segundo piso. cada depa...","departamento nuevo, 98m2 ,interior, 2 recamara...",...,"casa habitacion en privada, de una planta y co...",<table cellspacing=0 cellpadding=0 align=left>...,"casa seminueva, con buenos acabados y amplios ...","sala (doble altura), comedor, cocina, 3 recama...","casa con diseño minimalista, nueva en fraccion...",nocnok id: mx14-aw0878. la vivienda se encuent...,"<p>terreno en venta en zacatecas, ubicado en ...","casa dúplex planta baja en una esquina, 3 habi...",<table cellspacing=0 cellpadding=0 align=left>...,"<p>lote 6 y 16 del fracc. san cosme, guadalupe..."
tipodepropiedad,Apartamento,Casa en condominio,Apartamento,Casa,Apartamento,Casa en condominio,Casa,Casa,Apartamento,Apartamento,...,Casa,Edificio,Casa en condominio,Casa,Casa,Casa,Terreno,Casa en condominio,Edificio,Casa
direccion,Avenida Division del Norte 2005,AV. MEXICO,Condominio Tlalpan 2B,"MATIAS ROMERO, COLONIA DEL VALLE",Pazaje Santa Cruz,PASEO DE LOS LAURELES,Galeana,Sur 16,Jesús Carranza No. 55,ILLINOIS 31 602,...,mina valenciana,"CALLE UNO, BOULEVARES, ZACATECAS, ZAC.","Roca, Fracc. La Cañada 59",Cerro del Ángel 22,Circuito del Pedregal #94,,,Andador Capulines 1 A,"JARDIN HIDALGO #204, ZONA CENTRO, SOMBRERETE, ...",LOTE 6 Y 16
ciudad,Benito Juárez,La Magdalena Contreras,Coyoacán,Benito Juárez,Benito Juárez,Miguel Hidalgo,Tlalpan,Iztacalco,Cuauhtémoc,Benito Juárez,...,Guadalupe,Zacatecas,Guadalupe,Guadalupe,Guadalupe,Calera,Jerez,Guadalupe,Sombrerete,Guadalupe
provincia,Distrito Federal,Distrito Federal,Distrito Federal,Distrito Federal,Distrito Federal,Distrito Federal,Distrito Federal,Distrito Federal,Distrito Federal,Distrito Federal,...,Zacatecas,Zacatecas,Zacatecas,Zacatecas,Zacatecas,Zacatecas,Zacatecas,Zacatecas,Zacatecas,Zacatecas
antiguedad,,10,5,20,5,18,2,16,20,0,...,0,10,3,3,2,,6,20,10,10
habitaciones,2,3,2,,3,3,3,3,2,2,...,2,,3,3,3,3,,3,,4
garages,1,2,1,0,2,3,2,1,0,2,...,1,0,1,1,2,,,1,0,0


In [599]:
train_set.head()

Unnamed: 0,id,antiguedad,habitaciones,garages,banos,metroscubiertos,metrostotales,idzona,lat,lng,gimnasio,usosmultiples,piscina,escuelascercanas,centroscomercialescercanos,precio,anio_publ,c_hab_ant
0,254099,,2.0,1.0,2.0,80.0,80.0,23533.0,,,0.0,0.0,0.0,0.0,0.0,2273000.0,2015,
1,53461,10.0,3.0,2.0,2.0,268.0,180.0,24514.0,19.310205,-99.227655,0.0,0.0,0.0,1.0,1.0,3600000.0,2013,28844.0
2,247984,5.0,3.0,2.0,2.0,144.0,166.0,48551.0,,,0.0,0.0,0.0,0.0,0.0,1200000.0,2015,33268.0
3,209067,1.0,2.0,1.0,1.0,63.0,67.0,53666.0,19.30189,-99.688015,0.0,0.0,0.0,1.0,1.0,650000.0,2012,12353.0
4,185997,10.0,2.0,1.0,1.0,95.0,95.0,47835.0,,,0.0,0.0,0.0,0.0,0.0,1150000.0,2016,28844.0


In [600]:
train_set = train_set.merge(data[['id','antiguedad_prov']], on = 'id', how = 'left')
test_set = test_set.merge(data[['id','antiguedad_prov']], on = 'id', how = 'left')

train_set.transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,239990,239991,239992,239993,239994,239995,239996,239997,239998,239999
id,254099.0,53461.0,247984.0,209067.0,185997.0,126147.0,139233.0,5013.0,44962.0,134537.0,...,87498.0,137337.0,54886.0,207892.0,110268.0,119879.0,259178.0,131932.0,146867.0,121958.0
antiguedad,,10.0,5.0,1.0,10.0,5.0,,2.0,1.0,,...,20.0,20.0,10.0,1.0,3.0,0.0,0.0,20.0,20.0,1.0
habitaciones,2.0,3.0,3.0,2.0,2.0,2.0,3.0,4.0,2.0,,...,3.0,2.0,3.0,3.0,2.0,2.0,3.0,2.0,4.0,3.0
garages,1.0,2.0,2.0,1.0,1.0,1.0,1.0,2.0,1.0,,...,2.0,1.0,,2.0,2.0,2.0,3.0,1.0,0.0,2.0
banos,2.0,2.0,2.0,1.0,1.0,1.0,2.0,3.0,1.0,,...,1.0,1.0,3.0,3.0,2.0,1.0,3.0,2.0,4.0,2.0
metroscubiertos,80.0,268.0,144.0,63.0,95.0,75.0,140.0,293.0,58.0,250.0,...,118.0,68.0,270.0,150.0,160.0,67.0,200.0,138.0,235.0,149.0
metrostotales,80.0,180.0,166.0,67.0,95.0,90.0,160.0,293.0,,,...,124.0,68.0,293.0,150.0,90.0,,250.0,138.0,137.0,
idzona,23533.0,24514.0,48551.0,53666.0,47835.0,23650.0,73510.0,130510.0,9010.0,59171.0,...,72083.0,24118.0,55589.0,48046.0,48029.0,53666.0,51954.0,50003995.0,24162.0,23750.0
lat,,19.3102,,19.3019,,19.3006,17.1435,20.6726,,19.316,...,25.6612,19.4033,,20.6156,20.6045,,19.2947,,19.3667,
lng,,-99.2277,,-99.688,,-99.1485,-96.8035,-87.038,,-98.887,...,-100.251,-99.119,,-103.456,-103.45,,-99.6929,,-99.0822,


In [601]:
test_set.transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,59990,59991,59992,59993,59994,59995,59996,59997,59998,59999
id,4941.0,51775.0,115253.0,299321.0,173570.0,30862.0,244471.0,127794.0,71558.0,218011.0,...,205625.0,284266.0,70244.0,59776.0,79100.0,75094.0,171847.0,138313.0,271268.0,72612.0
antiguedad,29.0,,0.0,2.0,10.0,10.0,20.0,0.0,2.0,20.0,...,,5.0,6.0,,,20.0,10.0,5.0,0.0,0.0
habitaciones,3.0,1.0,2.0,2.0,2.0,3.0,3.0,2.0,3.0,3.0,...,3.0,2.0,3.0,,,4.0,3.0,3.0,2.0,3.0
garages,,1.0,1.0,2.0,1.0,1.0,,1.0,2.0,1.0,...,,1.0,3.0,0.0,,3.0,1.0,2.0,1.0,2.0
banos,4.0,1.0,2.0,2.0,1.0,2.0,2.0,1.0,2.0,2.0,...,3.0,2.0,2.0,,,3.0,2.0,2.0,2.0,2.0
metroscubiertos,300.0,67.0,87.0,86.0,80.0,165.0,220.0,62.0,260.0,123.0,...,316.0,68.0,270.0,,199.0,291.0,71.0,102.0,130.0,211.0
metrostotales,,67.0,100.0,86.0,76.0,138.0,190.0,62.0,200.0,160.0,...,,68.0,170.0,200.0,199.0,,87.0,,144.0,130.0
idzona,,113851.0,23620.0,129347.0,57125.0,48216.0,323485.0,54688.0,107969.0,50002836.0,...,55552.0,50003995.0,24940.0,49037.0,24597.0,275741.0,57474.0,72224.0,83960.0,47747.0
lat,19.4087,21.0325,19.3328,16.8605,19.6405,,,,19.17,21.158,...,,,19.2812,20.5312,19.4409,19.4348,,,20.5918,20.656
lng,-99.2468,-89.5924,-99.1529,-99.8784,-99.1273,,,,-96.1527,-86.8385,...,,,-99.1422,-103.487,-99.1885,-99.0925,,,-100.328,-103.427


### Usos multiples piscina gimnasio según tipo de casa

In [602]:
data.head()

Unnamed: 0,id,titulo,descripcion,tipodepropiedad,direccion,ciudad,provincia,antiguedad,habitaciones,garages,...,lng,fecha,gimnasio,usosmultiples,piscina,escuelascercanas,centroscomercialescercanos,precio,anio,antiguedad_prov
0,254099,depto. tipo a-402,"depto. interior de 80.15m2, consta de sala com...",Apartamento,Avenida Division del Norte 2005,Benito Juárez,Distrito Federal,,2.0,1.0,...,,2015-08-23,0.0,0.0,0.0,0.0,0.0,2273000.0,2015,12.189608
1,53461,condominio horizontal en venta,"<p>entre sonora y guerrero, atr&aacute;s del h...",Casa en condominio,AV. MEXICO,La Magdalena Contreras,Distrito Federal,10.0,3.0,2.0,...,-99.227655,2013-06-28,0.0,0.0,0.0,1.0,1.0,3600000.0,2013,12.189608
2,126147,departamento en venta taxqueña,"amplio departamento, estancia de sala y comedo...",Apartamento,Condominio Tlalpan 2B,Coyoacán,Distrito Federal,5.0,2.0,1.0,...,-99.148475,2014-03-18,0.0,0.0,0.0,0.0,1.0,1100000.0,2014,12.189608
3,130216,casa matias romero - sión bancaria,flamante casa habitación en condominio horizon...,Casa,"MATIAS ROMERO, COLONIA DEL VALLE",Benito Juárez,Distrito Federal,20.0,,0.0,...,-99.149519,2015-03-31,0.0,0.0,0.0,0.0,0.0,3000000.0,2015,12.189608
4,73348,"oportunidad, departamento col del valle, 3 rec...",magnifico departamento con excelente distribuc...,Apartamento,Pazaje Santa Cruz,Benito Juárez,Distrito Federal,5.0,3.0,2.0,...,,2016-10-30,0.0,0.0,0.0,1.0,1.0,4750000.0,2016,12.189608


### ¿Qué relación existe entre el precio, los metros cuadrados y la cantidad de habitaciones de los departamentos?

In [603]:
# Seteamos el precio en dolares
prop = data
cotizacion_hoy = 19.55
prop['precio_USD'] = prop['precio'] / cotizacion_hoy

print(prop['precio_USD'].head(2))
print(prop['precio'].head(2))

prop['precioXmt'] = prop['precio_USD']/prop['metrostotales']

0    116265.984655
1    184143.222506
Name: precio_USD, dtype: float64
0    2273000.0
1    3600000.0
Name: precio, dtype: float64


In [604]:
deptos = prop[prop['tipodepropiedad'] == 'Apartamento']
deptos[['antiguedad','precioXmt','metrostotales','precio_USD','habitaciones']].describe()
# Vemos que el 75% de los departamentos tiene hasta 3 habitaciones

Unnamed: 0,antiguedad,precioXmt,metrostotales,precio_USD,habitaciones
count,48282.0,29334.0,29334.0,57300.0,56311.0
mean,8.362765,1162.81807,117.8904,141368.646323,2.386496
std,9.524558,746.274367,67.68525,119853.1032,0.630942
min,0.0,44.757033,15.0,15856.777494,1.0
25%,1.0,660.311555,70.0,53708.439898,2.0
50%,5.0,1037.677262,97.0,101747.519182,2.0
75%,10.0,1515.582078,145.0,190899.936061,3.0
max,80.0,37510.656436,438.0,640409.207161,10.0


In [605]:
(prop.groupby('habitaciones').count()['id'])/prop['habitaciones'].count()*100
r = deptos[['id','precioXmt','metrostotales','habitaciones']]
# Sacamos los NaN
r = r[~r['metrostotales'].isna()]
# Sacamos los que tienen menos de max_hab habitaciones, ya que son casos aislados
max_hab = 5
r = r[r['habitaciones'] < max_hab]
r.head()

Unnamed: 0,id,precioXmt,metrostotales,habitaciones
0,254099,1453.324808,80.0,2.0
2,126147,625.177607,90.0,2.0
4,73348,1913.124031,127.0,3.0
8,116928,511.508951,68.0,2.0
10,198249,897.861457,47.0,2.0


In [606]:
desc=r.sort_values(by=['precioXmt'],ascending=False).describe()

In [607]:
desc

Unnamed: 0,id,precioXmt,metrostotales,habitaciones
count,28793.0,28793.0,28793.0,28793.0
mean,150699.235856,1164.850604,117.416212,2.374952
std,86492.61404,745.488473,67.093628,0.595669
min,9.0,44.757033,15.0,1.0
25%,76008.0,662.385692,70.0,2.0
50%,151251.0,1039.976037,97.0,2.0
75%,225831.0,1518.542199,144.0,3.0
max,299994.0,37510.656436,438.0,4.0


In [608]:
mean_precioxmt = desc.loc['mean','precioXmt']

In [609]:
def factor_precioXmt(row):
    print(row)
    if(row > mean_precioxmt):
        return 1
    return 0.5

In [610]:
r.iloc[0,0] > mean_precioxmt

True

In [611]:
r['factor_precioXmt'] = r['precioXmt'].apply(factor_precioXmt)
r.transpose().head()

1453.3248081841432
625.177607274794
1913.1240308516421
511.50895140664966
897.861457256353
1790.2813299232737
582.9697607943433
796.3612417109514
1760.9324556622362
563.9714079611778
1638.2460789449358
886.6155157715259
1552.072800808898
1065.6436487638534
2097.4573404240923
1842.9366631563112
3239.556692242114
866.4335299337126
1378.8502168353164
621.1180124223603
1023.0179028132993
1046.2683096954197
1886.7133453523961
4049.4458653026427
1680.7729099544956
853.9598595517793
468.0789460985378
1023.0179028132992
1162.5203441060219
1078.9641943734016
415.50265591186303
2794.8427241806626
1937.5339068433698
1148.2854011169686
782.3078080336993
1627.5284817484305
1377.1394845563643
823.7287009665527
639.386189258312
890.0255754475703
1491.9011082693946
1784.3920204603578
1867.007672634271
1790.2813299232735
1495.9224050571827
683.3109243697478
1339.6663013031298
3551.240490657631
2355.8640847643405
2250.6393861892584
841.2225902477883
459.16112531969304
1560.1023017902812
1985.85828193169

2219.245185997104
1193.5208866155158
1598.46547314578
2127.2997277452355
1417.4344436569809
1238.5823894775301
1271.099744245524
1492.4025876335188
727.9165846940782
1680.8422054362695
944.324217981507
690.1311249137336
2534.911617590476
1875.532821824382
1851.95572957925
1558.8844233345512
1338.8746803069052
1364.0238704177323
966.7519181585677
2629.9280992134345
3288.271830471319
1478.9280551540087
645.1099744245523
984.098743467141
2527.4559951857977
1587.9680879490018
2210.223864102807
934.0598243077949
2046.0358056265984
728.6906209383254
2311.0344190059473
1177.3922259855763
1227.621483375959
1132.6269638290098
1712.449762513701
2234.6547314578006
940.0868657213334
771.198111351564
1234.67677925743
557.4136008918617
1058.768528449248
1587.4415733309816
1143.372950203099
520.0341005967605
831.0997442455242
1406.6496163682864
1343.7947028479778
1149.0418473627637
542.1994884910486
1416.4863269722605
779.7392551930634
1108.2693947144073
1499.2503748125937
1432.225063938619
946.62370

468.8832054560955
1255.5219716345036
673.0380939561179
2872.8584942017305
2841.716396703609
1575.860910899274
1743.780516159033
1329.9232736572887
2156.361265733915
1266.5935939593228
1029.930185940416
847.1867007672633
1267.1471750755638
1860.0325505696349
1590.6680805938495
809.8891730605285
867.7383997077092
1877.6910874421314
687.6953680022733
1782.5311942959
730.7270734380709
415.56156375593713
1495.1800118040526
2232.7771688385496
1477.6925262858765
599.4245524296675
699.0622335890878
1918.645719157228
3145.303490016044
1455.6083302886373
563.5268108717326
1390.945394175977
698.4742922656319
2499.9099456071467
1321.3981244671781
1585.677749360614
883.5154615205765
1056.3771822528633
727.7566381801926
1440.4562424670019
1514.3357114012654
813.2992327365729
2088.6615515771527
1023.0179028132993
615.7052192857818
444.34110930274613
748.9018296281723
1925.6807582367985
2527.4559951857977
511.50895140664954
1334.3711775825643
1065.6436487638534
666.0272804774083
1923.5315609619806
644

1462.9156010230179
606.8750270926352
2357.354835654068
1338.250163563909
1323.1113221128076
1370.113262696383
1989.2014776925262
336.4592213697073
1739.1304347826085
3943.049318323701
852.5149190110826
2365.173913043478
827.7144850034875
2119.1085129704056
876.8724881256851
1631.5371725901755
2799.901902392881
1091.219096334186
953.2666821669378
2185.538246919321
1712.159962799349
1308.001461454147
841.6647291327598
169.37382496908927
548.0453050785532
1704.7058823529412
1936.739021137998
1878.1969309462916
575.1187431494336
913.4088417975886
1133.3433629206158
1045.5844741988867
413.1418453669093
1411.0591762942058
2027.9589545256065
1583.5164439321347
1395.0244129272262
1156.1503696177697
1587.4415733309816
708.1557260585394
993.2668719661777
1989.2014776925264
647.9113384484228
2822.1183525884117
1956.5217391304348
577.5101064268625
2666.518403202491
696.7104682952641
786.9368483179225
1568.6274509803923
1884.2710997442452
1881.312583987169
2102.0915811232176
2088.6615515771527
613.

1249.8968731952807
1190.3978121597327
1852.5961109081313
1445.5687757144444
1666.7145607632403
1417.1676575768647
2282.116860121975
926.0075844430725
461.38107416879797
700.6971937077392
1316.3833308259364
1392.4410343847683
1358.695652173913
962.8403791183993
2317.2701574909142
1841.4322250639386
1902.622080933145
564.2836844882881
2078.005115089514
1156.984532943612
429.66751918158565
1172.5359039937046
2420.042350741138
1089.7364616924274
863.1713554987213
1892.5831202046036
622.1054814405197
489.74261304891985
347.82608695652175
677.9034295750778
808.101514853433
1492.032264410781
645.1099744245523
572.8900255754476
1391.6052354445615
3605.3462800005777
1195.3741799177137
1776.4981545859687
1412.739008646937
1143.372950203099
1136.6865586814436
2785.4447848876957
1346.0761879122358
1424.917793204238
2192.1812203142126
3098.6504527980183
2046.0358056265984
1346.9735720375106
407.31268352751727
2080.7095359883083
814.1199204319408
562.6598465473146
2397.69820971867
1150.8951406649617

913.4088417975886
1845.444059976932
2348.7655931937993
827.4409508048744
776.0825469618133
1482.2134387351778
1065.6436487638534
1687.9795396419438
2662.187028657617
991.2964174547473
1110.8640053374845
1062.3647452291955
817.3023462693207
468.8832054560955
403.9022449559534
804.5477795861427
1269.9532586647852
2114.523340036313
1841.4322250639386
967.774936061381
873.8277919863597
795.6805910770105
1747.6555839727193
1747.167797175581
1849.3015935471178
1950.9476031215158
452.6854219948849
1091.219096334186
634.9766293323927
1632.117377843586
630.1197227473219
1805.8761150271348
2036.4748906470347
669.2237851662404
986.4815491413957
1274.2053343076361
1475.1131221719456
1302.0227853987446
2995.9810010960905
358.0562659846547
1278.772378516624
2473.3457526864745
968.45694799659
1697.8658471061058
2941.176470588235
1741.3070686183814
2036.0897982381357
1366.963577035012
678.4223987077669
1136.6865586814436
1355.9637293652638
1099.7442455242965
715.3132992327365
296.7516431537552
850.599

852.5149190110826
1240.0217003797566
895.1406649616367
2297.455459707833
2036.9825498494897
742.7109974424552
843.0425310220706
1228.9500780549374
1684.9706634571987
2046.0358056265984
2226.0230179028135
944.324217981507
2557.544757033248
2323.511894334863
1720.3904671935363
1023.0179028132992
1924.8889487144972
1751.3621705771154
3580.562659846547
881.9119851838786
971.8670076726343
1480.900915766026
1203.550473897999
330.18341249543295
1861.8925831202046
1725.4400481420191
1314.7552606923098
1545.646614033137
955.5661729574773
1278.772378516624
2316.2669497659604
1612.8660629939402
1544.1639793913785
767.2634271099745
381.60191612877037
479.539641943734
808.635474650218
1477.6925262858765
2013.814769317518
625.177607274794
1931.5722640530826
639.386189258312
382.3529411764706
2728.0477408354645
2443.027827613849
540.997442455243
810.9288254007859
1422.9948272215063
1144.3373335956458
621.1180124223603
468.8832054560955
982.4906551249261
2092.5366193908394
480.8184143222506
2036.10359

737.1746652625244
908.8417975886006
1104.2144889220954
1395.0244129272262
841.0717657357661
903.3908093391634
2048.1407807352675
851.7429395439951
1108.2693947144073
3097.2996313149765
1647.64902616565
2966.751918158568
696.2205171923841
1992.192758110109
1135.3531379106826
1967.3421207948063
2242.6331591237627
592.2735226813837
1241.1614262073115
836.1204013377926
963.3418584825234
830.7955669224211
861.6958489081252
1061.1089949393263
1486.2712927664913
860.8321377331421
2407.1009477959983
452.0912449301196
1635.639088800333
886.6155157715259
584.5816587504567
887.3114463176574
1309.219339909877
767.2634271099745
988.1422924901185
1726.3427109974425
1632.4753768297328
1603.7310064690837
1117.4145547547082
1168.1758484827537
1176.470588235294
1023.0179028132992
483.8438192668372
710.4290991759023
1023.0179028132992
819.7258836645026
2131.2872975277064
1463.9738954052386
1977.8346121057118
1473.51114358787
5115.089514066496
895.1406649616368
1416.4863269722605
1835.64649048025
1487.823

1669.9262825334736
818.4143222506394
497.30036942313154
393.8618925831202
841.1480534242683
1332.6154260331134
262.85876669508383
626.8492051552079
644.3522955826484
1096.0906101571063
1309.219339909877
1116.0195303417809
390.6068356196233
924.2036735642873
1158.1334748829802
2145.283811123411
1451.0151886841693
1449.899307853272
1731.683484460978
323.9556692242114
1058.2943822206544
328.1378178835111
1096.0906101571063
823.7986270022883
591.2246321453482
1232.7733720951626
956.0914979563544
376.52742256322813
411.03397880891487
692.0415224913495
511.5089514066496
639.386189258312
790.5138339920949
543.4782608695652
294.05465334559614
1592.4335279640977
1735.3414795870037
319.693094629156
1117.5825829052847
794.4713500571366
726.2164124909223
789.3656657510024
1172.2080136402385
827.4409508048744
898.3375959079283
1432.225063938619
323.45418986008724
1077.9734099892203
1788.3583639405417
2277.074637355772
358.0562659846547
1315.3087321885278
263.5046113306983
738.3879217886313
1381.738

366.58141517476554
1188.9126978641043
1327.9559315364943
995.4515521386893
1279.0892424688227
865.7836337038622
322.061191626409
1132.3480603658656
705.7089600762929
1730.6693844585889
1315.2120702380746
779.7392551930634
639.386189258312
997.4424552429667
664.9616368286445
810.9288254007859
285.23126273353853
342.5283156740957
1687.9795396419436
1534.526854219949
1231.4104385715639
1058.768528449248
1377.6829492306172
1580.5051869306587
1696.5046888320544
1231.4104385715639
1093.5708616280094
971.8670076726343
639.386189258312
2454.6869787612586
852.5149190110827
1621.8576508015717
358.9536501099295
767.2634271099745
666.0577274388016
1203.550473897999
934.0598243077949
495.0086626515964
773.7110189344279
1060.9074547693474
460.35805626598466
945.6468009198564
1672.8858328791246
953.7012692114636
649.222899862286
734.3065119024199
392.68148731064326
582.1779512720419
1109.8779134295228
1616.8386395037774
1031.2680471908259
956.7793335663948
159.9112952831105
1531.0392931876306
460.358

346.6894003978403
1905.5425837827809
1412.739008646937
449.29840326259756
939.5062372775197
478.1496719670855
682.0119352088661
642.8801028608165
528.2652791251433
1453.0643175144453
1049.2940475773394
1322.867977775818
1977.6363528222207
1417.4344436569809
842.0759608191103
1420.8581983518045
1818.6788958461943
3069.0537084398975
1162.5203441060219
616.8196178727245
1652.567381467637
454.6746234725774
948.0553840726694
828.9313287281592
154.82279868469126
949.945195469492
1784.3335514185453
1201.7378978830923
972.978983653953
622.3358908780904
1730.1038062283735
886.2778861006304
716.1125319693095
724.6376811594203
365.36353671903544
652.1739130434783
767.2634271099744
1652.567381467637
1391.6052354445615
1145.7800511508951
795.2029916826065
452.4886877828054
1879.0124745550393
664.9616368286445
1271.8600953895073
294.52341137123744
338.83614663256606
626.8492051552079
639.386189258312
1227.621483375959
831.2020460358056
1393.7182102933991
1153.7623433073754
968.7669534216849
1341.151

933.5038363171356
461.77891446433637
694.9849883242522
682.9591740077674
409.20716112531966
506.39386189258306
433.8691998538546
1784.3335514185453
399.22649865884847
786.4450127877237
386.2548363827136
383.63171355498724
292.9551267147175
961.5015738736965
901.6832213168382
1445.7366211455872
1529.6320317184498
351.66240409207165
517.9028132992328
682.9591740077674
852.5149190110827
860.2650546384561
1420.8581983518045
1000.6949849883241
683.7000835633436
460.35805626598466
805.8017727639001
964.3201542912246
590.1297717154495
259.16453537936917
1272.9265619291193
259.4081110705152
416.14287572066405
333.39422725611985
2251.141304347826
485.93350383631713
1166.8797953964195
682.9591740077674
1326.1343184616842
1055.3845709249065
631.8639987964496
1465.0013656792391
735.2941176470588
1705.0298380221652
383.63171355498724
757.2989670176371
1783.4154379580837
767.2634271099745
210.99744245524298
1455.8331693881564
784.5229316052908
351.66240409207165
678.5322824782086
430.7443801319155
3

279.7314578005115
185.49225710351033
2104.2909917590223
735.3854585312386
161.0305958132045
160.69529004368198
290.16507788886304
310.25952790239404
334.4481605351171
473.14578005115084
414.6632566069906
317.32499763190305
347.82608695652175
161.0305958132045
261.5670774238549
477.4083546462063
361.06514216939973
474.972597734746
277.8022753329218
342.76373032404354
297.6052080911416
354.1215817430651
299.85007496251876
191.40334955861726
229.8916635535504
248.00434007595132
286.4450127877238
138.1668946648427
1179.235501486141
180.7719135084864
315.4305200341006
332.48081841432224
291.8936950961859
572.4028741931555
383.6317135549872
254.7314578005115
292.9551267147175
337.5959079283888
358.53431173363293
326.3074345180351
350.748995250274
151.98358413132695
568.3432793407218
338.87468030690536
524.1462313825785
470.38764354846796
837.5004771538726
976.6624040920716
200.7759625589366
1041.2048877522022
752.2190461862494
1141.0584300609876
2279.808078459555
904.8836926074778
791.240409

Unnamed: 0,0,2,4,8,10,15,16,17,19,22,...,239315,239316,239318,239460,239570,239613,239695,239711,239746,239759
id,254099.0,126147.0,73348.0,116928.0,198249.0,171944.0,32302.0,190533.0,200567.0,11713.0,...,298217.0,245556.0,227908.0,53811.0,166523.0,270122.0,8701.0,70791.0,139143.0,215872.0
precioXmt,1453.324808,625.177607,1913.124031,511.508951,897.861457,1790.28133,582.969761,796.361242,1760.932456,563.971408,...,343.563512,251.937245,262.574595,264.139868,1031.514729,482.555615,250.639386,1483.375959,345.268542,86.757702
metrostotales,80.0,90.0,127.0,68.0,47.0,100.0,68.0,167.0,122.0,39.0,...,60.0,67.0,75.0,61.0,301.0,371.0,200.0,135.0,200.0,283.0
habitaciones,2.0,2.0,3.0,2.0,2.0,2.0,2.0,3.0,2.0,2.0,...,2.0,3.0,2.0,2.0,3.0,2.0,3.0,2.0,2.0,2.0
factor_precioXmt,1.0,0.5,1.0,0.5,0.5,1.0,0.5,0.5,1.0,0.5,...,0.5,0.5,0.5,0.5,0.5,0.5,0.5,1.0,0.5,0.5


In [612]:
r.head()

Unnamed: 0,id,precioXmt,metrostotales,habitaciones,factor_precioXmt
0,254099,1453.324808,80.0,2.0,1.0
2,126147,625.177607,90.0,2.0,0.5
4,73348,1913.124031,127.0,3.0,1.0
8,116928,511.508951,68.0,2.0,0.5
10,198249,897.861457,47.0,2.0,0.5


In [613]:
test_set.transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,59990,59991,59992,59993,59994,59995,59996,59997,59998,59999
id,4941.0,51775.0,115253.0,299321.0,173570.0,30862.0,244471.0,127794.0,71558.0,218011.0,...,205625.0,284266.0,70244.0,59776.0,79100.0,75094.0,171847.0,138313.0,271268.0,72612.0
antiguedad,29.0,,0.0,2.0,10.0,10.0,20.0,0.0,2.0,20.0,...,,5.0,6.0,,,20.0,10.0,5.0,0.0,0.0
habitaciones,3.0,1.0,2.0,2.0,2.0,3.0,3.0,2.0,3.0,3.0,...,3.0,2.0,3.0,,,4.0,3.0,3.0,2.0,3.0
garages,,1.0,1.0,2.0,1.0,1.0,,1.0,2.0,1.0,...,,1.0,3.0,0.0,,3.0,1.0,2.0,1.0,2.0
banos,4.0,1.0,2.0,2.0,1.0,2.0,2.0,1.0,2.0,2.0,...,3.0,2.0,2.0,,,3.0,2.0,2.0,2.0,2.0
metroscubiertos,300.0,67.0,87.0,86.0,80.0,165.0,220.0,62.0,260.0,123.0,...,316.0,68.0,270.0,,199.0,291.0,71.0,102.0,130.0,211.0
metrostotales,,67.0,100.0,86.0,76.0,138.0,190.0,62.0,200.0,160.0,...,,68.0,170.0,200.0,199.0,,87.0,,144.0,130.0
idzona,,113851.0,23620.0,129347.0,57125.0,48216.0,323485.0,54688.0,107969.0,50002836.0,...,55552.0,50003995.0,24940.0,49037.0,24597.0,275741.0,57474.0,72224.0,83960.0,47747.0
lat,19.4087,21.0325,19.3328,16.8605,19.6405,,,,19.17,21.158,...,,,19.2812,20.5312,19.4409,19.4348,,,20.5918,20.656
lng,-99.2468,-89.5924,-99.1529,-99.8784,-99.1273,,,,-96.1527,-86.8385,...,,,-99.1422,-103.487,-99.1885,-99.0925,,,-100.328,-103.427


In [614]:
train_set = train_set.merge(r[['id','factor_precioXmt']], on = 'id', how = 'left')
test_set = test_set.merge(r[['id','factor_precioXmt']], on = 'id', how = 'left')

In [615]:
test_set.transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,59990,59991,59992,59993,59994,59995,59996,59997,59998,59999
id,4941.0,51775.0,115253.0,299321.0,173570.0,30862.0,244471.0,127794.0,71558.0,218011.0,...,205625.0,284266.0,70244.0,59776.0,79100.0,75094.0,171847.0,138313.0,271268.0,72612.0
antiguedad,29.0,,0.0,2.0,10.0,10.0,20.0,0.0,2.0,20.0,...,,5.0,6.0,,,20.0,10.0,5.0,0.0,0.0
habitaciones,3.0,1.0,2.0,2.0,2.0,3.0,3.0,2.0,3.0,3.0,...,3.0,2.0,3.0,,,4.0,3.0,3.0,2.0,3.0
garages,,1.0,1.0,2.0,1.0,1.0,,1.0,2.0,1.0,...,,1.0,3.0,0.0,,3.0,1.0,2.0,1.0,2.0
banos,4.0,1.0,2.0,2.0,1.0,2.0,2.0,1.0,2.0,2.0,...,3.0,2.0,2.0,,,3.0,2.0,2.0,2.0,2.0
metroscubiertos,300.0,67.0,87.0,86.0,80.0,165.0,220.0,62.0,260.0,123.0,...,316.0,68.0,270.0,,199.0,291.0,71.0,102.0,130.0,211.0
metrostotales,,67.0,100.0,86.0,76.0,138.0,190.0,62.0,200.0,160.0,...,,68.0,170.0,200.0,199.0,,87.0,,144.0,130.0
idzona,,113851.0,23620.0,129347.0,57125.0,48216.0,323485.0,54688.0,107969.0,50002836.0,...,55552.0,50003995.0,24940.0,49037.0,24597.0,275741.0,57474.0,72224.0,83960.0,47747.0
lat,19.4087,21.0325,19.3328,16.8605,19.6405,,,,19.17,21.158,...,,,19.2812,20.5312,19.4409,19.4348,,,20.5918,20.656
lng,-99.2468,-89.5924,-99.1529,-99.8784,-99.1273,,,,-96.1527,-86.8385,...,,,-99.1422,-103.487,-99.1885,-99.0925,,,-100.328,-103.427


In [616]:
#train_set = train_set.dropna(0)
train_set.replace(0, np.nan, inplace=True)
train_set.head()

Unnamed: 0,id,antiguedad,habitaciones,garages,banos,metroscubiertos,metrostotales,idzona,lat,lng,gimnasio,usosmultiples,piscina,escuelascercanas,centroscomercialescercanos,precio,anio_publ,c_hab_ant,antiguedad_prov,factor_precioXmt
0,254099,,2.0,1.0,2.0,80.0,80.0,23533.0,,,,,,,,2273000.0,2015,,12.189608,1.0
1,53461,10.0,3.0,2.0,2.0,268.0,180.0,24514.0,19.310205,-99.227655,,,,1.0,1.0,3600000.0,2013,28844.0,12.189608,
2,247984,5.0,3.0,2.0,2.0,144.0,166.0,48551.0,,,,,,,,1200000.0,2015,33268.0,6.635517,
3,209067,1.0,2.0,1.0,1.0,63.0,67.0,53666.0,19.30189,-99.688015,,,,1.0,1.0,650000.0,2012,12353.0,10.027958,
4,185997,10.0,2.0,1.0,1.0,95.0,95.0,47835.0,,,,,,,,1150000.0,2016,28844.0,6.635517,0.5


In [617]:
#test_set = test_set.dropna(0)
test_set.replace(0, np.nan, inplace=True)
test_set.head()

Unnamed: 0,id,antiguedad,habitaciones,garages,banos,metroscubiertos,metrostotales,idzona,lat,lng,gimnasio,usosmultiples,piscina,escuelascercanas,centroscomercialescercanos,anio_publ,c_hab_ant,antiguedad_prov,factor_precioXmt
0,4941.0,29.0,3.0,,4.0,300.0,,,19.408668,-99.246767,,,,,,2013,301.0,,
1,51775.0,,1.0,1.0,1.0,67.0,67.0,113851.0,21.03248,-89.592424,,,,,,2015,,,
2,115253.0,,2.0,1.0,2.0,87.0,100.0,23620.0,19.332829,-99.152913,,,,,1.0,2015,50335.0,,
3,299321.0,2.0,2.0,2.0,2.0,86.0,86.0,129347.0,16.860487,-99.878383,,,,,,2015,5059.0,,
4,173570.0,10.0,2.0,1.0,1.0,80.0,76.0,57125.0,19.640482,-99.127273,,,,1.0,1.0,2013,28844.0,,


# Se exporta para entrenar

## Se sacan las columnas 'id' de train_set

In [618]:
#test_set = test_set.loc[:,test_set.columns != 'id']
train_set = train_set.loc[:,train_set.columns != 'id']

## NO hace falta eliminar NULLs

In [619]:
len(train_set)

240000

In [620]:
train_set.to_csv("train_set_xgb.csv", index = False)

In [621]:
test_set.head()

Unnamed: 0,id,antiguedad,habitaciones,garages,banos,metroscubiertos,metrostotales,idzona,lat,lng,gimnasio,usosmultiples,piscina,escuelascercanas,centroscomercialescercanos,anio_publ,c_hab_ant,antiguedad_prov,factor_precioXmt
0,4941.0,29.0,3.0,,4.0,300.0,,,19.408668,-99.246767,,,,,,2013,301.0,,
1,51775.0,,1.0,1.0,1.0,67.0,67.0,113851.0,21.03248,-89.592424,,,,,,2015,,,
2,115253.0,,2.0,1.0,2.0,87.0,100.0,23620.0,19.332829,-99.152913,,,,,1.0,2015,50335.0,,
3,299321.0,2.0,2.0,2.0,2.0,86.0,86.0,129347.0,16.860487,-99.878383,,,,,,2015,5059.0,,
4,173570.0,10.0,2.0,1.0,1.0,80.0,76.0,57125.0,19.640482,-99.127273,,,,1.0,1.0,2013,28844.0,,


In [622]:
len(test_set)

60000

In [623]:
test_set.to_csv("test_set_xgb.csv", index = False)