In [113]:
# Importamos librerías de análisis de datos
%matplotlib inline
import numpy as np
import pandas as pd
# Notebook-wide pandas settings, written through the attribute-style options API.
pd.options.display.max_columns = 100              # show up to 100 columns when rendering wide frames
pd.options.display.float_format = '{:.2f}'.format  # render floats with two decimals
# Silences SettingWithCopyWarning for the whole notebook. Careful: this also hides
# genuine chained-assignment mistakes.
pd.options.mode.chained_assignment = None

In [114]:
# Toy example: one-hot encode a single categorical column that contains a missing value,
# first without and then with a dedicated indicator column for NaN.
color_values = ['red', 'red', 'green', 'blue', np.nan]
one_hot_encoding_example_df = pd.DataFrame({'color': color_values})
display(one_hot_encoding_example_df)
for include_nan_column in (False, True):
    display(pd.get_dummies(one_hot_encoding_example_df, dummy_na=include_nan_column))

Unnamed: 0,color
0,red
1,red
2,green
3,blue
4,


Unnamed: 0,color_blue,color_green,color_red
0,0,0,1
1,0,0,1
2,0,1,0
3,1,0,0
4,0,0,0


Unnamed: 0,color_blue,color_green,color_red,color_nan
0,0,0,1,0
1,0,0,1,0
2,0,1,0,0
3,1,0,0,0
4,0,0,0,1


In [115]:
# Importamos utilidades y modelos de sklearn
from sklearn.dummy import DummyRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

try:
    # Legacy imputer: deprecated in scikit-learn 0.20 and removed in 0.22.
    from sklearn.preprocessing import Imputer
except ImportError:
    # Keep the old name usable on modern scikit-learn; a default SimpleImputer
    # also replaces NaN with the column mean.
    Imputer = SimpleImputer

In [116]:
# Load the training set, indexed by listing id, parsing 'fecha' as a datetime column.
# NOTE(review): this is a machine-specific absolute path; the notebook will not run
# elsewhere unless the file exists at this exact location.
df = pd.read_csv(
    '/home/alvaro/Escritorio/TP2/train.csv',
    index_col='id',
    parse_dates=['fecha'],
)

In [117]:
# Drop the string columns and the complex / redundant columns before modelling.
drop_cols = ['titulo', 'descripcion', 'direccion', 'lat', 'lng', 'fecha', 'idzona', 'ciudad']
df2 = df.drop(columns=drop_cols).copy()
remaining_columns = df2.columns.tolist()
print(f"Columnas ({len(remaining_columns)}): {remaining_columns}")
df2.head()

Columnas (14): ['tipodepropiedad', 'provincia', 'antiguedad', 'habitaciones', 'garages', 'banos', 'metroscubiertos', 'metrostotales', 'gimnasio', 'usosmultiples', 'piscina', 'escuelascercanas', 'centroscomercialescercanos', 'precio']


Unnamed: 0_level_0,tipodepropiedad,provincia,antiguedad,habitaciones,garages,banos,metroscubiertos,metrostotales,gimnasio,usosmultiples,piscina,escuelascercanas,centroscomercialescercanos,precio
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
254099,Apartamento,Distrito Federal,,2.0,1.0,2.0,80.0,80.0,0.0,0.0,0.0,0.0,0.0,2273000.0
53461,Casa en condominio,Distrito Federal,10.0,3.0,2.0,2.0,268.0,180.0,0.0,0.0,0.0,1.0,1.0,3600000.0
247984,Casa,Jalisco,5.0,3.0,2.0,2.0,144.0,166.0,0.0,0.0,0.0,0.0,0.0,1200000.0
209067,Casa,Edo. de México,1.0,2.0,1.0,1.0,63.0,67.0,0.0,0.0,0.0,1.0,1.0,650000.0
185997,Apartamento,Jalisco,10.0,2.0,1.0,1.0,95.0,95.0,0.0,0.0,0.0,0.0,0.0,1150000.0


In [118]:
# Count missing values per column once and reuse the result below.
null_counts = df2.isnull().sum()
display(null_counts)

# Columns excluded from numeric imputation (they are one-hot encoded instead).
categorical_columns = {'tipodepropiedad', 'ciudad', 'provincia'}

# Build the list in DataFrame column order. The previous set arithmetic produced an
# arbitrary order that could change from one kernel run to the next.
numeric_columns_with_nulls = [
    column for column in null_counts.index[null_counts > 0]
    if column not in categorical_columns
]
print(numeric_columns_with_nulls)

tipodepropiedad                  46
provincia                       155
antiguedad                    43555
habitaciones                  22471
garages                       37765
banos                         26221
metroscubiertos               17400
metrostotales                 51467
gimnasio                          0
usosmultiples                     0
piscina                           0
escuelascercanas                  0
centroscomercialescercanos        0
precio                            0
dtype: int64

['metroscubiertos', 'antiguedad', 'habitaciones', 'garages', 'metrostotales', 'banos']


In [119]:
# With dummy_na=True the "is null" category becomes one more coordinate of the one-hot vectors.
# To discuss: is there a leak here? Yes / No? Why?
df2 = pd.get_dummies(df2, dummy_na=True)
encoded_column_count = df2.shape[1]
print(f"Cantidad de columnas después del one-hot-encoding: {encoded_column_count}")
df2.head(10)

Cantidad de columnas después del one-hot-encoding: 70


Unnamed: 0_level_0,antiguedad,habitaciones,garages,banos,metroscubiertos,metrostotales,gimnasio,usosmultiples,piscina,escuelascercanas,centroscomercialescercanos,precio,tipodepropiedad_Apartamento,tipodepropiedad_Bodega comercial,tipodepropiedad_Casa,tipodepropiedad_Casa en condominio,tipodepropiedad_Casa uso de suelo,tipodepropiedad_Departamento Compartido,tipodepropiedad_Duplex,tipodepropiedad_Edificio,tipodepropiedad_Garage,tipodepropiedad_Hospedaje,tipodepropiedad_Huerta,tipodepropiedad_Inmuebles productivos urbanos,tipodepropiedad_Local Comercial,tipodepropiedad_Local en centro comercial,tipodepropiedad_Lote,tipodepropiedad_Nave industrial,tipodepropiedad_Oficina comercial,tipodepropiedad_Otros,tipodepropiedad_Quinta Vacacional,tipodepropiedad_Rancho,tipodepropiedad_Terreno,tipodepropiedad_Terreno comercial,tipodepropiedad_Terreno industrial,tipodepropiedad_Villa,tipodepropiedad_nan,provincia_Aguascalientes,provincia_Baja California Norte,provincia_Baja California Sur,provincia_Campeche,provincia_Chiapas,provincia_Chihuahua,provincia_Coahuila,provincia_Colima,provincia_Distrito Federal,provincia_Durango,provincia_Edo. de México,provincia_Guanajuato,provincia_Guerrero,provincia_Hidalgo,provincia_Jalisco,provincia_Michoacán,provincia_Morelos,provincia_Nayarit,provincia_Nuevo León,provincia_Oaxaca,provincia_Puebla,provincia_Querétaro,provincia_Quintana Roo,provincia_San luis Potosí,provincia_Sinaloa,provincia_Sonora,provincia_Tabasco,provincia_Tamaulipas,provincia_Tlaxcala,provincia_Veracruz,provincia_Yucatán,provincia_Zacatecas,provincia_nan
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1
254099,,2.0,1.0,2.0,80.0,80.0,0.0,0.0,0.0,0.0,0.0,2273000.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
53461,10.0,3.0,2.0,2.0,268.0,180.0,0.0,0.0,0.0,1.0,1.0,3600000.0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
247984,5.0,3.0,2.0,2.0,144.0,166.0,0.0,0.0,0.0,0.0,0.0,1200000.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
209067,1.0,2.0,1.0,1.0,63.0,67.0,0.0,0.0,0.0,1.0,1.0,650000.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
185997,10.0,2.0,1.0,1.0,95.0,95.0,0.0,0.0,0.0,0.0,0.0,1150000.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
126147,5.0,2.0,1.0,1.0,75.0,90.0,0.0,0.0,0.0,0.0,1.0,1100000.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
139233,,3.0,1.0,2.0,140.0,160.0,0.0,0.0,0.0,0.0,0.0,1150000.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
5013,2.0,4.0,2.0,3.0,293.0,293.0,0.0,0.0,0.0,0.0,0.0,4200000.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
44962,1.0,2.0,1.0,1.0,58.0,,0.0,0.0,0.0,1.0,1.0,310000.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
134537,,,,,250.0,,0.0,0.0,0.0,0.0,0.0,6200000.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [120]:
# Numeric nulls will be filled by a mean imputer (NaN replaced by the column average).
# To avoid leakage, the dataset is split before any imputation statistics are computed.
y = df2['precio']
X = df2.drop(columns='precio')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)
display(X_train)

Unnamed: 0_level_0,antiguedad,habitaciones,garages,banos,metroscubiertos,metrostotales,gimnasio,usosmultiples,piscina,escuelascercanas,centroscomercialescercanos,tipodepropiedad_Apartamento,tipodepropiedad_Bodega comercial,tipodepropiedad_Casa,tipodepropiedad_Casa en condominio,tipodepropiedad_Casa uso de suelo,tipodepropiedad_Departamento Compartido,tipodepropiedad_Duplex,tipodepropiedad_Edificio,tipodepropiedad_Garage,tipodepropiedad_Hospedaje,tipodepropiedad_Huerta,tipodepropiedad_Inmuebles productivos urbanos,tipodepropiedad_Local Comercial,tipodepropiedad_Local en centro comercial,tipodepropiedad_Lote,tipodepropiedad_Nave industrial,tipodepropiedad_Oficina comercial,tipodepropiedad_Otros,tipodepropiedad_Quinta Vacacional,tipodepropiedad_Rancho,tipodepropiedad_Terreno,tipodepropiedad_Terreno comercial,tipodepropiedad_Terreno industrial,tipodepropiedad_Villa,tipodepropiedad_nan,provincia_Aguascalientes,provincia_Baja California Norte,provincia_Baja California Sur,provincia_Campeche,provincia_Chiapas,provincia_Chihuahua,provincia_Coahuila,provincia_Colima,provincia_Distrito Federal,provincia_Durango,provincia_Edo. de México,provincia_Guanajuato,provincia_Guerrero,provincia_Hidalgo,provincia_Jalisco,provincia_Michoacán,provincia_Morelos,provincia_Nayarit,provincia_Nuevo León,provincia_Oaxaca,provincia_Puebla,provincia_Querétaro,provincia_Quintana Roo,provincia_San luis Potosí,provincia_Sinaloa,provincia_Sonora,provincia_Tabasco,provincia_Tamaulipas,provincia_Tlaxcala,provincia_Veracruz,provincia_Yucatán,provincia_Zacatecas,provincia_nan
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1
267694,20.00,2.00,3.00,2.00,315.00,315.00,0.00,0.00,0.00,0.00,0.00,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
224166,2.00,2.00,2.00,2.00,85.00,,0.00,0.00,0.00,1.00,1.00,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
147987,,4.00,3.00,4.00,130.00,,0.00,0.00,0.00,0.00,0.00,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
144982,4.00,4.00,3.00,4.00,408.00,,0.00,1.00,1.00,1.00,1.00,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
251965,0.00,3.00,1.00,1.00,194.00,120.00,0.00,0.00,0.00,1.00,1.00,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
188691,0.00,4.00,,,,377.00,0.00,1.00,0.00,1.00,1.00,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
206925,5.00,2.00,1.00,1.00,72.00,80.00,0.00,0.00,0.00,0.00,0.00,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
272675,10.00,3.00,3.00,3.00,,300.00,0.00,0.00,0.00,0.00,0.00,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
105474,20.00,2.00,,2.00,135.00,154.00,0.00,0.00,0.00,1.00,1.00,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [121]:
# Recompute the columns that still contain NaN after one-hot encoding.
# The list follows DataFrame column order; the previous set arithmetic produced an
# arbitrary order that could change from one kernel run to the next.
excluded_columns = {'tipodepropiedad', 'provincia', 'ciudad'}
numeric_columns_with_nulls = [
    column for column in df2.columns[df2.isnull().any()]
    if column not in excluded_columns
]
print(numeric_columns_with_nulls)

['metroscubiertos', 'antiguedad', 'habitaciones', 'garages', 'metrostotales', 'banos']


In [122]:
# Replace NaN in the numeric columns with the per-column mean.
# The means are learned from the training split only and then applied to the test split,
# so no test-set statistics leak into training.
# SimpleImputer replaces sklearn.preprocessing.Imputer (removed in scikit-learn 0.22) and
# handles all columns in one fit, computing each column's mean independently.
if numeric_columns_with_nulls:  # fitting on zero columns would raise
    imp = SimpleImputer(strategy='mean')
    X_train[numeric_columns_with_nulls] = imp.fit_transform(X_train[numeric_columns_with_nulls])
    X_test[numeric_columns_with_nulls] = imp.transform(X_test[numeric_columns_with_nulls])

print("Finished")

Finished




In [123]:
# Downcast the price targets to the smallest integer dtype that can hold their values.
# pd.to_numeric returns a new Series, so the result has to be assigned back; previously
# both results were discarded and y_train / y_test kept their original dtype.
y_train = pd.to_numeric(y_train, downcast='integer')
y_test = pd.to_numeric(y_test, downcast='integer')
y_test

id
245130    2300000
179086    2262945
87774     3800000
260997    1000000
109591     376200
           ...   
266839    2150000
169155    3300000
23033     9980000
19715      565000
202182    1910000
Name: precio, Length: 60000, dtype: int32

In [None]:
# Fit a regression tree on the imputed training split and score it on the held-out split
# (for scikit-learn regressors, `score` returns the R^2 coefficient).
# `presort` is no longer passed: it was deprecated in scikit-learn 0.22 and removed in 0.24,
# where passing it raises a TypeError. It only affected fitting speed, not the fitted tree.
# `random_state` is fixed so tie-breaking between equally good splits is reproducible.
tree = DecisionTreeRegressor(
    min_samples_split=100,
    min_samples_leaf=11,
    min_weight_fraction_leaf=0,
    min_impurity_decrease=10,
    random_state=1,
)
tree.fit(X_train, y_train)
tree.score(X_test, y_test)