In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the dataset from the local CSV file.
data = pd.read_csv("data/melb_data.csv")

# Prediction target: the Price column.
y = data["Price"]

# To keep things simple, drop the target and keep only the non-object
# columns (the numeric attributes) as predictors.
melb_predictors = data.drop(columns=["Price"])
X = melb_predictors.select_dtypes(exclude=["object"])

# Split into training and validation sets (80% of the rows go to training);
# the fixed random_state makes the split reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=0,
)

In [15]:
# definimos una funcion para medir la calidad de cada aproximacion
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Helper used to compare the different missing-value approaches.
def score_dataset(X_train, X_valid, y_train, y_valid, n_estimators=10, random_state=0):
    """Fit a random forest on the training split and score it on validation.

    Parameters
    ----------
    X_train, X_valid
        Feature tables for the training and validation splits.
    y_train, y_valid
        Target values matching ``X_train`` and ``X_valid`` respectively.
    n_estimators : int, default 10
        Number of trees in the forest. The default reproduces the value the
        notebook originally hard-coded.
    random_state : int, default 0
        Seed passed to the forest so repeated calls give the same score.

    Returns
    -------
    The mean absolute error between ``y_valid`` and the model's predictions
    for ``X_valid`` (lower is better).
    """
    model = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return mean_absolute_error(y_valid, preds)
# This function lets us score whichever data preparation we think is most
# useful, since several different configurations are possible.

# Puntaje por aproximacion 1 (eliminar columnas con valores perdidos)

In [18]:
# Names of the training columns that contain at least one missing value.
cols_with_missing = X_train.columns[X_train.isnull().any()].tolist()

# Drop those columns from both the training and the validation set, so the
# two splits keep the same feature layout.
reduced_X_train = X_train.drop(columns=cols_with_missing)
reduced_X_valid = X_valid.drop(columns=cols_with_missing)

print("MAE en aproximacion 1 (sin columnas con datos faltantes): ")
print(score_dataset(reduced_X_train, reduced_X_valid, y_train, y_valid))

MAE en aproximacion 1 (sin columnas con datos faltantes): 
183550.22137772635


# Puntaje por aproximacion 2 (Imputacion)

In [21]:
from sklearn.impute import SimpleImputer

# Imputation: fit the imputer (default settings) on the training split only,
# then apply the same learned fill values to the validation split.
my_imputer = SimpleImputer()

# Wrap the imputed values in DataFrames again and restore the column names.
# NOTE: the rebuilt frames get a fresh 0..n-1 row index, not the original
# row labels of X_train / X_valid.
imputed_X_train = pd.DataFrame(
    my_imputer.fit_transform(X_train),
    columns=X_train.columns,
)
imputed_X_valid = pd.DataFrame(
    my_imputer.transform(X_valid),
    columns=X_valid.columns,
)

print("MAE en aproximaciona 2 (imputacion): ")
print(score_dataset(imputed_X_train,imputed_X_valid,y_train,y_valid))

MAE en aproximaciona 2 (imputacion): 
178166.46269899711


In [23]:
# Inspect the imputed training frame: as the last expression of the cell it
# is rendered as the cell output (the notebook shows a truncated preview).
imputed_X_train

Unnamed: 0,Rooms,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
0,1.0,5.0,3182.0,1.0,1.0,1.0,0.0,153.764119,1940.000000,-37.85984,144.98670,13240.0
1,2.0,8.0,3016.0,2.0,2.0,1.0,193.0,153.764119,1964.839866,-37.85800,144.90050,6380.0
2,3.0,12.6,3020.0,3.0,1.0,1.0,555.0,153.764119,1964.839866,-37.79880,144.82200,3755.0
3,3.0,13.0,3046.0,3.0,1.0,1.0,265.0,153.764119,1995.000000,-37.70830,144.91580,8870.0
4,3.0,13.3,3020.0,3.0,1.0,2.0,673.0,673.000000,1970.000000,-37.76230,144.82720,4217.0
...,...,...,...,...,...,...,...,...,...,...,...,...
10859,3.0,5.2,3056.0,3.0,1.0,2.0,212.0,153.764119,1964.839866,-37.77695,144.95785,11918.0
10860,3.0,10.5,3081.0,3.0,1.0,1.0,748.0,101.000000,1950.000000,-37.74160,145.04810,2947.0
10861,4.0,6.7,3058.0,4.0,2.0,2.0,441.0,255.000000,2002.000000,-37.73572,144.97256,11204.0
10862,3.0,12.0,3073.0,3.0,1.0,1.0,606.0,153.764119,1964.839866,-37.72057,145.02615,21650.0


# Puntaje por aproximacion 3 (una extension de imputacion: marcar los valores perdidos)

In [25]:
# Work on copies so the original splits are left untouched.
X_train_plus = X_train.copy()
X_valid_plus = X_valid.copy()

# For every column with gaps, add a boolean companion column recording
# which rows were missing before the imputer fills them in.
for col in cols_with_missing:
    X_train_plus[f"{col}_was_missing"] = X_train_plus[col].isna()
    X_valid_plus[f"{col}_was_missing"] = X_valid_plus[col].isna()

# Impute: fit on the training split, reuse the fitted imputer on validation.
my_imputer = SimpleImputer()

# Rebuild DataFrames from the imputed values and put the column names back.
imputed_X_train_plus = pd.DataFrame(
    my_imputer.fit_transform(X_train_plus),
    columns=X_train_plus.columns,
)
imputed_X_valid_plus = pd.DataFrame(
    my_imputer.transform(X_valid_plus),
    columns=X_valid_plus.columns,
)

print("MAE por aprocimacion 3 (una extencion de imputacion): ")
print(score_dataset(imputed_X_train_plus, imputed_X_valid_plus, y_train, y_valid))

MAE por aprocimacion 3 (una extencion de imputacion): 
178927.503183954


In [20]:
# Take fresh copies of the original splits (no imputation applied here).
X_train_plus = X_train.copy()
X_valid_plus = X_valid.copy()

# Add one boolean "<column>_was_missing" flag per column that has gaps,
# marking the rows whose value is missing.
for col in cols_with_missing:
    X_train_plus[f"{col}_was_missing"] = X_train_plus[col].isna()
    X_valid_plus[f"{col}_was_missing"] = X_valid_plus[col].isna()

In [24]:
# Preview the first rows of the validation copy, including the added
# "<column>_was_missing" indicator columns.
X_valid_plus.head()

Unnamed: 0,Rooms,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount,Car_was_missing,BuildingArea_was_missing,YearBuilt_was_missing
8505,4,8.0,3016.0,4.0,2.0,2.0,450.0,190.0,1910.0,-37.861,144.8985,6380.0,False,False,False
5523,2,6.6,3011.0,2.0,1.0,0.0,172.0,81.0,1900.0,-37.81,144.8896,2417.0,False,False,False
12852,3,10.5,3020.0,3.0,1.0,1.0,581.0,,,-37.7674,144.82421,4217.0,False,True,True
4818,3,4.5,3181.0,2.0,2.0,1.0,128.0,134.0,2000.0,-37.8526,145.0071,7717.0,False,False,False
12812,3,8.5,3044.0,3.0,2.0,2.0,480.0,,,-37.72523,144.94567,7485.0,False,True,True
