# Variables Categóricas

In [12]:
# importamos las librerias
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the data
data = pd.read_csv("data/melb_data.csv")

# Separate the target (Price) from the predictors
y = data.Price
X = data.drop(columns=["Price"])

# Split into training and validation sets (80/20, fixed seed so the split is reproducible)
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)

# Drop every column that has at least one missing value in the training set.
# The same columns are removed from the validation set so both keep the same schema.
# Reassign the result instead of using inplace=True, so the frames returned by
# train_test_split are never mutated in place.
cols_with_missing = [col for col in X_train_full.columns if X_train_full[col].isnull().any()]
X_train_full = X_train_full.drop(columns=cols_with_missing)
X_valid_full = X_valid_full.drop(columns=cols_with_missing)

# Keep only the categorical columns with relatively low cardinality.
# Cardinality is the number of unique values a column contains.
low_cardinality_columns = [
    cname
    for cname in X_train_full.columns
    if X_train_full[cname].nunique() < 10 and X_train_full[cname].dtype == "object"
]

# Select the numeric columns
numeric_col = [
    cname
    for cname in X_train_full.columns
    if X_train_full[cname].dtype in ['int64', 'float64']
]

# Work on copies restricted to the selected columns; these are the frames used from here on
my_col = low_cardinality_columns + numeric_col
X_train = X_train_full[my_col].copy()
X_valid = X_valid_full[my_col].copy()

In [14]:
X_train.head()

Unnamed: 0,Type,Method,Regionname,Rooms,Distance,Postcode,Bedroom2,Bathroom,Landsize,Lattitude,Longtitude,Propertycount
12167,u,S,Southern Metropolitan,1,5.0,3182.0,1.0,1.0,0.0,-37.85984,144.9867,13240.0
6524,h,SA,Western Metropolitan,2,8.0,3016.0,2.0,2.0,193.0,-37.858,144.9005,6380.0
8413,h,S,Western Metropolitan,3,12.6,3020.0,3.0,1.0,555.0,-37.7988,144.822,3755.0
2919,u,SP,Northern Metropolitan,3,13.0,3046.0,3.0,1.0,265.0,-37.7083,144.9158,8870.0
6043,h,S,Western Metropolitan,3,13.3,3020.0,3.0,1.0,673.0,-37.7623,144.8272,4217.0


In [38]:
# Build the list of categorical columns: a boolean mask over the dtypes,
# then keep the index labels (column names) where the mask is True.
s = (X_train.dtypes == "object")
object_cols = list(s[s].index)

print("columnas categoricas")
# Print the list defined above; the previous name `object_list` is not defined
# anywhere in this notebook and fails with NameError on a fresh kernel.
print(object_cols)

columnas categoricas
['Type', 'Method', 'Regionname']


In [34]:
# definimos una funcion para medir la calidad de cada aproximacion
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Helper used to compare the different approaches
def score_dataset(X_train,X_valid,y_train,y_valid):
    """Fit a random forest on the training data and return its validation MAE.

    Parameters
    ----------
    X_train, y_train : training features and target passed to ``fit``.
    X_valid, y_valid : validation features used for prediction and the
        target the predictions are compared against.

    Returns
    -------
    The mean absolute error between ``y_valid`` and the model's predictions.
    """
    # Fixed random_state so repeated calls on the same data give the same score
    forest = RandomForestRegressor(n_estimators = 100, random_state = 0)
    forest.fit(X_train, y_train)
    return mean_absolute_error(y_valid, forest.predict(X_valid))

In [36]:
# Approach 1: drop every categorical (object-dtype) column and score what is left
drop_X_train, drop_X_valid = (
    frame.select_dtypes(exclude = ["object"]) for frame in (X_train, X_valid)
)

print('MAE de aproximacion 1 (eliminamos columnas categoricas): ')
print(score_dataset(drop_X_train, drop_X_valid, y_train, y_valid))

MAE de aproximacion 1 (eliminamos columnas categoricas): 
175703.48185157913


In [48]:
# Approach 2: ordinal encoding
from sklearn.preprocessing import OrdinalEncoder

# Work on copies so X_train / X_valid are left unchanged
label_X_train = X_train.copy()
label_X_valid = X_valid.copy()

# Fit the encoder on the training categories only and reuse it on validation.
# A category that appears only in the validation set is encoded as -1 instead of
# raising an error, in line with the handle_unknown='ignore' policy of the
# one-hot encoder used in approach 3.
ordinal_encoder = OrdinalEncoder(handle_unknown = "use_encoded_value", unknown_value = -1)
label_X_train[object_cols] = ordinal_encoder.fit_transform(X_train[object_cols])
label_X_valid[object_cols] = ordinal_encoder.transform(X_valid[object_cols])

print("MAE from aproach 2 (ordinal encoding): ")
print(score_dataset(label_X_train,label_X_valid,y_train,y_valid))

MAE from aproach 2 (ordinal encoding): 
165936.40548390493


In [54]:
# Approach 3: score the one-hot encoding (OneHotEncoder)
from sklearn.preprocessing import OneHotEncoder

# Fit the encoder on the training categories and reuse it on validation.
# The encoder returns a plain array, so the original row index is passed back
# in when wrapping the result in a DataFrame.
OH_encoder = OneHotEncoder(handle_unknown = 'ignore', sparse_output = False)
OH_cols_train = pd.DataFrame(
    OH_encoder.fit_transform(X_train[object_cols]),
    index = X_train.index,
)
OH_cols_valid = pd.DataFrame(
    OH_encoder.transform(X_valid[object_cols]),
    index = X_valid.index,
)

# Remove the categorical columns; the one-hot columns replace them
num_X_train = X_train.drop(columns = object_cols)
num_X_valid = X_valid.drop(columns = object_cols)

# Put the numeric columns and the one-hot columns side by side
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis = "columns")
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis = "columns")

# The one-hot columns are labelled with integers; make every column name a string
OH_X_train.columns = OH_X_train.columns.astype(str)
OH_X_valid.columns = OH_X_valid.columns.astype(str)

print("MAE de la aproximacion 3 (OneHotEncoder): ")
print(score_dataset(OH_X_train, OH_X_valid, y_train, y_valid))

MAE de la aproximacion 3 (OneHotEncoder): 
166089.4893009678
