# Pipeline

In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the dataset from disk
data = pd.read_csv('data/melb_data.csv')

# The target is the Price column; every remaining column is a candidate predictor
y = data['Price']
X = data.drop(columns=['Price'])

# Hold out 20% of the rows for validation (fixed seed so the split is repeatable)
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X, y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

# Only a selected subset of columns is used from here on.
# Categorical: object-typed columns with relatively low cardinality
# (fewer than 10 distinct values in the training split)
categoric_cols = [
    col for col, dtype in X_train_full.dtypes.items()
    if dtype == 'object' and X_train_full[col].nunique() < 10
]
# Numeric: columns stored as int64 or float64
numeric_cols = [
    col for col, dtype in X_train_full.dtypes.items()
    if dtype in ['int64', 'float64']
]

# Restrict both splits to the selected columns, working on copies
my_cols = categoric_cols + numeric_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()

In [28]:
# Define the preprocessing steps the pipeline will run
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Numeric columns: fill missing values using the 'constant' strategy
# (no fill_value is passed, so the imputer's default constant applies)
numeric_transformer = SimpleImputer(strategy='constant')

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode with handle_unknown='ignore'
categoric_transformer = Pipeline([
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('one', OneHotEncoder(handle_unknown='ignore')),
])

# Bundle both branches, each applied to its own list of columns
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_cols),
    ('cat', categoric_transformer, categoric_cols),
])

In [30]:
# Choose the model: a random forest regressor with 100 estimators and a fixed seed
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(
    n_estimators=100,
    random_state=0,
)

In [32]:
from sklearn.metrics import mean_absolute_error
# Chain the preprocessing and the model into a single pipeline
my_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model),
])

# Fit the whole pipeline on the training split
my_pipeline.fit(X_train, y_train)

# Predict on the held-out validation split
preds = my_pipeline.predict(X_valid)

# Score the predictions with mean absolute error and print it
score = mean_absolute_error(y_valid, preds)
print("MAE", score)

MAE 160679.18917034855
