In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the raw dataset from a path relative to the notebook.
data = pd.read_csv('./csv/melb_data.csv')

# The Price column is the prediction target; every other column is a
# candidate feature.
y = data['Price']
X = data.drop(columns=['Price'])

# Hold out 20% of the rows for evaluation. random_state pins the shuffle so
# the split (and therefore the reported MAE) is identical on every
# Restart & Run All; without it each run evaluates on a different subset.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)

In [3]:
# Categorical features: object-dtype columns with fewer than 10 distinct
# values in the training split.
categorical_columns = [
    name
    for name, dtype in X_train.dtypes.items()
    if dtype == 'object' and X_train[name].nunique() < 10
]

# Numerical features: columns stored as int64 or float64.
numerical_dtype = ['int64', 'float64']
numerical_columns = [
    name
    for name, dtype in X_train.dtypes.items()
    if dtype in numerical_dtype
]

# Restrict both splits to the selected columns (categorical first, then
# numerical) and copy so later changes work on independent frames.
total_columns = categorical_columns + numerical_columns
X_train = X_train.loc[:, total_columns].copy()
X_test = X_test.loc[:, total_columns].copy()

Preprocessing

In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Numerical columns: replace missing entries with a constant. No fill_value
# is passed, so SimpleImputer uses its default constant (0 for numeric data).
numerical_transformer = SimpleImputer(strategy='constant')

# Categorical columns: fill gaps with the most frequent value of each column,
# then one-hot encode. handle_unknown='ignore' keeps transform() from raising
# on categories that were not present when the encoder was fitted.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group to its own transformer.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_columns),
    ('cate', categorical_transformer, categorical_columns),
])

Model

In [7]:
from sklearn.ensemble import RandomForestRegressor

# Random forest regressor with 100 trees; the fixed seed makes training
# repeatable from run to run.
model = RandomForestRegressor(
    random_state=0,
    n_estimators=100,
)

Pipelining

Bundle preprocessing and modeling into a single pipeline

In [8]:
# Chain the preprocessor and the regressor so one fit/predict call runs
# both stages in order.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model),
])

# fit() returns the fitted pipeline itself, so prediction on the held-out
# features can be chained directly onto it.
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

In [9]:
from sklearn.metrics import mean_absolute_error
# Mean absolute error between the held-out targets and the predictions.
mae = mean_absolute_error(y_test, y_pred)
print('MAE:', mae)

MAE: 166599.1471162073
