Learning Pipelines:

A pipeline bundles the preprocessing and modeling steps, which makes the code shorter, simpler, and easier to productionize. It applies imputation, encoding, and the other preprocessing steps itself, so the code is less redundant and models are faster to build.

In [107]:
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Read the dataset from CSV.
# NOTE(review): this is an absolute path on one specific machine; the cell
# will not run elsewhere without editing it.
url = r'C:\Users\12368\OneDrive\Desktop\DataScience\melb_data.csv'
data = pd.read_csv(url)

# 'Price' is the prediction target; every other column is a candidate feature.
X = data.drop(columns=['Price'])
y = data['Price']

# 80/20 train/test split with a fixed seed so the split is repeatable.
X_train_full, X_test_full, y_train, y_test = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)

# Numeric features: columns stored as int64 or float64.
numerical_cols = [
    name for name, col_dtype in X_train_full.dtypes.items()
    if col_dtype in ['int64', 'float64']
]

# Categorical features: object-dtype columns with fewer than 10 distinct
# values in the training split.
low_cardinality_cols = [
    name for name in X_train_full.columns
    if X_train_full[name].dtype == 'object' and X_train_full[name].nunique() < 10
]

In [108]:
# Keep only the selected numeric and low-cardinality categorical columns,
# copying so the full frames are left untouched.
my_cols = [*numerical_cols, *low_cardinality_cols]
X_train, X_test = (frame[my_cols].copy() for frame in (X_train_full, X_test_full))


In [109]:
# Numeric columns: fill missing values with a constant. No fill_value is
# given, so SimpleImputer's default for the 'constant' strategy applies.
numerical_transformer = SimpleImputer(strategy='constant')

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown='ignore' lets categories that were not
# seen during fit pass through transform without raising.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Route each column group to its transformer. The categorical group is
# `low_cardinality_cols`, the list built during column selection; the
# previously used name `categorical_cols` is not defined in this notebook
# and raised NameError on a fresh kernel.
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, low_cardinality_cols)
])

# Random forest with a fixed seed so repeated runs give the same model.
model = RandomForestRegressor(n_estimators=100, random_state=0)

# Preprocessing and model chained into one estimator.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model),
])

In [110]:
# Fit the preprocessing steps and the model on the training split, then
# predict for the held-out split; the pipeline applies its fitted
# preprocessing to X_test before the model sees it.
preds = pipeline.fit(X_train, y_train).predict(X_test)

# Mean absolute error between the held-out targets and the predictions.
score = mean_absolute_error(y_test, preds)
print('MAE:', score)

MAE: 160679.18917034855
