[Reference](https://towardsdatascience.com/using-pipelines-in-sci-kit-learn-516aa431dcc5)

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_selector as selector
from sklearn.compose import ColumnTransformer
from sklearn. preprocessing import MinMaxScaler
from sklearn. preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor

import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV

# Column Transformers

In [3]:
def make_coltrans(categorical_features=('CAT_FIELD_ONE', 'CAT_FIELD_TWO')):
    '''Build the (unfitted) column transformer used as the preprocessing step.

    Two transformers are combined:

    * ``'num'`` -- ``MinMaxScaler`` applied to every column whose dtype is
      not ``object``.
    * ``'cat'`` -- ``OneHotEncoder`` (integer output, unknown categories
      ignored) applied to the columns named in ``categorical_features``.

    Columns matched by neither transformer are dropped.

    Parameters
    ----------
    categorical_features : str or iterable of str, optional
        Names of the columns to one-hot encode.  The default keeps the
        original placeholder names ``'CAT_FIELD_ONE'`` and
        ``'CAT_FIELD_TWO'``; pass the real column names of your dataset,
        otherwise fitting fails on a frame that lacks those columns.

    Returns
    -------
    ColumnTransformer
        A new, unfitted transformer.

    Notes
    -----
    The numeric selector only excludes ``object`` columns.  Columns stored
    with another non-numeric dtype (for example pandas ``category``) are
    therefore also routed to ``MinMaxScaler``; convert or drop them first.
    '''
    # A bare string would make ColumnTransformer hand the encoder a 1-D
    # column, so always normalise to a list of column names.
    if isinstance(categorical_features, str):
        cat_columns = [categorical_features]
    else:
        cat_columns = list(categorical_features)

    column_trans = ColumnTransformer(
        transformers=[
            ('num', MinMaxScaler(), selector(dtype_exclude="object")),
            ('cat', OneHotEncoder(dtype='int', handle_unknown='ignore'), cat_columns),
        ],
        remainder='drop',
    )

    return column_trans

# The Pipeline

In [4]:
def create_pipe(clf):
    '''Wrap an estimator in the shared preprocessing pipeline.

    ``clf`` must be an already-constructed estimator instance with all of
    the parameters it needs set.  It becomes the final ``'clf'`` step and
    is preceded by a ``'prep'`` step holding a new column transformer
    obtained from ``make_coltrans()``.
    '''
    steps = [
        ('prep', make_coltrans()),  # identical preprocessing for every pipeline
        ('clf', clf),
    ]
    return Pipeline(steps)

# Creating and Fitting the Model

In [6]:
# Seed NumPy's global random state before anything else runs.
np.random.seed(0)

# Load data from https://www.openml.org/d/40945
# (X and y could instead be split out of the dataset's `frame` attribute by
# dropping / selecting the 'survived' column.)
X, y = fetch_openml("titanic", version=1, as_frame=True, return_X_y=True)

# Categorical columns: one-hot encode, ignoring categories unseen during fit.
categorical_features = ["embarked", "sex", "pclass"]
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

# Numeric columns: fill missing values with the median, then standardise.
numeric_features = ["age", "fare"]
numeric_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

preprocessor = ColumnTransformer(
    [
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# Full prediction pipeline: preprocessing followed by the classifier.
clf = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("classifier", LogisticRegression()),
    ]
)

# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

clf.fit(X_train, y_train)
print("model score: %.3f" % clf.score(X_test, y_test))


model score: 0.790


In [8]:
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier

# Create the classifier instance and build the pipeline.
clf = RandomForestClassifier(random_state=42, class_weight='balanced')
pipeline = create_pipe(clf)

# create_pipe() wires in the generic transformer from make_coltrans(), whose
# placeholder columns ('CAT_FIELD_ONE', 'CAT_FIELD_TWO') do not exist in the
# Titanic frame, so fitting it as-is raises a missing-column ValueError.
# Replace the 'prep' step with an unfitted copy of the Titanic-specific
# `preprocessor` defined in the previous cell (clone() keeps the
# logistic-regression pipeline's fitted preprocessor untouched).
pipeline.set_params(prep=clone(preprocessor))

# Fit the model to the training data
pipeline.fit(X_train, y_train)