See: https://towardsdatascience.com/creating-custom-transformers-for-sklearn-pipelines-d3d51852ecc1

In [1]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)


In [2]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_is_fitted
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression


from app.src.datasource import load_data


In [3]:
# The ColumnsSelector class inherits from the sklearn.base classes
# (BaseEstimator, TransformerMixin). This makes it compatible with
# scikit-learn’s Pipelines

class DataFrameColumnSelector(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        # save the features list internally in the class
        self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        # return the dataframe with the specified features
        return X[self.columns]


In [4]:
train_df, test_df = load_data()


In [5]:

df = train_df.copy()

# this is a bit weird (potentially) using the dataset like this... have a think

df = df[['Survived', 'Pclass', 'Sex', 'Age', 'Fare', 'Embarked']]
X = df.iloc[:, 1:]
y = df.iloc[:, 0]

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.3,
                                                    stratify=y,
                                                    random_state=0)
X_train


Unnamed: 0,Pclass,Sex,Age,Fare,Embarked
231,3,male,29.0,7.7750,S
836,3,male,21.0,8.6625,S
639,3,male,,16.1000,S
389,2,female,17.0,12.0000,C
597,3,male,49.0,0.0000,S
...,...,...,...,...,...
131,3,male,20.0,7.0500,S
490,3,male,,19.9667,S
838,3,male,32.0,56.4958,S
48,3,male,,21.6792,C


In [6]:
numeric_transformer1 = Pipeline(steps=[
    ('columns selector', DataFrameColumnSelector(['Age', 'Fare'])),
])


In [7]:
# Recall that the `.fit()`` function doesn’t do anything useful here.
# This calling `.fit()`` doesn’t really do anything internally within the class. i.e. just returns the object

numeric_transformer1.fit(X_train)


In [8]:
numeric_transformer1.transform(X_train)


Unnamed: 0,Age,Fare
231,29.0,7.7750
836,21.0,8.6625
639,,16.1000
389,17.0,12.0000
597,49.0,0.0000
...,...,...
131,20.0,7.0500
490,,19.9667
838,32.0,56.4958
48,,21.6792


In [9]:
# Alternatively can just ca;; the `.fit_transform` method to do both `fit` and `transform` methods

numeric_transformer1.fit_transform(X_train, y_train)


Unnamed: 0,Age,Fare
231,29.0,7.7750
836,21.0,8.6625
639,,16.1000
389,17.0,12.0000
597,49.0,0.0000
...,...,...
131,20.0,7.0500
490,,19.9667
838,32.0,56.4958
48,,21.6792


In [10]:

numeric_transformer2 = Pipeline(steps=[
    ('columns selector', DataFrameColumnSelector(['Age', 'Fare'])),
    ('imputer', SimpleImputer(strategy='median')),
])


In [11]:
numeric_transformer2.fit_transform(X_train)


array([[29.    ,  7.775 ],
       [21.    ,  8.6625],
       [28.75  , 16.1   ],
       ...,
       [32.    , 56.4958],
       [28.75  , 21.6792],
       [22.    ,  9.    ]])

In [12]:

numeric_transformer3 = Pipeline(steps=[
    ('columns selector', DataFrameColumnSelector(['Age', 'Fare'])),
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

numeric_transformer3.fit_transform(X_train)


array([[-0.02863633, -0.47911875],
       [-0.65142052, -0.46270324],
       [-0.04809833, -0.32513665],
       ...,
       [ 0.20490775,  0.42203815],
       [-0.04809833, -0.22194182],
       [-0.5735725 , -0.45646073]])

In [13]:
numeric_transformer3.fit_transform(X_test)


array([[-0.64367628, -0.56462969],
       [-1.91470853,  0.27428807],
       [ 3.05727055, -0.56418161],
       [-0.41937648,  1.45910074],
       [-0.08292676, -0.57815521],
       [-0.79320949,  0.0328088 ],
       [-0.15769337, -0.56418161],
       [ 2.19745462, -0.40990319],
       [-0.34460987,  3.29412049],
       [ 1.52455519,  5.86911589],
       [-1.0175093 ,  0.29273441],
       [ 1.07595558, -0.5561323 ],
       [-1.54087551,  0.15019456],
       [-0.79320949, -0.4233187 ],
       [-2.13900834, -0.34975607],
       [-0.49414308, -0.56026965],
       [-0.86797609,  5.33260798],
       [-1.83994193,  0.07003686],
       [-0.15769337, -0.58296333],
       [-0.49414308, -0.4233187 ],
       [ 0.10398974,  3.65142126],
       [-0.15769337, -0.56026965],
       [ 0.10398974, -0.06780756],
       [-0.86797609, -0.4233187 ],
       [-2.13900834, -0.47340419],
       [-0.49414308, -0.4233187 ],
       [-0.34460987, -0.07451532],
       [-2.13900834, -0.22007365],
       [ 1.15072218,

In [14]:
numeric_transformer4 = Pipeline(steps=[
    ColumnTransformer(transformers=[
        ('columns selector', DataFrameColumnSelector(['Age', 'Fare'])),
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())]),
    ('classifier', LogisticRegression())
])


In [15]:

numeric_transformer4.fit_transform(X_train, y_train)


TypeError: cannot unpack non-iterable ColumnTransformer object

In [16]:
numeric_features = ['Age']

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

# handle categorical features
categorical_features = ['Sex', 'Pclass']

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('scaler', OneHotEncoder(handle_unknown='ignore'))])


In [17]:
# Create a transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])


In [18]:
# Run the classifier

classifier = LogisticRegression()

# Set the Pipeline
pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', classifier)])


In [19]:
pipe.fit(X_train, y_train)


In [20]:
pipe.predict(X_test)


array([0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0,
       1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1,
       0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 0])

In [21]:
pipe.score(X_test, y_test)


0.8171641791044776