See: https://towardsdatascience.com/creating-custom-transformers-for-sklearn-pipelines-d3d51852ecc1

In [1]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)


In [27]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_is_fitted
import pandas as pd
import numpy as np
#from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn import set_config


from app.src.datasource import load_data


In [3]:
set_config("diagram")

In [4]:
# The ColumnsSelector class inherits from the sklearn.base classes
# (BaseEstimator, TransformerMixin). This makes it compatible with
# scikit-learn’s Pipelines

class DataFrameColumnSelector(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        # save the features list internally in the class
        self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        # return the dataframe with the specified features
        return X[self.columns]


In [5]:
train_df, test_df = load_data()


In [6]:
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [7]:
X_train = train_df.copy()
y_train = X_train["Survived"]
X_train.drop("Survived", axis="columns")    # this is our target / variable to  be predicted


# Testing data

X_test = test_df.copy()
# X_test does not contain our target / variable to  be predicted

# In this case the train/test split has already be done for us (Kaggle)


In [8]:
num_column = ["Age", "Pclass", "Parch", "SibSp"] 

In [9]:
cat_column = ["Embarked", "Sex"]

In [10]:
num_transform = Pipeline(steps=[
    ('columns selector', DataFrameColumnSelector(num_column)),
])


In [11]:
# Recall that the `.fit()`` function doesn’t do anything useful here.
# This calling `.fit()`` doesn’t really do anything internally within the class. i.e. just returns the object

num_transform.fit(X_train)


In [12]:
num_transform.transform(X_train)


Unnamed: 0,Age,Pclass,Parch,SibSp
0,22.0,3,0,1
1,38.0,1,0,1
2,26.0,3,0,0
3,35.0,1,0,1
4,35.0,3,0,0
...,...,...,...,...
886,27.0,2,0,0
887,19.0,1,0,0
888,,3,2,1
889,26.0,1,0,0


In [13]:
# Alternatively can just ca;; the `.fit_transform` method to do both `fit` and `transform` methods

num_transform.fit_transform(X_train, y_train)


Unnamed: 0,Age,Pclass,Parch,SibSp
0,22.0,3,0,1
1,38.0,1,0,1
2,26.0,3,0,0
3,35.0,1,0,1
4,35.0,3,0,0
...,...,...,...,...
886,27.0,2,0,0
887,19.0,1,0,0
888,,3,2,1
889,26.0,1,0,0


In [14]:
num_transform

In [15]:

num_transform = Pipeline(steps=[
    ('columns selector', DataFrameColumnSelector(num_column)),
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])


In [16]:
num_transform.fit_transform(X_train)


array([[-0.56573646,  0.82737724, -0.47367361,  0.43279337],
       [ 0.66386103, -1.56610693, -0.47367361,  0.43279337],
       [-0.25833709,  0.82737724, -0.47367361, -0.4745452 ],
       ...,
       [-0.1046374 ,  0.82737724,  2.00893337,  0.43279337],
       [-0.25833709, -1.56610693, -0.47367361, -0.4745452 ],
       [ 0.20276197,  0.82737724, -0.47367361, -0.4745452 ]])

In [17]:
num_transform = Pipeline(steps=[
    ColumnTransformer(transformers=[
        ('columns selector', DataFrameColumnSelector(num_column)),
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())]),
        ('classifier', LogisticRegression())
])


In [18]:
numeric_features = num_column

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

# handle categorical features
categorical_features = cat_column

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('scaler', OneHotEncoder(handle_unknown='ignore'))])


In [19]:
# Create a transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])


In [20]:
# Run the classifier

classifier = LogisticRegression()

# Set the Pipeline
pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', classifier)])


In [21]:
pipe.fit(X_train, y_train)


In [22]:
pipe.score(X_train, y_train)

0.7968574635241302

In [23]:
pipe.predict(X_test)


array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1,
       0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [24]:

from app.src.openerror.ImputeByGroup import ImputeNumericalByGroup, ImputeCategoricalByGroup
from app.src.openerror.PandasColumnTransformer import PandasColumnTransformer

In [29]:
pl = make_pipeline(
    ImputeNumericalByGroup(target_col="Age", groupby_col=["Pclass", "Embarked"], return_df=True),
    ImputeCategoricalByGroup(target_col="Embarked", groupby_col=["Pclass"], return_df=True, copy=False)
)

In [32]:
pipe = Pipeline(steps=[
    ImputeNumericalByGroup(target_col="Age", groupby_col=["Pclass", "Sex"], return_df=True, copy=False)
])

In [36]:
pipe.fit(X_train)

TypeError: cannot unpack non-iterable ImputeNumericalByGroup object