See: https://towardsdatascience.com/creating-custom-transformers-for-sklearn-pipelines-d3d51852ecc1

and https://medium.com/vickdata/a-simple-guide-to-scikit-learn-pipelines-4ac0d974bdcf

In [1]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

In [2]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression

from app.src.datasource import load_data

In [3]:
# DataFrameColumnSelector class inherits from the sklearn.base classes
# (BaseEstimator, TransformerMixin)

class DataFrameColumnSelector(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        # save the features list internally in the class
        self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        # return the dataframe with the specified features
        return X[self.columns]


In [4]:
class GroupbyImputer(TransformerMixin, BaseEstimator):
    """
    Documentation:
        ---
        Description:
            Impute numeric columns, as specified by the strategy parameter. GroupbyImputer utilizes one or
            more additional context columns as a groupby value to add more subtlety to fill_value identification.
            Imputes training data features, and stores impute values to be used on validation and unseen data.
        ---
        Parameters:
            null_column : list
                Column with nulls to impute.
            groupby_column : list
                List of one or most columns to groupby to add context to null column.
            strategy : str, default='mean'
                Imputing stategy. Accepts values 'mean', 'median' and 'most_frequent'.
            train : bool, default=True
                Tells class whether we are imputing training data or unseen data.
            train_value : dict, default=None
                Only used when train=False. Value is a dictionary containing 'feature : value' pairs.
                Dictionary is retrieved from training data pipeline using named steps. The attribute is
                called train_value_.
        ---
        Returns:
            X : array
                Dataset where each column with missing values has been imputed with a value learned from a particular
                strategy while also consider select columns as a group by variable.
    """

    def __init__(self, null_column, groupby_column, strategy="mean", train=True, train_value=None):
        self.null_column = null_column
        self.groupby_column = groupby_column
        self.strategy = strategy
        self.train = train
        self.train_value = train_value

    def fit(self, X, y=None):

        # if imputation strategy is set to "mean"
        if self.strategy == "mean":

            # grouping by groupby_column, find mean of null_column
            self.train_value = (
                X[X[self.null_column].notnull()]
                .groupby(self.groupby_column)
                .mean()[self.null_column]
            )
            # calculate overall mean of null_column
            self.overall = X[X[self.null_column].notnull()][self.null_column].mean()

        # if imputation strategy is set to "median"
        elif self.strategy == "median":
            # grouping by groupby_column, find median of null_column
            self.train_value = (
                X[X[self.null_column].notnull()]
                .groupby(self.groupby_column)
                .median()[self.null_column]
            )
            # calculate overall median of null_column
            self.overall = X[X[self.null_column].notnull()][self.null_column].median()

        # if imputation strategy is set to "most_frequent"
        elif self.strategy == "most_frequent":
            # grouping by groupby_column, find mode of null_column
            self.train_value = (
                X[X[self.null_column].notnull()]
                .groupby(self.groupby_column)[self.null_column]
                .agg(lambda X: X.value_counts().index[0])
            )
            # calculate overall mode of null_column
            self.overall = X[X[self.null_column].notnull()][self.null_column].mode()[0]

        self.train_value = self.train_value.reset_index()

        return self

    def transform(self, X):
        # impute missing values based on train_value
        if isinstance(self.groupby_column, str):

            # impute nulls with corresponding value
            X[self.null_column] = np.where(
                X[self.null_column].isnull(),
                X[self.groupby_column].map(
                    self.train_value.set_index(self.groupby_column)[self.null_column]
                ),
                X[self.null_column],
            )

            # impute any remainig nulls with overall value
            X[self.null_column] = X[self.null_column].fillna(value=self.overall)

        return X[self.null_column]

In [5]:
train_df, test_df = load_data()

In [6]:
tot_col = len(train_df.columns)

In [7]:
tot_col

12

In [8]:
y_train = train_df["Survived"].copy()
X_train = train_df.copy()
X_train.drop(["Survived"], axis=1, inplace=True)

In [9]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [10]:
# y_test = test_df["Survived"].copy()
X_test = test_df.copy()
#X_test.drop(["Survived"], axis=1, inplace=True)

In [11]:
# These are columns we have decided a priori have no influence on the model

drop_col = ['PassengerId', 'Fare', 'Name', 'Ticket', 'Cabin', 'Embarked']

In [12]:
# Although read in as numeric, Pclass is actually categorical - either 1, 2 or 3

change_type = [("Pclass", "str")]

for col, type in change_type:
    X_train[[col]] = X_train[[col]].apply(str)

In [13]:
def get_num_cat_feat_after_drop(df, drop_col=None):
    try:    
        df.drop(columns=drop_col, axis=1, inplace=True)
    except Exception as KeyError:
        print("Error: Missing columns in DataFrame")
    num_feat = list(X_train.select_dtypes(include=['int64', 'float64']).columns)
    cat_feat = list(X_train.select_dtypes(include=['object']).columns)
    return num_feat, cat_feat

In [14]:
num_feat, cat_feat = get_num_cat_feat_after_drop(X_train, drop_col)

In [15]:
assert len(num_feat) + len(cat_feat) + len(drop_col) + 1 == tot_col

In [16]:
num_feat

['Age', 'SibSp', 'Parch']

In [17]:
X_train[num_feat].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Age     714 non-null    float64
 1   SibSp   891 non-null    int64  
 2   Parch   891 non-null    int64  
dtypes: float64(1), int64(2)
memory usage: 21.0 KB


In [18]:
cat_feat

['Pclass', 'Sex']

In [19]:
X_train[cat_feat].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Pclass  891 non-null    object
 1   Sex     891 non-null    object
dtypes: object(2)
memory usage: 14.0+ KB


In [20]:
# TODO: Probably also drop Embarked as no (real) reason why it should influence survival

In [21]:
# Define pipeline for numeric features

step_impute_median = ('impute', SimpleImputer(strategy='median'))
step_scaler_std = ('scaler', StandardScaler())

# step_impute_median_grouped = ('GroupByImpute', GroupbyImputer(null_column="Age", groupby_column="SibSp"))

num_transform = Pipeline(steps=[
    step_impute_median,
    step_scaler_std])

In [22]:
# Define pipeline for categorical features

step_impute_freq = ('imputer', SimpleImputer(strategy='most_frequent'))
step_one_hot = ('one hot', OneHotEncoder(handle_unknown='ignore'))

cat_transform = Pipeline(steps=[
    step_impute_freq,
    step_one_hot])

In [23]:
# Create the overall pre-processing transformer

num_transformer = ('num', num_transform, num_feat)
cat_transformer = ('cat', cat_transform, cat_feat)

preprocess = ColumnTransformer(
    transformers=[
        num_transformer,
        cat_transformer
        ])


In [24]:
# Run the classifier

classify = LogisticRegression()

# Set the Pipeline

pipe = Pipeline(steps=[
    ('preprocess', preprocess),
    ('classify', classify)])


In [25]:
pipe.fit(X_train, y_train)


In [26]:
pipe.score(X_train, y_train)


0.7912457912457912

In [27]:
pipe.predict(X_test)

array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [32]:
numeric_transformer1 = Pipeline(steps=[
    ('columns selector', DataFrameColumnSelector(['Age', 'Pclass', 'Sex'])),
    ('group by impute', GroupbyImputer('Age', 'Pclass'))
])

In [33]:
numeric_transformer1.fit(X_train)

In [34]:
numeric_transformer1.transform(X_train)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[self.null_column] = np.where(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[self.null_column] = X[self.null_column].fillna(value=self.overall)


0      22.000000
1      38.000000
2      26.000000
3      35.000000
4      35.000000
         ...    
886    27.000000
887    19.000000
888    29.699118
889    26.000000
890    32.000000
Name: Age, Length: 891, dtype: float64