See: https://towardsdatascience.com/creating-custom-transformers-for-sklearn-pipelines-d3d51852ecc1

and https://medium.com/vickdata/a-simple-guide-to-scikit-learn-pipelines-4ac0d974bdcf

In [1]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

In [96]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.utils.validation import check_is_fitted
from typing import List

from app.src.datasource import load_data

In [3]:
# DataFrameColumnSelector class inherits from the sklearn.base classes
# (BaseEstimator, TransformerMixin)

class DataFrameColumnSelector(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        # save the features list internally in the class
        self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        # return the dataframe with the specified features
        return X[self.columns]


In [123]:
# Ref: https://towardsdatascience.com/coding-a-custom-imputer-in-scikit-learn-31bd68e541de

class GroupbyImputer(BaseEstimator, TransformerMixin):
    '''
    Class used for imputing missing values in a pd.DataFrame using either mean or median of a group.
    
    Parameters
    ----------    
    group_cols : list
        List of columns used for calculating the aggregated value 
    target : str
        The name of the column to impute
    metric : str
        The metric to be used for replacement, can be one of ['mean', 'median']
    Returns
    -------
    X : array-like
        The array with imputed values in the target column
    '''
    def __init__(self, group_cols, target, metric='mean'):
        
        assert metric in ['mean', 'median']  # 'Unrecognized value for metric, should be mean/median'
        # print(type(group_cols)) # == List      # 'group_cols should be a list of columns'
        # assert type(target) == str           # 'target should be a string'
        
        self.group_cols = group_cols
        self.target = target
        self.metric = metric
    
    def fit(self, X, y=None):
        
        assert pd.isnull(X[self.group_cols]).any(axis=None) == False, 'There are missing values in group_cols'

        if len(self.group_cols) == 0:
            impute_map = X.agg(self.metric)[self.target]
        else:
            impute_map = X.groupby(self.group_cols)[self.target].agg(self.metric) \
                                                            .reset_index(drop=False)
        print(impute_map)
        self.impute_map_ = impute_map
        return self 
    
    def transform(self, X, y=None):
        check_is_fitted(self, 'impute_map_')
        X = X.copy()  
        if len(self.group_cols) == 0:
            X.loc[:, self.target] = X.loc[:, self.target].fillna(self.impute_map_)
        else:
            for index, row in self.impute_map_.iterrows():
                ind = (X[self.group_cols] == row[self.group_cols]).all(axis=1)
                X.loc[ind, self.target] = X.loc[ind, self.target].fillna(row[self.target])
        return X.values

In [68]:
train_df, test_df = load_data()

In [69]:
tot_col = len(train_df.columns)

In [70]:
tot_col

12

In [71]:
y_train = train_df["Survived"].copy()
X_train = train_df.copy()
X_train.drop(["Survived"], axis=1, inplace=True)

In [72]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [73]:
# y_test = test_df["Survived"].copy()
X_test = test_df.copy()
#X_test.drop(["Survived"], axis=1, inplace=True)

In [74]:
# These are columns we have decided a priori have no influence on the model

drop_col = ['PassengerId', 'Fare', 'Name', 'Ticket', 'Cabin', 'Embarked']

In [75]:
# Although read in as numeric, Pclass is actually categorical - either 1, 2 or 3

change_type = [("Pclass", "str")]

for col, type in change_type:
    col_name = f"{col}_{type}"
    X_train[[col_name]] = X_train[[col]].apply(str)

In [76]:
def get_num_cat_feat_after_drop(df, drop_col=None):
    try:    
        df.drop(columns=drop_col, axis=1, inplace=True)
    except Exception as KeyError:
        print("Error: Missing columns in DataFrame")
    num_feat = list(X_train.select_dtypes(include=['int64', 'float64']).columns)
    cat_feat = list(X_train.select_dtypes(include=['object']).columns)
    return num_feat, cat_feat

In [77]:
num_feat, cat_feat = get_num_cat_feat_after_drop(X_train, drop_col)

In [78]:
assert len(num_feat) + len(cat_feat) + len(drop_col) + 1 == tot_col

AssertionError: 

In [79]:
num_feat

['Pclass', 'Age', 'SibSp', 'Parch']

In [80]:
X_train[num_feat].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Pclass  891 non-null    int64  
 1   Age     714 non-null    float64
 2   SibSp   891 non-null    int64  
 3   Parch   891 non-null    int64  
dtypes: float64(1), int64(3)
memory usage: 28.0 KB


In [81]:
cat_feat

['Sex', 'Pclass_str']

In [82]:
X_train[cat_feat].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Sex         891 non-null    object
 1   Pclass_str  891 non-null    object
dtypes: object(2)
memory usage: 14.0+ KB


In [83]:
# TODO: Probably also drop Embarked as no (real) reason why it should influence survival

In [131]:
impute_groupby_col = ["Pclass", "Sex"]

In [132]:
# Define pipeline for numeric features

step_impute_median = ('impute', SimpleImputer(strategy='median'))
step_scaler_std = ('scaler', StandardScaler())

step_impute_grouped_median = ('GroupbyImpute', GroupbyImputer(group_cols=impute_groupby_col, target="Age", metric="median"))

num_transform = Pipeline(steps=[
    step_impute_grouped_median,
    step_scaler_std])

In [133]:
# Define pipeline for categorical features

step_impute_freq = ('imputer', SimpleImputer(strategy='most_frequent'))
step_one_hot = ('one hot', OneHotEncoder(handle_unknown='ignore'))

cat_transform = Pipeline(steps=[
    step_impute_freq,
    step_one_hot])

In [137]:
num_feat

['Pclass', 'Age', 'SibSp', 'Parch']

In [138]:
impute_groupby_col

['Pclass', 'Sex']

In [140]:
list(set(num_feat + impute_groupby_col))

['Age', 'Parch', 'Sex', 'Pclass', 'SibSp']

In [141]:
# Create the overall pre-processing transformer

num_transformer = ('num', num_transform, list(set(num_feat + impute_groupby_col)))
cat_transformer = ('cat', cat_transform, cat_feat)

preprocess = ColumnTransformer(
    transformers=[
        num_transformer,
        cat_transformer
        ])


In [142]:
# Run the classifier

classify = LogisticRegression()

# Set the Pipeline

pipe = Pipeline(steps=[
    ('preprocess', preprocess),
    ('classify', classify)])


In [143]:
pipe.fit(X_train, y_train)


   Pclass     Sex   Age
0       1  female  35.0
1       1    male  40.0
2       2  female  28.0
3       2    male  30.0
4       3  female  21.5
5       3    male  25.0


ValueError: could not convert string to float: 'male'

In [130]:
pipe.score(X_train, y_train)


0.7890011223344556

In [27]:
pipe.predict(X_test)

array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [32]:
numeric_transformer1 = Pipeline(steps=[
    ('columns selector', DataFrameColumnSelector(['Age', 'Pclass', 'Sex'])),
    ('group by impute', GroupbyImputer('Age', 'Pclass'))
])

In [33]:
numeric_transformer1.fit(X_train)

In [34]:
numeric_transformer1.transform(X_train)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[self.null_column] = np.where(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[self.null_column] = X[self.null_column].fillna(value=self.overall)


0      22.000000
1      38.000000
2      26.000000
3      35.000000
4      35.000000
         ...    
886    27.000000
887    19.000000
888    29.699118
889    26.000000
890    32.000000
Name: Age, Length: 891, dtype: float64