# Script to incorporate the custom imputer into a sklearn pipeline

In [7]:
import numpy as np
import pandas as pd

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.utils.validation import check_is_fitted

In [3]:
# Read the raw CSV, then keep every column except the first one
# (presumably a saved index column -- confirm against data.csv).
df = pd.read_csv('data.csv')
df = df.iloc[:, 1:]
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   boatClass    1000 non-null   object 
 1   boatLength   1000 non-null   int64  
 2   boatEngines  800 non-null    float64
dtypes: float64(1), int64(1), object(1)
memory usage: 23.6+ KB


### Define the imputer with your own custom logic

In [6]:
class MostCommonImputer(BaseEstimator, TransformerMixin):
    """Impute one column with the most common value seen for the same reference value.

    ``fit`` looks at every distinct value of ``reference_feature`` and remembers
    the most frequent non-null value of ``feature_with_gaps`` among the rows
    sharing it. ``transform`` writes that remembered value into the missing
    cells and adds a ``<feature_with_gaps>_na`` indicator column holding 1 for
    rows that were missing and 0 otherwise.

    Parameters
    ----------
    feature_with_gaps : str
        Name of the column whose missing values are imputed.
    reference_feature : str
        Name of the column used to decide which fill value applies to a row.
    copy : bool, default=False
        When False, ``transform`` writes into the DataFrame it is given and
        returns that same object (the historical behaviour). When True, the
        input is left untouched and a modified copy is returned.

    Attributes
    ----------
    imputation_dict_ : dict
        Created by ``fit``. Maps each reference value seen during fitting to
        its fill value, or to NaN when that reference value had no observed
        ``feature_with_gaps`` at all.
    """

    def __init__(self, feature_with_gaps, reference_feature, copy=False):
        # Only hyper-parameters are stored here; learnt state is created in
        # fit() so an unfitted instance can be recognised as such.
        self.reference_feature = reference_feature
        self.feature_with_gaps = feature_with_gaps
        self.copy = copy

    def fit(self, X, y=None):
        """Learn the most common ``feature_with_gaps`` value per reference value.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain both ``reference_feature`` and ``feature_with_gaps``.
        y : ignored
            Present for pipeline compatibility.

        Returns
        -------
        self
        """
        # Built from scratch on every call so a refit never keeps entries
        # learnt from earlier data.
        imputation_dict = {}
        for value in X[self.reference_feature].unique():
            in_group = X[self.reference_feature] == value
            counts = X.loc[in_group, self.feature_with_gaps].value_counts()
            # value_counts() drops NaN, so it is empty when the group has no
            # observed value; indexing it would raise IndexError.
            imputation_dict[value] = counts.index[0] if len(counts) else np.nan

        self.imputation_dict_ = imputation_dict
        return self

    def transform(self, X):
        """Fill the gaps using the values learnt in ``fit``.

        Rows whose reference value was not seen during ``fit``, or had no
        observed value to learn from, stay missing.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain both ``reference_feature`` and ``feature_with_gaps``.
            Modified in place unless ``copy=True`` was requested.

        Returns
        -------
        pandas.DataFrame
            ``X`` (or its copy) with the gaps filled and the indicator column
            ``<feature_with_gaps>_na`` added.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If ``fit`` has not been called yet.
        """
        check_is_fitted(self, 'imputation_dict_')

        if self.copy:
            X = X.copy()

        # Capture the mask before imputing: the indicator must describe the
        # data as it arrived, not as it looks after filling.
        missing = X[self.feature_with_gaps].isna()
        X[self.feature_with_gaps + '_na'] = np.where(missing, 1, 0)
        # The mapped Series is aligned on the index, so only the rows selected
        # by `missing` receive a value.
        X.loc[missing, self.feature_with_gaps] = X[self.reference_feature].map(self.imputation_dict_)

        return X

### Define the pipeline

In [9]:
# Single-step pipeline: fill the gaps in boatEngines using the most common
# engine count observed for the same boatLength.
pipe = Pipeline(
    steps=[
        (
            'custom_imputer',
            MostCommonImputer(
                feature_with_gaps='boatEngines',
                reference_feature='boatLength',
            ),
        ),
    ]
)
pipe

Pipeline(steps=[('custom_imputer',
                 MostCommonImputer(feature_with_gaps='boatEngines',
                                   reference_feature='boatLength'))])

In [10]:
# Fit the imputer on df and apply it in one call. MostCommonImputer.transform
# assigns into the frame it receives and returns it, so df itself is modified
# here as well -- the displayed result and df are the same data.
pipe.fit_transform(df)

Unnamed: 0,boatClass,boatLength,boatEngines,boatEngines_na
0,C,10,1.0,0
1,D,45,4.0,0
2,D,23,3.0,0
3,A,0,4.0,0
4,D,7,1.0,0
...,...,...,...,...
995,C,57,3.0,0
996,B,50,0.0,0
997,D,50,3.0,0
998,E,24,0.0,0


In [11]:
# Inspect df again after the pipeline ran: since the transform wrote into df,
# the boatEngines_na indicator column and the filled boatEngines show up here.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   boatClass       1000 non-null   object 
 1   boatLength      1000 non-null   int64  
 2   boatEngines     1000 non-null   float64
 3   boatEngines_na  1000 non-null   int32  
dtypes: float64(1), int32(1), int64(1), object(1)
memory usage: 27.5+ KB


In [12]:
# Count the rows flagged as originally missing. This should equal the number
# of nulls boatEngines had when loaded (1000 rows - 800 non-null = 200).
df['boatEngines_na'].sum()

200

### Ta da