# Script to make my own class that can be incorporated into an sklearn pipeline

In [1]:
import pandas as pd
import numpy as np

from sklearn.base import BaseEstimator, TransformerMixin

In [2]:
# Load the data, skipping the first CSV column (presumably a saved row index - confirm),
# then print a summary so the missing values in each column are visible.
df = pd.read_csv('data.csv').iloc[:,1:]
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   boatClass    1000 non-null   object 
 1   boatLength   1000 non-null   int64  
 2   boatEngines  800 non-null    float64
dtypes: float64(1), int64(1), object(1)
memory usage: 23.6+ KB


### Define a class that has the standard methods required by the sklearn API in order to be used as part of a pipeline

In [3]:
# By inheriting from BaseEstimator and TransformerMixin, we get the get_params/set_params and fit_transform methods respectively
class MostCommonImputer(BaseEstimator, TransformerMixin):
    """
    Impute a feature with its most common value per group of a reference feature.

    For every distinct value of ``reference_feature`` seen during ``fit``, the
    most frequent non-missing value of ``feature_with_gaps`` is recorded.
    ``transform`` fills the missing entries of ``feature_with_gaps`` by looking
    up each row's reference value, and adds a ``<feature_with_gaps>_na``
    indicator column flagging the rows that were missing.

    Parameters
    ----------
    feature_with_gaps : str
        Name of the column whose missing values are imputed.
    reference_feature : str
        Name of the column used to inform the imputation.

    Attributes
    ----------
    imputation_dict_ : dict
        Set by ``fit``. Maps each reference value seen during fitting to the
        most common value of ``feature_with_gaps`` for that reference value,
        or to ``None`` when no non-missing value was observed for it.
    """

    def __init__(self, feature_with_gaps, reference_feature):
        """
        Store the column names; nothing is learnt until ``fit`` is called.
        """
        # Only constructor parameters are stored here so that get_params,
        # set_params and sklearn.base.clone behave as expected.
        self.reference_feature = reference_feature
        self.feature_with_gaps = feature_with_gaps

    def fit(self, X, y=None):
        """
        Learn the most common ``feature_with_gaps`` value for each reference value.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain both ``reference_feature`` and ``feature_with_gaps``.
        y : ignored
            Present only for compatibility with the sklearn API.

        Returns
        -------
        self
        """
        reference = X[self.reference_feature]

        # Build into a fresh dict so that refitting never keeps stale entries.
        imputation = {}
        for value in reference.unique():
            counts = X.loc[reference == value, self.feature_with_gaps].value_counts()
            # value_counts() drops missing values and sorts by frequency, so an
            # empty result means nothing was observed for this reference value.
            imputation[value] = counts.index[0] if not counts.empty else None

        self.imputation_dict_ = imputation
        return self

    def transform(self, X):
        """
        Fill the gaps in ``feature_with_gaps`` using the mapping learnt in ``fit``.

        ``X`` is modified in place (and also returned): a 0/1 indicator column
        named ``<feature_with_gaps>_na`` is added, and missing values are
        replaced via ``imputation_dict_``. Rows whose reference value has no
        usable entry in the mapping stay missing.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain both ``reference_feature`` and ``feature_with_gaps``.

        Returns
        -------
        pandas.DataFrame
            The same ``X`` object that was passed in.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If ``fit`` has not been called yet.
        """
        from sklearn.utils.validation import check_is_fitted

        check_is_fitted(self, 'imputation_dict_')

        # Capture the mask before filling so the indicator reflects the
        # original gaps rather than the imputed result.
        missing = X[self.feature_with_gaps].isna()
        X[self.feature_with_gaps + '_na'] = np.where(missing, 1, 0)
        X.loc[missing, self.feature_with_gaps] = X[self.reference_feature].map(self.imputation_dict_)

        return X

In [4]:
# Impute boatEngines from the most common value seen for each boatLength
imputer = MostCommonImputer(
    feature_with_gaps='boatEngines',
    reference_feature='boatLength',
)

In [5]:
# Learn the per-boatLength mapping from the data; fit returns the imputer itself
imputer.fit(df)

MostCommonImputer(feature_with_gaps='boatEngines',
                  reference_feature='boatLength')

In [6]:
# Inspect the mapping learnt by fit: reference value -> value used for imputation
imputer.imputation_dict_

{10: 3.0,
 45: 4.0,
 23: 3.0,
 0: 3.0,
 7: 1.0,
 4: 1.0,
 28: 0.0,
 43: 1.0,
 6: 3.0,
 57: 2.0,
 42: 3.0,
 18: 1.0,
 9: 0.0,
 54: 0.0,
 55: 0.0,
 53: 0.0,
 50: 4.0,
 31: 1.0,
 16: 0.0,
 58: 4.0,
 29: 0.0,
 59: 2.0,
 44: 0.0,
 40: 3.0,
 52: 0.0,
 3: 3.0,
 48: 3.0,
 47: 2.0,
 22: 4.0,
 21: 3.0,
 26: 4.0,
 15: 4.0,
 11: 4.0,
 19: 0.0,
 2: 3.0,
 36: 4.0,
 41: 2.0,
 34: 3.0,
 20: 3.0,
 17: 2.0,
 8: 3.0,
 51: 0.0,
 56: 3.0,
 46: 0.0,
 25: 4.0,
 24: 0.0,
 33: 3.0,
 5: 0.0,
 37: 3.0,
 30: 0.0,
 12: 4.0,
 1: 3.0,
 27: 2.0,
 49: 3.0,
 32: 3.0,
 35: 3.0,
 39: 0.0,
 38: 0.0,
 14: 4.0,
 13: 0.0}

In [7]:
# transform modifies df in place, so df.info() below reflects the imputed values
# and the added indicator column
imputer.transform(df)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   boatClass       1000 non-null   object 
 1   boatLength      1000 non-null   int64  
 2   boatEngines     1000 non-null   float64
 3   boatEngines_na  1000 non-null   int32  
dtypes: float64(1), int32(1), int64(1), object(1)
memory usage: 27.5+ KB


In [8]:
# Count the rows flagged as originally missing
df['boatEngines_na'].sum()

200

### So the 200 missing values have been filled in by the custom imputer, using the dictionary learnt in .fit()

### Start this again and use the .fit_transform() method that's been inherited from the base classes

In [9]:
# Reload the data from disk so the frame contains its original gaps again
# (the first CSV column is skipped, presumably a saved row index - confirm).
df = pd.read_csv('data.csv').iloc[:,1:]
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   boatClass    1000 non-null   object 
 1   boatLength   1000 non-null   int64  
 2   boatEngines  800 non-null    float64
dtypes: float64(1), int64(1), object(1)
memory usage: 23.6+ KB


In [10]:
# Fresh imputer; fit_transform learns the mapping and applies it in one call
imputer = MostCommonImputer(
    feature_with_gaps='boatEngines',
    reference_feature='boatLength',
)
imputer.fit_transform(df)

Unnamed: 0,boatClass,boatLength,boatEngines,boatEngines_na
0,C,10,1.0,0
1,D,45,4.0,0
2,D,23,3.0,0
3,A,0,4.0,0
4,D,7,1.0,0
...,...,...,...,...
995,C,57,3.0,0
996,B,50,0.0,0
997,D,50,3.0,0
998,E,24,0.0,0


In [11]:
# Check df itself: transform works in place, so the original frame now holds the imputed values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   boatClass       1000 non-null   object 
 1   boatLength      1000 non-null   int64  
 2   boatEngines     1000 non-null   float64
 3   boatEngines_na  1000 non-null   int32  
dtypes: float64(1), int32(1), int64(1), object(1)
memory usage: 27.5+ KB
