In [19]:
import pandas as pd

data = {
    'name': ['John', 'Anna', 'Peter', 'Linda'],
    'age': [23, 36, None, 26],
    'gender': ['M', 'F', 'M', 'F'],
    'job': ['student', 'teacher', 'developer', 'nurse']
}

df = pd.DataFrame(data)

## Preprocessing Pipeline:

* Drop name feature
* Imputes ages
* Turn Gender binary / numeric
* one hot encode jobs

In [20]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# drop name
df = df.drop(["name"], axis= 1)

# impute ages

imputer = SimpleImputer(strategy='mean')
df["age"] = imputer.fit_transform(df[["age"]])

# numeric gender

gender_dict = {"M": 0, "F": 1}
df["gender"] = df["gender"].map(gender_dict)

# onehotencode jobs

encoder = OneHotEncoder()
matrix = encoder.fit_transform(df[["job"]]).toarray()

column_names = ['student', 'teacher', 'developer', 'nurse']

for i in range(len(matrix.T)):
    df[column_names[i]] = matrix.T[i]

df = df.drop(['job'], axis=1)

In [29]:
from sklearn.base import BaseEstimator, TransformerMixin

class NameDropper(BaseEstimator,TransformerMixin):
    
    def fit(self,X, y= None):
        return self
    
    def transform(self, X:pd.DataFrame):
        return X.drop(['name'], axis = 1)
    
class AgeImputer(BaseEstimator,TransformerMixin):
    
    def fit(self,X, y= None):
        return self
    
    def transform(self, X:pd.DataFrame):
        imputer = SimpleImputer(strategy='mean')
        X["age"] = imputer.fit_transform(X[["age"]])
        return X
    
class FeactureEncoder(BaseEstimator, TransformerMixin):
    
    def fit(self,X, y= None):
        return self
    
    def transform(self, X:pd.DataFrame) -> pd.DataFrame:
        encoder = OneHotEncoder()
        matrix = encoder.fit_transform(X[["job"]]).toarray()

        column_names = ['student', 'teacher', 'developer', 'nurse']

        for i in range(len(matrix.T)):
            X[column_names[i]] = matrix.T[i]

        return X.drop(['job'], axis=1)



In [30]:
data = {
    'name': ['John', 'Anna', 'Peter', 'Linda'],
    'age': [23, 36, None, 26],
    'gender': ['M', 'F', 'M', 'F'],
    'job': ['student', 'teacher', 'developer', 'nurse']
}


df2 = pd.DataFrame(data)


In [None]:
dropper = NameDropper()
imp = AgeImputer()
enc = FeactureEncoder()

Unnamed: 0,age,gender,job
0,23.0,M,student
1,36.0,F,teacher
2,,M,developer
3,26.0,F,nurse


In [35]:
from sklearn.pipeline import Pipeline

pipe = Pipeline([
    ("dropper", NameDropper()),
    ("imp", AgeImputer()),
    ("enc", FeactureEncoder())
])

pipe.fit_transform(df2)

Unnamed: 0,age,gender,student,teacher,developer,nurse
0,23.0,M,0.0,0.0,1.0,0.0
1,36.0,F,0.0,0.0,0.0,1.0
2,28.333333,M,1.0,0.0,0.0,0.0
3,26.0,F,0.0,1.0,0.0,0.0
