In [1]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [2]:
from sklearn.base import TransformerMixin

class FillTransformer(TransformerMixin, BaseEstimator):
    """Pipeline step that replaces every missing value with 0.

    Inheriting from ``BaseEstimator`` supplies ``get_params``/``set_params``,
    so the step can be cloned and introspected like any other estimator
    (needed by ``Pipeline.get_params``, cross-validation and grid search).

    Input may be a ``pandas.DataFrame`` or anything the ``DataFrame``
    constructor accepts (e.g. a list of rows); output is always a DataFrame.
    """

    def __init__(self):
        super().__init__()
        # Column labels captured by fit(); None until fit() sees a DataFrame.
        self._columns = None

    def fit(self, X, y=None):
        """Remember the column labels of ``X`` so transform() can reuse them.

        Parameters
        ----------
        X : DataFrame or array-like
            Training features. Only a DataFrame carries labels; for any other
            input no labels are stored.
        y : ignored
            Accepted only so the step fits the Pipeline calling convention.

        Returns
        -------
        self
        """
        # Guard the attribute access: lists / ndarrays have no `.columns`.
        self._columns = X.columns if isinstance(X, pd.DataFrame) else None
        return self

    def transform(self, X):
        """Return ``X`` as a DataFrame with every missing value set to 0.

        Non-DataFrame input is wrapped in a DataFrame first, labelled with the
        columns seen during fit() (default integer labels if none were stored).
        """
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X, columns=self._columns)
        return X.fillna(0)

In [3]:
# Toy dataset: one categorical feature, one numeric feature containing a
# missing value, and a binary target.
df = pd.DataFrame(
    {
        'Color': ['Azul', 'Verde', 'Amarillo', 'Morado', 'Azul'],
        'Valor': [1, 2, np.nan, 4, 4],
        'target': [0, 0, 1, 1, 1],
    }
)

df

Unnamed: 0,Color,Valor,target
0,Azul,1.0,0
1,Verde,2.0,0
2,Amarillo,,1
3,Morado,4.0,1
4,Azul,4.0,1


In [4]:
# Route each input column, addressed by position, to its own transformer.
# Every entry is (name, transformer, column indices).
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), [1]),  # column 1: standardize the numeric feature
    ('cat', OneHotEncoder(handle_unknown='ignore'), [0]),  # column 0: one-hot encode, ignoring unknown categories
])

In [5]:
# End-to-end model: the steps run in the order listed, each one feeding the next.
# NOTE(review): the final step is registered under the name 'classifier' but it
# is a LinearRegression, so predict() returns continuous values rather than
# class labels -- confirm this is intended.
pl = Pipeline(
    steps=[
        ('fillna', FillTransformer()),  # replace missing values with 0
        ('preprocess', preprocessor),  # per-column scaling / one-hot encoding
        ('classifier', LinearRegression()),  # fit and predict on the transformed features
    ]
)

In [6]:
# Train on every column except the label. Missing values are deliberately left
# in place: the pipeline's own 'fillna' step zero-fills them, so the training
# data goes through exactly the same path as rows later passed to predict().
pl.fit(
    df.drop(columns='target'),  # X
    df['target'],  # y
)

Pipeline(steps=[('fillna', <__main__.FillTransformer object at 0x7ff5992ad5c0>),
                ('preprocess',
                 ColumnTransformer(transformers=[('num', StandardScaler(), [1]),
                                                 ('cat',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  [0])])),
                ('classifier', LinearRegression())])

In [7]:
# Score two unseen rows given as plain [Color, Valor] lists.
pl.predict([
    ['Amarillo', 1],
    ['Verde', np.nan],  # the missing Valor is zero-filled by the 'fillna' step
])

array([ 1.33333333, -0.66666667])