In [1]:
import numpy as np
import pandas as pd

In [2]:
col1 = np.random.randint(10, size=100)
col2 = np.random.randint(5, size=100)
col3 = np.random.random(100)
col4 = pd.Series(['a','b','c']).sample(100, replace=True)

In [3]:
target = np.random.randint(2, size=100)

In [4]:
df = pd.DataFrame(
    {
        'col1':col1,
        'col2':col2,
        'col3':col3,
        'col4':col4,
        'target':target
    },
)

In [5]:
df.head()

Unnamed: 0,col1,col2,col3,col4,target
2,8,0,0.561116,c,1
2,3,4,0.342765,c,0
1,5,0,0.909707,b,1
1,0,0,0.184894,b,1
2,2,3,0.928616,c,0


In [6]:

df.iloc[[1,5,8,15,54,62,99],:-1] = np.nan

In [7]:
df.isna().sum()

col1      7
col2      7
col3      7
col4      7
target    0
dtype: int64

In [10]:
df.head(20)

Unnamed: 0,col1,col2,col3,col4,target
2,8.0,0.0,0.561116,c,1
2,,,,,0
1,5.0,0.0,0.909707,b,1
1,0.0,0.0,0.184894,b,1
2,2.0,3.0,0.928616,c,0
0,,,,,0
0,3.0,1.0,0.478909,a,1
2,6.0,4.0,0.550093,c,1
2,,,,,1
2,7.0,4.0,0.692329,c,1


In [11]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

In [12]:
num_transformer = Pipeline(
    steps=
    [
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', MinMaxScaler())
    ]
)

In [13]:
cat_transformer = Pipeline(
    steps=
    [
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encode', OneHotEncoder())
    ]
)

In [14]:
numerical_features = df.iloc[:,:-1].select_dtypes(include=['int64', 'float64']).columns
categorical_features = df.iloc[:,:-1].select_dtypes(include=['object']).columns

In [15]:
print(numerical_features)
print(categorical_features)

Index(['col1', 'col2', 'col3'], dtype='object')
Index(['col4'], dtype='object')


In [16]:
from sklearn.compose import ColumnTransformer

In [17]:
preprocess = ColumnTransformer(
    transformers=[
                    ('numeric', num_transformer, numerical_features),
                    ('categorical', cat_transformer, categorical_features)
    ]
)

In [18]:
from sklearn.linear_model import LogisticRegression
clf = Pipeline(
    [
        ('preprocess', preprocess),
        ('model', LogisticRegression())
    ]
)

In [19]:
from sklearn.model_selection import train_test_split

In [20]:
X = df.drop('target', axis=1)
y = df['target']

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [22]:
clf.fit(X_train, y_train)

Pipeline(steps=[('preprocess',
                 ColumnTransformer(transformers=[('numeric',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   MinMaxScaler())]),
                                                  Index(['col1', 'col2', 'col3'], dtype='object')),
                                                 ('categorical',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('encode',
                                                                   OneHotEncoder())]),
                                                  Index(['col4'], 

In [23]:
clf.score(X_test, y_test)

0.4