In [96]:
import pandas as pd
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.compose import ColumnTransformer,make_column_selector
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif

In [97]:
# Load the raw dataset; the relative path is resolved against the kernel's
# current working directory.
df = pd.read_csv(filepath_or_buffer="sample_dataset.csv")

In [98]:
# Features are every column except the last one; the target is the last column.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

Numerical Variables
- Fill missing values with the median
- Standardization

Categorical Variables
- Fill missing values with the most frequent value
- One-hot encoding

In [99]:
# Column-wise preprocessing, split by dtype:
#   * 'numerical'   - every column whose dtype is NOT object:
#                     median imputation, then standardization.
#   * 'categorical' - every object-dtype column:
#                     most-frequent imputation, then dense one-hot encoding.
# The names ('numerical', 'categorical', 'imputer', 'scaler', 'onehot') form
# the nested parameter paths used with set_params, so keep them stable.
# NOTE(review): dtype_exclude='object' also sends bool/datetime/category
# columns to the numeric branch - confirm the dataset contains none.
transformer = ColumnTransformer([
    ('numerical', Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ]), make_column_selector(dtype_exclude='object')),

    ('categorical', Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        # handle_unknown='ignore': a category that was not seen during fit is
        # encoded as all zeros instead of raising an error in transform().
        ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))
    ]), make_column_selector(dtype_include='object'))
])

# PCA

In [100]:
# Reduce the preprocessed feature matrix to 10 principal components.
pca = PCA(n_components=10)

# Feature Selection

In [101]:
# Univariate selection: keep the 5 columns with the highest f_classif scores.
selector = SelectKBest(score_func=f_classif, k=5)

# Pipeline

In [102]:
# Chain preprocessing -> PCA -> feature selection into a single estimator.
# The step names are the prefixes of the nested parameter paths accepted by
# pipeline.set_params (e.g. pca__n_components).
pipeline = Pipeline(
    steps=[
        ("transformation", transformer),
        ("pca", pca),
        ("feature_selection", selector),
    ]
)

In [103]:
# Fit every step on (X, y) and display the transformed feature matrix.
pipeline.fit_transform(X, y=y)

array([[ 8.52256696,  2.64397044, -1.57565677, -3.52420989, -2.60947195],
       [ 2.79623958, -3.89824767,  0.10426929, -1.61476221, -0.15804423],
       [ 4.56985298, -1.18416154, -0.23154002, -0.95078422,  0.12680809],
       ...,
       [ 1.05053099, -2.22225232,  1.11958438,  2.06978788,  1.97780945],
       [10.21620878,  0.39525002, -2.47257669,  1.09076934, -0.72400782],
       [-5.32259512, -0.24553988,  1.22278786,  1.40541489,  0.49452765]],
      shape=(569, 5))

In [104]:
# The same computation as the pipeline, executed one step at a time, to show
# that the pipeline is just these steps applied in sequence.
# Note: these are the same estimator objects that were passed to `pipeline`,
# so running this cell refits them in place.
X1 = transformer.fit_transform(X)     # imputation + scaling / one-hot encoding
X2 = pca.fit_transform(X1)            # projection onto principal components
X3 = selector.fit_transform(X2, y=y)  # score the components against y, keep k

In [105]:
# Display the step-by-step result so it can be compared with the pipeline output.
X3

array([[ 8.52256696,  2.64397044, -1.57565677, -3.52420989, -2.60947195],
       [ 2.79623958, -3.89824767,  0.10426929, -1.61476221, -0.15804423],
       [ 4.56985298, -1.18416154, -0.23154002, -0.95078422,  0.12680809],
       ...,
       [ 1.05053099, -2.22225232,  1.11958438,  2.06978788,  1.97780945],
       [10.21620878,  0.39525002, -2.47257669,  1.09076934, -0.72400782],
       [-5.32259512, -0.24553988,  1.22278786,  1.40541489,  0.49452765]],
      shape=(569, 5))

In [106]:
# Customization: a nested parameter is addressed as <step name>__<parameter>.
pipeline.set_params(
    pca__n_components=15,
    feature_selection__k=3,
)

In [107]:
# Refit with the updated PCA / selector settings and display the result.
pipeline.fit_transform(X, y=y)

array([[ 8.52256696,  2.64397044, -1.57565677],
       [ 2.79623958, -3.89824767,  0.10426929],
       [ 4.56985298, -1.18416154, -0.23154002],
       ...,
       [ 1.05053099, -2.22225232,  1.11958438],
       [10.21620878,  0.39525002, -2.47257669],
       [-5.32259512, -0.24553988,  1.22278786]], shape=(569, 3))

In [108]:
# Correct parameter path, from outermost to innermost:
#   pipeline step 'transformation' -> ColumnTransformer entry 'numerical'
#   -> sub-pipeline step 'imputer' -> parameter 'strategy'
pipeline.set_params(
    transformation__numerical__imputer__strategy="mean",
)

In [109]:
# Refit after switching the numeric imputer to the 'mean' strategy.
pipeline.fit_transform(X, y=y)

array([[ 8.53475991e+00,  2.61353308e+00, -1.49046306e+00],
       [ 2.73326666e+00, -3.71106307e+00, -1.19923237e-03],
       [ 4.63649131e+00, -1.19050208e+00, -2.87695276e-01],
       ...,
       [ 1.08873883e+00, -2.15395811e+00,  1.16642387e+00],
       [ 1.01791411e+01,  5.24429704e-01, -2.43315693e+00],
       [-5.31069499e+00, -2.64129617e-01,  1.39740879e+00]],
      shape=(569, 3))