# Column Transformer
A column transformer applies different preprocessing methods to different columns of a dataset.

In [1]:
import numpy as np
import pandas as pd

In [11]:
# Toy frame with one numeric and one text column; row 4 is missing in both
# so that each imputer has something to fill.
# Named `data` rather than `dict` so the builtin `dict` is not shadowed for
# the rest of the kernel session.
data = {
    'nums': [1, 2, 3, 4, np.nan, 6, 7, 8, 9, 10],
    'cats': ['apple', 'ball', 'cat', 'dog', np.nan, 'fan', 'giraffe', 'hockey', 'ink', 'jacket'],
}
df = pd.DataFrame(data)
df

Unnamed: 0,nums,cats
0,1.0,apple
1,2.0,ball
2,3.0,cat
3,4.0,dog
4,,
5,6.0,fan
6,7.0,giraffe
7,8.0,hockey
8,9.0,ink
9,10.0,jacket


In [4]:
from sklearn.impute import SimpleImputer
# Numeric imputer: missing entries are replaced by the column mean.
imp_mean = SimpleImputer(
    strategy='mean',
)
# Text imputer: missing entries are replaced by a fixed placeholder value.
imp_cat = SimpleImputer(
    fill_value='elephant',
    strategy='constant',
)

In [14]:
from sklearn.compose import make_column_transformer
ct = make_column_transformer(
    (imp_mean, ['nums']),   # numeric column: NaN -> column mean
    (imp_cat, ['cats']),    # text column: NaN -> constant placeholder
    remainder='passthrough',
    # Keep the input column names ('nums', 'cats') in the output instead of
    # prefixing them with the generated transformer names.
    verbose_feature_names_out=False,
).set_output(transform='pandas')  # return a DataFrame rather than a NumPy array
# remainder='passthrough' keeps every column that is not listed above unchanged
# and appends it after the transformed columns. `df` has no such columns, so it
# has no visible effect here.
# remainder='drop' (the default) discards the unlisted columns instead.

In [18]:
# Fit the imputers on `df` and apply them, then relabel the result with the
# input column names. The relabel is positional: it relies on the transformer
# output order matching the column order of `df`.
df_transformed = ct.fit_transform(df).set_axis(df.columns, axis=1)
df_transformed

Unnamed: 0,nums,cats
0,1.0,apple
1,2.0,ball
2,3.0,cat
3,4.0,dog
4,5.555556,elephant
5,6.0,fan
6,7.0,giraffe
7,8.0,hockey
8,9.0,ink
9,10.0,jacket


In [22]:
# Wider example frame: the same two columns with a missing row, plus two
# complete text columns. It is used to show keeping some unlisted columns
# while dropping the rest.
dict2 = {
    'nums': [1, 2, 3, 4, np.nan, 6, 7, 8, 9, 10],
    'cats': [
        'apple', 'ball', 'cat', 'dog', np.nan,
        'fan', 'giraffe', 'hockey', 'ink', 'jacket',
    ],
    'colors': [
        'black', 'white', 'blue', 'green', 'yellow',
        'orange', 'purple', 'red', 'pink', 'grey',
    ],
    'professions': [
        'physician', 'aviator', 'teacher', 'engineer', 'midwife',
        'manager', 'soldier', 'painter', 'musician', 'diplomat',
    ],
}
df2 = pd.DataFrame(data=dict2)
df2

Unnamed: 0,nums,cats,colors,professions
0,1.0,apple,black,physician
1,2.0,ball,white,aviator
2,3.0,cat,blue,teacher
3,4.0,dog,green,engineer
4,,,yellow,midwife
5,6.0,fan,orange,manager
6,7.0,giraffe,purple,soldier
7,8.0,hockey,red,painter
8,9.0,ink,pink,musician
9,10.0,jacket,grey,diplomat


In [24]:
# Impute 'nums' and 'cats', keep 'professions' untouched, and discard every
# other column ('colors' in df2).
ct2 = make_column_transformer(
    (imp_mean, ['nums']),
    (imp_cat, ['cats']),
    ('passthrough', ['professions']),   # listed explicitly, so it survives remainder='drop'
    remainder='drop',
    # Keep the input column names in the output. This replaces a hard-coded
    # positional relabel, which would silently mislabel columns if the
    # transformer list above were reordered or extended.
    verbose_feature_names_out=False,
).set_output(transform='pandas')  # return a DataFrame rather than a NumPy array
df2_transformed = ct2.fit_transform(df2)
df2_transformed

Unnamed: 0,nums,cats,professions
0,1.0,apple,physician
1,2.0,ball,aviator
2,3.0,cat,teacher
3,4.0,dog,engineer
4,5.555556,elephant,midwife
5,6.0,fan,manager
6,7.0,giraffe,soldier
7,8.0,hockey,painter
8,9.0,ink,musician
9,10.0,jacket,diplomat
