# **Column Transformer**


## **Import Required Libraries**

In [1]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

## **Load and Explore Dataset**

In [2]:
# Load the dataset into a DataFrame; the relative path means 'covid_toy.csv'
# must be in the notebook's working directory.
df = pd.read_csv('covid_toy.csv')

In [3]:
# Preview the first five rows to check the column names and value formats.
df.head(n=5)

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No


In [4]:
# Count missing values per column (isna is the same check as isnull).
df.isna().sum()

age           0
gender        0
fever        10
cough         0
city          0
has_covid     0
dtype: int64

## **Split Data into Train and Test Sets**

In [5]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing. random_state pins the shuffle so the
# split (and every output derived from it) is the same on each
# Restart Kernel -> Run All instead of changing from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['has_covid']),  # features: every column except the target
    df['has_covid'],                 # target
    test_size=0.2,
    random_state=42,
)

In [6]:
# Display the training features produced by the split above.
X_train

Unnamed: 0,age,gender,fever,cough,city
31,83,Male,103.0,Mild,Kolkata
41,82,Male,,Mild,Kolkata
53,83,Male,98.0,Mild,Delhi
3,31,Female,98.0,Mild,Kolkata
30,15,Male,101.0,Mild,Delhi
...,...,...,...,...,...
92,82,Female,102.0,Strong,Kolkata
88,5,Female,100.0,Mild,Kolkata
37,55,Male,100.0,Mild,Kolkata
86,25,Male,104.0,Mild,Bangalore


## **Column-Transformer**

The `ColumnTransformer` applies a different transformer to each group of columns and concatenates the results, handling multiple transformations in a single object instead of transforming each column by hand.
- **Cleaner Code:** No manual concatenation required
- **Helps Prevent Data Leakage:** Call `fit()`/`fit_transform()` on the training data only, then reuse the fitted transformer with `transform()` on the test data
- **Maintainability:** Easy to modify transformations
- **Scalability:** Handles complex preprocessing pipelines efficiently

In [7]:
from sklearn.compose import ColumnTransformer

In [8]:
# Per-column preprocessing: each entry is (step name, transformer, columns).
transformer = ColumnTransformer(
    transformers=[
        # fever: fill missing values with the column mean
        # ('mean' is SimpleImputer's default strategy, spelled out here).
        ('fever_imputer', SimpleImputer(strategy='mean'), ['fever']),
        # cough: ordinal encoding with the explicit order Mild < Strong.
        ('cough_encoder', OrdinalEncoder(categories=[['Mild', 'Strong']]), ['cough']),
        # gender, city: dense one-hot encoding, dropping the first category
        # of each feature.
        (
            'categorical_encoder',
            OneHotEncoder(drop='first', sparse_output=False),
            ['gender', 'city'],
        ),
    ],
    # Columns not listed above (age) are passed through unchanged.
    remainder='passthrough',
)

In [9]:
# Fit the transformer on the training features and transform them in one step;
# show only the resulting (rows, columns) shape.
transformer.fit_transform(X_train).shape

(80, 7)

In [10]:
# Apply the already-fitted transformer to the test features (transform only,
# no refit) and show the resulting (rows, columns) shape.
transformer.transform(X_test).shape

(20, 7)

## **Second Method: Pipelines Inside a ColumnTransformer**

In [12]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split

# Reproducible, stratified split: random_state fixes the shuffle and
# stratify=y keeps the has_covid class proportions the same in train and test.
# NOTE: this rebinds X_train / X_test / y_train / y_test from the earlier split.
X = df.drop(columns=['has_covid'])
y = df['has_covid']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Feature groups: each group gets its own preprocessing pipeline.
numeric_features = ['age', 'fever']
ordinal_features = ['cough']
ordinal_categories = [['Mild', 'Strong']]  # explicit order: Mild < Strong
categorical_features = ['gender', 'city']

# Numeric columns: fill missing values with the median, then standardize.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Ordinal column: fill missing values with the most frequent category, then
# encode using the explicit order; categories outside the list are encoded as -1.
ordinal_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal', OrdinalEncoder(categories=ordinal_categories, handle_unknown='use_encoded_value', unknown_value=-1))
])

# Nominal columns: fill missing values with the literal 'missing', then
# one-hot encode into a dense array, dropping the first category per feature.
# Caveat: with handle_unknown='ignore' an unseen category is encoded as all
# zeros, which is the same encoding as the dropped first category.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', drop='first', sparse_output=False))
])

# Route each feature group to its pipeline. Every feature column is assigned
# to a group above, so remainder='passthrough' only matters if new columns
# are added to the data later.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('ord', ordinal_transformer, ordinal_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='passthrough'
)

# Fit on the training data only, then reuse the fitted preprocessor on the
# test data so no test-set statistics leak into the fitted steps.
X_train_proc = preprocessor.fit_transform(X_train)
X_test_proc = preprocessor.transform(X_test)

# Inspect the output feature names. This is called directly rather than inside
# a try/except: the notebook already requires a scikit-learn version that has
# get_feature_names_out (it uses OneHotEncoder(sparse_output=...)), and a broad
# `except Exception` would hide any real failure behind a misleading
# "not available for this version" message.
print(preprocessor.get_feature_names_out())

['num__age' 'num__fever' 'ord__cough' 'cat__gender_Male' 'cat__city_Delhi'
 'cat__city_Kolkata' 'cat__city_Mumbai']
