In [1]:
import pandas as pd

In [3]:
# Load the toy COVID dataset; the path is resolved relative to the working directory.
covid_csv = "../files/covid_toy.csv"
data = pd.read_csv(covid_csv)

In [5]:
# Preview the first five rows to see the column names and value types.
data.head(n=5)

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No


**Column Transformer**

*`ColumnTransformer` in Scikit-learn is used to apply different transformations to different columns in a dataset. It’s especially useful when working with mixed data types (numerical and categorical).*

In [4]:
from sklearn.model_selection import train_test_split

In [6]:
# Split into train/test sets. Features and target are selected by column name
# rather than by position, so the split cannot silently pick the wrong columns
# if the CSV gains, loses, or reorders a column.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=["has_covid"]),  # features: every column except the target
    data["has_covid"],  # target label
    test_size=0.2,  # hold out 20% of the rows for testing
    random_state=5,  # fixed seed so the split is reproducible
)

In [7]:
# Preview the training features; the target column is no longer present.
X_train.head(n=5)

Unnamed: 0,age,gender,fever,cough,city
94,79,Male,,Strong,Kolkata
56,71,Male,,Strong,Kolkata
22,71,Female,98.0,Strong,Kolkata
39,50,Female,103.0,Mild,Kolkata
24,13,Female,100.0,Strong,Kolkata


In [10]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer

In [24]:
# ColumnTransformer takes a list of (name, transformer, columns) tuples and
# applies each transformer only to the columns listed alongside it.
transformer = ColumnTransformer(
    [
        # Fill in missing fever readings (SimpleImputer's default strategy is the mean).
        ("tnf1", SimpleImputer(), ["fever"]),
        # One-hot encode the nominal columns; drop="first" omits one dummy per feature.
        ("tnf2", OneHotEncoder(sparse_output=False, drop="first"), ["gender", "city"]),
        # Encode cough with an explicit category order: "Mild" first, then "Strong".
        ("tnf3", OrdinalEncoder(categories=[["Mild", "Strong"]]), ["cough"]),
    ],
    # Columns not named in any tuple above are kept as-is instead of being dropped.
    remainder="passthrough",
)

In [None]:
# Fit every configured transformer on the training features and return the
# transformed training data in a single step.
data1 = transformer.fit_transform(X_train)

In [28]:
# Check the transformed shape: rows are the training samples; columns are the
# imputed fever, the one-hot dummies for gender and city (one fewer than the
# number of categories each, because of drop="first"), the cough code, and the
# passthrough remainder.
data1.shape

(80, 7)