## Column Transformer

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer        # used to replace missing values in the column (mean, median, most_frequent, constant)
from sklearn.compose import ColumnTransformer   # used to make transformers

In [3]:
df = pd.read_csv("covid_toy.csv")
df.head()

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No


In [4]:
x_train, x_test, y_train, y_test = train_test_split(df.drop(columns='has_covid'), df['has_covid'], test_size=0.2)

print(df.isna().sum())      # only fever column contains null cells

"""
one hot encoding -- gender, city
simple imputer -- fever
ordinal encoding -- cough
label encoding -- has_covid

"""



age           0
gender        0
fever        10
cough         0
city          0
has_covid     0
dtype: int64


'\none hot encoding -- gender, city\nsimple imputer -- fever\nordinal encoding -- cough\nlabel encoding -- has_covid\n\n'

### without using column transformer

In [5]:
# using simple imputer
si = SimpleImputer()
# imputer = SimpleImputer(strategy='most_frequent')             # [(mean, median, most_frequent, constant)]

x_train_fever = si.fit_transform(x_train[['fever']])        # SimpleImputer will replace all the missing values with mean of the column
x_test_fever = si.transform(x_test[['fever']])



# using ordinal encoding
oe = OrdinalEncoder(categories=[['Mild', 'Strong']])
x_train_cough = oe.fit_transform(x_train[['cough']])
x_test_cough = oe.transform(x_test[['cough']])



# using one hot encoding
ohe = OneHotEncoder(drop='first', sparse_output=False) # using k-1
x_train_gender_city = ohe.fit_transform(x_train[['gender', 'city']])
x_test_gender_city = ohe.transform(x_test[['gender', 'city']])
print(f"x_train_gender_city : {x_train_gender_city.shape}")         # it holds 80 rows and 4 columns

# extracting age
x_train_age = x_train.drop(columns=['gender',  'fever', 'cough', 'city']).values
x_test_age = x_test.drop(columns=['gender',  'fever', 'cough', 'city']).values


# adding all the data into a single array
x_train_transformed = np.concatenate((x_train_age, x_train_fever, x_train_gender_city, x_train_cough), axis=1)
x_test_transformed = np.concatenate((x_test_age, x_test_fever, x_test_gender_city, x_test_cough), axis=1)

pd.DataFrame(x_train_transformed)




x_train_gender_city : (80, 4)


Unnamed: 0,0,1,2,3,4,5,6
0,19.0,101.000000,0.0,0.0,0.0,1.0,0.0
1,23.0,100.943662,1.0,0.0,0.0,1.0,0.0
2,65.0,98.000000,0.0,0.0,0.0,1.0,0.0
3,38.0,101.000000,0.0,0.0,0.0,0.0,0.0
4,79.0,100.943662,1.0,0.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...
75,15.0,101.000000,1.0,1.0,0.0,0.0,0.0
76,71.0,100.943662,1.0,0.0,1.0,0.0,1.0
77,73.0,103.000000,0.0,1.0,0.0,0.0,0.0
78,12.0,98.000000,1.0,0.0,0.0,0.0,1.0


### using column transformer

In [None]:
transformer = ColumnTransformer(transformers=[
    ('tnf1', SimpleImputer(), ['fever']),
    ('tnf2', OrdinalEncoder(categories=[['Mild', 'Strong']]), ['cough']),
    ('tnf3', OneHotEncoder(sparse_output=False, drop='first'), ['gender', 'city'])
], remainder='passthrough')        # two options in 'remainder' -- drop /  passthrough

"""
 ---- WHAT DOES 'PASSTHROUGH' MEANS ---- 

It keeps all the columns that you did not list in the transformers.
It means "apply the specified transformations to fever, cough, gender, city".

Every other column in the dataframe should pass through unchanged into the output.

"""


x_train_final = transformer.fit_transform(x_train)
x_test_final = transformer.transform(x_train)

# converting to DataFrame with column names
cols = (
    ['fever', 'cough'] +
    list(transformer.named_transformers_['tnf3'].get_feature_names_out(['gender', 'city'])) +       # # get feature names from OneHotEncoder (tnf3)
    [col for col in x_train.columns if col not in ['fever', 'cough', 'gender', 'city']]
)



x_train_final = pd.DataFrame(x_train_final, columns=cols)
x_test_final = pd.DataFrame(x_test_final, columns=cols)

x_train_final.head()

Unnamed: 0,fever,cough,gender_Male,city_Delhi,city_Kolkata,city_Mumbai,age
0,101.0,0.0,0.0,0.0,0.0,1.0,19.0
1,100.943662,0.0,1.0,0.0,0.0,1.0,23.0
2,98.0,0.0,0.0,0.0,0.0,1.0,65.0
3,101.0,0.0,0.0,0.0,0.0,0.0,38.0
4,100.943662,1.0,1.0,0.0,1.0,0.0,79.0


In [7]:

"""
Here we are creating the final list of column names for our transformed DataFrame.

1. ['fever', 'cough']
   - These are the first two columns that we transformed using SimpleImputer and OrdinalEncoder.

2. list(transformer.named_transformers_['tnf3'].get_feature_names_out(['gender', 'city']))
   - 'tnf3' refers to the OneHotEncoder applied on ['gender', 'city'].
   - by using name_transformers_, we are accessing the tnf3. Its a tool to access the specific transformers
   - get_feature_names_out() automatically generates names for the new columns created after one-hot encoding.
     Example: ['gender_Male', 'city_Delhi', 'city_Mumbai']

3. [col for col in x_train.columns if col not in ['fever', 'cough', 'gender', 'city']]
   - This part adds back any columns that were not transformed (passed through because of remainder='passthrough').
   - Example: ['age']

The final 'cols' list combines all these parts, resulting in:
['fever', 'cough', 'gender_Male', 'city_Delhi', 'city_Mumbai', 'age']

______________________________________________________________________________________________________________________________

named_transformers_ is an attribute of ColumnTransformer.

After you fit a ColumnTransformer, it stores each transformer (like SimpleImputer, OrdinalEncoder, OneHotEncoder)
under the name you gave it inside the 'transformers' list.

Example:
transformer = ColumnTransformer([
    ('tnf1', SimpleImputer(), ['fever']),
    ('tnf2', OrdinalEncoder(), ['cough']),
    ('tnf3', OneHotEncoder(), ['gender', 'city'])
])

After fitting:
transformer.named_transformers_ gives you a dictionary like:
{
  'tnf1': SimpleImputer(...),
  'tnf2': OrdinalEncoder(...),
  'tnf3': OneHotEncoder(...)
}

So when we write transformer.named_transformers_['tnf3'],
we are directly accessing the fitted OneHotEncoder object by its name ('tnf3').
This allows us to call its methods, like get_feature_names_out(), to get the new encoded column names.
"""

"\nHere we are creating the final list of column names for our transformed DataFrame.\n\n1. ['fever', 'cough']\n   - These are the first two columns that we transformed using SimpleImputer and OrdinalEncoder.\n\n2. list(transformer.named_transformers_['tnf3'].get_feature_names_out(['gender', 'city']))\n   - 'tnf3' refers to the OneHotEncoder applied on ['gender', 'city'].\n   - by using name_transformers_, we are accessing the tnf3. Its a tool to access the specific transformers\n   - get_feature_names_out() automatically generates names for the new columns created after one-hot encoding.\n     Example: ['gender_Male', 'city_Delhi', 'city_Mumbai']\n\n3. [col for col in x_train.columns if col not in ['fever', 'cough', 'gender', 'city']]\n   - This part adds back any columns that were not transformed (passed through because of remainder='passthrough').\n   - Example: ['age']\n\nThe final 'cols' list combines all these parts, resulting in:\n['fever', 'cough', 'gender_Male', 'city_Delhi', 

In [8]:
"""
ColumnTransformer does not support direct dictionary-style access.

- Before fitting, you can't access individual transformers at all.
- After fitting, the fitted transformers are stored inside the attribute
  transformer.named_transformers_ as a dictionary.

So this works:
    transformer.named_transformers_['tnf3']

But this does NOT work:
    transformer['tnf3']

Reason:
ColumnTransformer is not a Python dictionary or sklearn Pipeline; it doesn’t implement __getitem__().
Only Pipeline supports that syntax (e.g., pipeline['scaler']).
"""


"\nColumnTransformer does not support direct dictionary-style access.\n\n- Before fitting, you can't access individual transformers at all.\n- After fitting, the fitted transformers are stored inside the attribute\n  transformer.named_transformers_ as a dictionary.\n\nSo this works:\n    transformer.named_transformers_['tnf3']\n\nBut this does NOT work:\n    transformer['tnf3']\n\nReason:\nColumnTransformer is not a Python dictionary or sklearn Pipeline; it doesn’t implement __getitem__().\nOnly Pipeline supports that syntax (e.g., pipeline['scaler']).\n"