The ColumnTransformer is a tool provided by Scikit-learn that lets you apply different preprocessing techniques to specific columns of your dataset in a single step.

Apply scaling to numerical features :- StandardScaler
Apply encoding to categorical features :- OneHotEncoder (unordered categories), OrdinalEncoder (ordered categories to numerical)

In [34]:
# The SimpleImputer is a preprocessing tool in Scikit-learn used to handle missing data in a dataset. It provides an easy way to fill (impute) missing values with a specified strategy, 
# such as replacing them with the mean, median, mode (most frequent value), or a constant value.

In [5]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

In [6]:
# Load covid_toy.csv (looked up relative to the current working directory).
df = pd.read_csv("covid_toy.csv")

In [7]:
df

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No
...,...,...,...,...,...,...
95,12,Female,104.0,Mild,Bangalore,No
96,51,Female,101.0,Strong,Kolkata,Yes
97,20,Female,101.0,Mild,Bangalore,No
98,5,Female,98.0,Strong,Mumbai,No


In [8]:
# Count the missing values in each column (axis=0 sums down the rows).
df.isnull().sum(axis=0)

age           0
gender        0
fever        10
cough         0
city          0
has_covid     0
dtype: int64

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing.  'has_covid' is the target; every other
# column is a feature.  random_state pins the shuffle so the split, and every
# result derived from it, is the same on each run.
x_train, x_test, y_train, y_test = train_test_split(
    df.drop(columns=['has_covid']),
    df['has_covid'],
    test_size=0.2,
    random_state=42,
)

In [10]:
y_test

25     No
65     No
80    Yes
62    Yes
11    Yes
69     No
34    Yes
60    Yes
22    Yes
12     No
46     No
98     No
85    Yes
29    Yes
61     No
88     No
38    Yes
6      No
9      No
95     No
Name: has_covid, dtype: object

NORMAL_WAY

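# Both OrdinalEncoder and OneHotEncoder are preprocessing tools in Scikit-learn used to transform categorical data into numerical format.
# However, they serve different purposes and are used in different scenarios:
# OrdinalEncoder maps each category to a single integer and suits ordered categories (e.g. Mild < Strong),
# while OneHotEncoder creates one binary column per category and suits unordered categories (e.g. city).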

In [23]:
# Fill the missing values in 'fever' (the only column with nulls) using
# SimpleImputer, whose default strategy is the column mean.
si = SimpleImputer()
# Learn the fill value from the training data only ...
x_train_fever = si.fit_transform(x_train[['fever']])

# ... and reuse that fitted value for the test data.  Refitting on the test
# split would impute with test-set statistics and leak them into preprocessing.
x_test_fever = si.transform(x_test[['fever']])

# x_train_si

In [24]:
# Ordinal-encode 'cough' with an explicit category order: 'Mild' first, 'Strong' second.
oe = OrdinalEncoder(categories=[['Mild', 'Strong']])
# Learn the mapping from the training column only.
oe.fit(x_train[['cough']])
# Apply the same fitted mapping to both splits.
x_train_cough = oe.transform(x_train[['cough']])
x_test_cough = oe.transform(x_test[['cough']])

# OneHotEncoding -> gender,city

In [25]:
# One-hot encode the unordered columns.  drop='first' removes one dummy column
# per feature; sparse=False returns a dense NumPy array.
# NOTE: scikit-learn 1.2 renamed `sparse` to `sparse_output` (the old name was
# removed in 1.4) -- switch the keyword when upgrading.
ohe = OneHotEncoder(drop='first', sparse=False)
x_train_gender_city = ohe.fit_transform(x_train[['gender', 'city']])

# Encode the test data with the encoder fitted on the training data.  Refitting
# here could yield a different number or order of columns whenever the test
# split is missing a category.
x_test_gender_city = ohe.transform(x_test[['gender', 'city']])

x_train_gender_city.shape



(80, 4)

<!-- Extract Age -->

# Extract Age

In [26]:
# 'age' needs no preprocessing: drop the four columns that are transformed
# separately and keep what remains as a 2-D NumPy array.
transformed_separately = ['gender', 'fever', 'cough', 'city']
x_train_age = x_train.drop(columns=transformed_separately).values

# Same selection for the test split.
x_test_age = x_test.drop(columns=transformed_separately).values

x_train_age.shape

(80, 1)

# concat all 

In [27]:
# Stitch the separately processed pieces back together column-wise, in the
# order: age, fever, one-hot gender/city, ordinal cough.
x_train_transformed = np.concatenate(
    (x_train_age, x_train_fever, x_train_gender_city, x_train_cough),
    axis=1,
)
# Build the test matrix the same way so both splits are assembled in the same
# column order.
x_test_transformed = np.concatenate(
    (x_test_age, x_test_fever, x_test_gender_city, x_test_cough),
    axis=1,
)

x_train_transformed.shape

(80, 7)

# MENTOS ZINDAGI (the easy way): ColumnTransformer

In [28]:
from sklearn.compose import ColumnTransformer

In [29]:
# One (name, transformer, columns) triple per column group.
column_steps = [
    ('tn1', SimpleImputer(), ['fever']),
    ('tn2', OrdinalEncoder(categories=[['Mild', 'Strong']]), ['cough']),
    ('tn3', OneHotEncoder(drop='first', sparse=False), ['gender', 'city']),
]
# remainder='passthrough' keeps the columns not listed above instead of
# dropping them.
transformer = ColumnTransformer(
    transformers=column_steps,
    remainder='passthrough',
)

In [30]:
transformer

In [31]:
# Fit every step on the training data, then transform that same data and
# report the shape of the result.
transformer.fit(x_train)
transformer.transform(x_train).shape



(80, 7)

In [32]:
# Transform the test data with the transformer object (transform only, no
# fit call here) and report the shape of the result.
transformer.transform(x_test).shape

(20, 7)