## Importing libraries and understanding the data

In [2]:
import numpy as np
import pandas as pd

In [27]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder,LabelEncoder,OrdinalEncoder

In [5]:
df = pd.read_csv("covid_toy.csv")

In [6]:
df.sample(10)

Unnamed: 0,age,gender,fever,cough,city,has_covid
91,38,Male,,Mild,Delhi,Yes
6,14,Male,101.0,Strong,Bangalore,No
14,51,Male,104.0,Mild,Bangalore,No
39,50,Female,103.0,Mild,Kolkata,No
98,5,Female,98.0,Strong,Mumbai,No
81,65,Male,99.0,Mild,Delhi,No
46,19,Female,101.0,Mild,Mumbai,No
20,12,Male,98.0,Strong,Bangalore,No
21,73,Male,98.0,Mild,Bangalore,Yes
65,69,Female,102.0,Mild,Bangalore,No


In [7]:
df['cough'].value_counts() #viewing the categories involved in the cough column. it needs to be converted into numerical values

Mild      62
Strong    38
Name: cough, dtype: int64

In [10]:
df['city'].value_counts() #viewing the categories involved in the city column. it needs to be converted into numerical values

Kolkata      32
Bangalore    30
Delhi        22
Mumbai       16
Name: city, dtype: int64

In [9]:
df['city'].nunique()

4

In [18]:
df.isna().sum() #there are 10 null values in the fever column needs to take care of that as well

age           0
gender        0
fever        10
cough         0
city          0
has_covid     0
dtype: int64

In [19]:
df.describe()

Unnamed: 0,age,fever
count,100.0,90.0
mean,44.22,100.844444
std,24.878931,2.054926
min,5.0,98.0
25%,20.0,99.0
50%,45.0,101.0
75%,66.5,102.75
max,84.0,104.0


In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   age        100 non-null    int64  
 1   gender     100 non-null    object 
 2   fever      90 non-null     float64
 3   cough      100 non-null    object 
 4   city       100 non-null    object 
 5   has_covid  100 non-null    object 
dtypes: float64(1), int64(1), object(4)
memory usage: 4.8+ KB


## Splitting the data into training and testing set

In [21]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=['has_covid']),df['has_covid'],test_size=0.2)

In [22]:
X_train

Unnamed: 0,age,gender,fever,cough,city
61,81,Female,98.0,Strong,Mumbai
12,25,Female,99.0,Strong,Kolkata
20,12,Male,98.0,Strong,Bangalore
78,11,Male,100.0,Mild,Bangalore
9,64,Female,101.0,Mild,Delhi
...,...,...,...,...,...
48,66,Male,99.0,Strong,Bangalore
10,75,Female,,Mild,Delhi
35,82,Female,102.0,Strong,Bangalore
8,19,Female,100.0,Strong,Bangalore


In [23]:
X_test.shape

(20, 5)

## Preprocessing the data without using ColumnTransformer

Always remember whatever transformations you are applying on the training data, needs to be replicated on the testing data as well

In [24]:
#first we will use simple imputer to fill up the null values in the fever column
imputer = SimpleImputer()
X_train_fever = imputer.fit_transform(X_train[['fever']])
#applying on testing data
X_test_fever = imputer.fit_transform(X_test[['fever']])

In [26]:
X_train_fever[:10]

array([[ 98.],
       [ 99.],
       [ 98.],
       [100.],
       [101.],
       [104.],
       [101.],
       [103.],
       [102.],
       [ 98.]])

In [28]:
#Now we will encode cough column using ordinal encoding
oe = OrdinalEncoder(categories=[['Mild','Strong']])
X_train_cough = oe.fit_transform(X_train[['cough']])
X_test_cough = oe.fit_transform(X_test[['cough']])

In [30]:
X_train_cough[:10]

array([[1.],
       [1.],
       [1.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [1.]])

In [31]:
oe.get_feature_names_out()

array(['cough'], dtype=object)

In [38]:
#Now we will encode city,gender columns using one hot encoding
ohe = OneHotEncoder(drop='first',sparse_output=False)
X_train_gender_city = ohe.fit_transform(X_train[['gender','city']])
X_test_gender_city = ohe.fit_transform(X_test[['gender','city']])

In [39]:
X_train_gender_city[:10,:]

array([[0., 0., 0., 1.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 1.],
       [0., 0., 1., 0.],
       [0., 1., 0., 0.],
       [1., 0., 1., 0.]])

In [37]:
X_train.head()

Unnamed: 0,age,gender,fever,cough,city
61,81,Female,98.0,Strong,Mumbai
12,25,Female,99.0,Strong,Kolkata
20,12,Male,98.0,Strong,Bangalore
78,11,Male,100.0,Mild,Bangalore
9,64,Female,101.0,Mild,Delhi


In [40]:
ohe.get_feature_names_out()

array(['gender_Male', 'city_Delhi', 'city_Kolkata', 'city_Mumbai'],
      dtype=object)

In [41]:
#Now we will be extracting age from the current dataframe
X_train_age = X_train.drop(columns=['gender','fever','cough','city']).values
X_test_age = X_test.drop(columns=['gender','fever','cough','city']).values

In [42]:
X_train_age[:10]

array([[81],
       [25],
       [12],
       [11],
       [64],
       [18],
       [19],
       [69],
       [49],
       [34]])

In [43]:
#now we will combine all of the numpy arrays to create our data frame
X_train_new = np.concatenate((X_train_age,X_train_fever,X_train_gender_city,X_train_cough),axis=1)
X_test_new = np.concatenate((X_test_age,X_test_fever,X_test_gender_city,X_test_cough),axis=1)

In [44]:
X_train_new.shape

(80, 7)

In [45]:
X_train_new[:10]

array([[ 81.,  98.,   0.,   0.,   0.,   1.,   1.],
       [ 25.,  99.,   0.,   0.,   1.,   0.,   1.],
       [ 12.,  98.,   1.,   0.,   0.,   0.,   1.],
       [ 11., 100.,   1.,   0.,   0.,   0.,   0.],
       [ 64., 101.,   0.,   1.,   0.,   0.,   0.],
       [ 18., 104.,   0.,   0.,   0.,   0.,   0.],
       [ 19., 101.,   0.,   0.,   0.,   1.,   0.],
       [ 69., 103.,   0.,   0.,   1.,   0.,   0.],
       [ 49., 102.,   0.,   1.,   0.,   0.,   0.],
       [ 34.,  98.,   1.,   0.,   1.,   0.,   1.]])

In [46]:
X_test_new[:10]

array([[ 83.        , 103.        ,   1.        ,   0.        ,
          1.        ,   0.        ,   0.        ],
       [ 34.        , 101.05882353,   0.        ,   0.        ,
          0.        ,   1.        ,   1.        ],
       [ 23.        , 101.05882353,   1.        ,   0.        ,
          0.        ,   1.        ,   0.        ],
       [ 20.        , 102.        ,   1.        ,   1.        ,
          0.        ,   0.        ,   1.        ],
       [ 10.        ,  98.        ,   0.        ,   0.        ,
          1.        ,   0.        ,   1.        ],
       [ 14.        , 101.        ,   1.        ,   0.        ,
          0.        ,   0.        ,   1.        ],
       [ 83.        , 101.        ,   0.        ,   0.        ,
          1.        ,   0.        ,   0.        ],
       [ 14.        ,  99.        ,   0.        ,   0.        ,
          0.        ,   1.        ,   0.        ],
       [ 16.        , 104.        ,   1.        ,   0.        ,
          1.    

In [52]:
#Now we will try to convert it into the pandas dataframe once
df_transformed = pd.DataFrame(data=X_train_new,index=[i+1 for i in range(X_train_new.shape[0])],columns=['Age','Fever','Male','Delhi','Kolkata','Mumbai','Cough'])

In [54]:
df_transformed.head()

Unnamed: 0,Age,Fever,Male,Delhi,Kolkata,Mumbai,Cough
1,81.0,98.0,0.0,0.0,0.0,1.0,1.0
2,25.0,99.0,0.0,0.0,1.0,0.0,1.0
3,12.0,98.0,1.0,0.0,0.0,0.0,1.0
4,11.0,100.0,1.0,0.0,0.0,0.0,0.0
5,64.0,101.0,0.0,1.0,0.0,0.0,0.0


In [55]:
X_train.head()

Unnamed: 0,age,gender,fever,cough,city
61,81,Female,98.0,Strong,Mumbai
12,25,Female,99.0,Strong,Kolkata
20,12,Male,98.0,Strong,Bangalore
78,11,Male,100.0,Mild,Bangalore
9,64,Female,101.0,Mild,Delhi


This is what feels like if we need to individually preprocess each and every column. Its time consuming and can be complex if we huge number of columns

## Preprocessing the data using ColumnTransformer

In [56]:
from sklearn.compose import ColumnTransformer

In [59]:
#applying all of our transformations that we did above
transformer = ColumnTransformer(
    transformers=[
        ('transform1',SimpleImputer(),['fever']),
        ('transform2',OrdinalEncoder(categories=[['Mild','Strong']]),['cough']),
        ('transform3',OneHotEncoder(sparse_output=False,drop='first'),['gender','city'])
    ],
    remainder='passthrough'
)

In [60]:
transformer.fit_transform(X_train)

array([[ 98.        ,   1.        ,   0.        ,   0.        ,
          0.        ,   1.        ,  81.        ],
       [ 99.        ,   1.        ,   0.        ,   0.        ,
          1.        ,   0.        ,  25.        ],
       [ 98.        ,   1.        ,   1.        ,   0.        ,
          0.        ,   0.        ,  12.        ],
       [100.        ,   0.        ,   1.        ,   0.        ,
          0.        ,   0.        ,  11.        ],
       [101.        ,   0.        ,   0.        ,   1.        ,
          0.        ,   0.        ,  64.        ],
       [104.        ,   0.        ,   0.        ,   0.        ,
          0.        ,   0.        ,  18.        ],
       [101.        ,   0.        ,   0.        ,   0.        ,
          0.        ,   1.        ,  19.        ],
       [103.        ,   0.        ,   0.        ,   0.        ,
          1.        ,   0.        ,  69.        ],
       [102.        ,   0.        ,   0.        ,   1.        ,
          0.    

In [61]:
transformer.get_feature_names_out()

array(['transform1__fever', 'transform2__cough',
       'transform3__gender_Male', 'transform3__city_Delhi',
       'transform3__city_Kolkata', 'transform3__city_Mumbai',
       'remainder__age'], dtype=object)

And its done, just two lines of code and we are done with our transformation :)