In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder


In [3]:
df = pd.read_csv('covid_toy.csv')
df.head()

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No


In [4]:
df.describe

<bound method NDFrame.describe of     age  gender  fever   cough       city has_covid
0    60    Male  103.0    Mild    Kolkata        No
1    27    Male  100.0    Mild      Delhi       Yes
2    42    Male  101.0    Mild      Delhi        No
3    31  Female   98.0    Mild    Kolkata        No
4    65  Female  101.0    Mild     Mumbai        No
..  ...     ...    ...     ...        ...       ...
95   12  Female  104.0    Mild  Bangalore        No
96   51  Female  101.0  Strong    Kolkata       Yes
97   20  Female  101.0    Mild  Bangalore        No
98    5  Female   98.0  Strong     Mumbai        No
99   10  Female   98.0  Strong    Kolkata       Yes

[100 rows x 6 columns]>

In [5]:
df.isnull().sum()
print(df.shape)

(100, 6)


In [6]:
df.notna().sum()

age          100
gender       100
fever         90
cough        100
city         100
has_covid    100
dtype: int64

**It means in 100 rows of all the above features mostly all the columns are filled but the fever column has 10 empty rows.**


In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=['has_covid']), df['has_covid'], test_size=0.2)

In [8]:
print(X_train.shape,
      X_test.shape,
      y_train.shape,
       y_test.shape)

''' now we have all the columns except our target that is "has_covid" column in our X & y  is for target.'''

(80, 5) (20, 5) (80,) (20,)


' now we have all the columns except our target that is "has_covid" column in our X & y  is for target.'

In [9]:
# now in this case we can use SimpleImputer to fill these missing values  in "Fever" column 

imputer = SimpleImputer()
X_train_fever = imputer.fit_transform(X_train[['fever']])

X_test_fever = imputer.fit_transform(X_test[['fever']])

X_train_fever.shape


(80, 1)

In [10]:
# Ordinal Encoding 
X_train['cough']
ordinal= OrdinalEncoder(categories=[['Mild', 'Strong']])
X_train_cough = ordinal.fit_transform(X_train[['cough']])

X_test_cough = ordinal.fit_transform(X_test[['cough']])




In [11]:
# One Hot encoding = columns will give theri respective valeus as new columns

ohe = OneHotEncoder(drop='first',sparse_output=False)
X_train_gender_city = ohe.fit_transform(X_train[['gender', 'city']])
X_test_gender_city = ohe.fit_transform(X_test[['gender', 'city']])
X_train_gender_city.shape

(80, 4)

In [12]:
X_train_gender_city

# so here we will see both columns gender & city after one hot encoded.

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 0., 0.],
       [1., 1., 0., 0.],
       [1., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 1., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.],
       [0., 0., 0., 0.],
       [1., 0., 0., 1.],
       [0., 0., 0., 1.],
       [0., 0., 1., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [1., 0., 1., 0.],
       [1., 0., 1., 0.],
       [1., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 1., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 0., 1.],
       [0., 1., 0., 0.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [1., 0., 1., 0.],


In [13]:
# now extracting back the columns
X_train_age = X_train.drop(columns=['gender', 'city', 'cough', 'fever']).values
X_test_age =  X_test.drop(columns=['gender', 'city', 'cough', 'fever']).values
X_train_age.shape

(80, 1)

In [14]:
# combining all the transformed columns into a new DF

X_train_transformed = np.concatenate((X_train_age,X_train_gender_city, X_train_fever, X_train_cough), axis=1)
X_test_transformed = np.concatenate((X_test_age,X_test_gender_city, X_test_fever, X_test_cough), axis=1)


In [15]:
X_train_transformed.shape

(80, 7)

## Now using the ColumnTransformer from SKLEARN.COMPOSE

In [16]:
from sklearn.compose import ColumnTransformer

In [17]:
transformer = ColumnTransformer(transformers=[
    ('trf1', SimpleImputer(), ['fever']),
    ('trf2', OrdinalEncoder(categories=[['Mild', 'Strong']]), ['cough']),
    ('trf3', OneHotEncoder(sparse_output=False, drop ='first'), ['gender', 'city'])
],   remainder='passthrough')


# here remaining columns are passed else we can drop them also by passing the remainder ='drop' argument.


In [18]:
transformer.fit_transform(X_train).shape

(80, 7)

In [19]:
transformer.fit_transform(X_test).shape

(20, 7)