In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.impute import SimpleImputer #imported SimpleImputer class because 
#we have missing values
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

In [4]:
df = pd.read_csv('covid_toy.csv')
df.head()

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No


In [5]:
df['cough'].value_counts()

cough
Mild      62
Strong    38
Name: count, dtype: int64

In [6]:
df['city'].value_counts()

city
Kolkata      32
Bangalore    30
Delhi        22
Mumbai       16
Name: count, dtype: int64

In [7]:
df.isnull().sum()

age           0
gender        0
fever        10
cough         0
city          0
has_covid     0
dtype: int64

In [9]:
#train test split
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test = train_test_split(df.drop(columns=['has_covid']),
                                                 df['has_covid'],test_size=0.2)

In [14]:
from sklearn.compose import ColumnTransformer

In [23]:
transformer = ColumnTransformer(transformers=[
    ('tnf1',SimpleImputer(),['fever']), #fillled missing values
    ('tnf2',OrdinalEncoder(categories=[['Mild','Strong']]),['cough']), #ordered text to numbers i.e. 0 and 1
    ('tnf3',OneHotEncoder(sparse_output=False,drop='first'),['gender','city']) #nominal data into binary data
],remainder='passthrough')

In [31]:
array = transformer.fit_transform(X_train) #converted whole dataset into numbers without 
#changes in actual information
df = pd.DataFrame(array)
print(df)

             0    1    2    3    4    5     6
0   100.971831  0.0  0.0  0.0  0.0  0.0  84.0
1   104.000000  1.0  0.0  1.0  0.0  0.0  75.0
2   103.000000  0.0  0.0  0.0  1.0  0.0  69.0
3   101.000000  0.0  0.0  0.0  0.0  1.0  65.0
4    98.000000  1.0  0.0  0.0  0.0  1.0   5.0
..         ...  ...  ...  ...  ...  ...   ...
75  101.000000  1.0  0.0  1.0  0.0  0.0  68.0
76  101.000000  0.0  1.0  1.0  0.0  0.0  19.0
77   98.000000  0.0  0.0  0.0  1.0  0.0  26.0
78  101.000000  0.0  0.0  0.0  1.0  0.0  83.0
79  103.000000  0.0  0.0  0.0  1.0  0.0  50.0

[80 rows x 7 columns]


In [30]:
transformer.transform(X_test)

array([[101.        ,   0.        ,   0.        ,   0.        ,
          0.        ,   1.        ,  81.        ],
       [102.        ,   1.        ,   0.        ,   0.        ,
          0.        ,   0.        ,  82.        ],
       [ 98.        ,   0.        ,   1.        ,   0.        ,
          0.        ,   0.        ,  73.        ],
       [102.        ,   0.        ,   1.        ,   0.        ,
          0.        ,   0.        ,  64.        ],
       [100.        ,   0.        ,   1.        ,   0.        ,
          1.        ,   0.        ,  55.        ],
       [100.        ,   0.        ,   0.        ,   0.        ,
          1.        ,   0.        ,   5.        ],
       [101.        ,   0.        ,   1.        ,   1.        ,
          0.        ,   0.        ,  15.        ],
       [ 98.        ,   1.        ,   0.        ,   0.        ,
          0.        ,   1.        ,  69.        ],
       [100.97183099,   0.        ,   1.        ,   0.        ,
          0.    