In [3]:
import numpy as np
import pandas as pd

In [4]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

In [5]:
df = pd.read_csv('covid_toy.csv')

In [6]:
df.head()

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   age        100 non-null    int64  
 1   gender     100 non-null    object 
 2   fever      90 non-null     float64
 3   cough      100 non-null    object 
 4   city       100 non-null    object 
 5   has_covid  100 non-null    object 
dtypes: float64(1), int64(1), object(4)
memory usage: 4.8+ KB


In [8]:
df['cough'].value_counts()

cough
Mild      62
Strong    38
Name: count, dtype: int64

In [9]:
df.isnull().sum()

age           0
gender        0
fever        10
cough         0
city          0
has_covid     0
dtype: int64

In [10]:
#Spliting the data
from sklearn.model_selection import train_test_split

In [11]:
X_train,X_test,Y_train,Y_test = train_test_split(df.drop(['has_covid'],axis=1),df['has_covid'],test_size=0.3,random_state=None)

In [12]:
X_train

Unnamed: 0,age,gender,fever,cough,city
60,24,Female,102.0,Strong,Bangalore
12,25,Female,99.0,Strong,Kolkata
11,65,Female,98.0,Mild,Mumbai
72,83,Female,101.0,Mild,Kolkata
25,23,Male,,Mild,Mumbai
...,...,...,...,...,...
8,19,Female,100.0,Strong,Bangalore
48,66,Male,99.0,Strong,Bangalore
38,49,Female,101.0,Mild,Delhi
41,82,Male,,Mild,Kolkata


1. Aam Jindgi

In [13]:
#Simeple imputer will replace all the missing value with the value of the Mean of that column
si = SimpleImputer()
si.fit(X_train[['fever']])
X_train_fever = si.transform(X_train[['fever']])
X_test_fever = si.transform(X_test[['fever']])

X_train_fever

array([[102.      ],
       [ 99.      ],
       [ 98.      ],
       [101.      ],
       [100.765625],
       [ 98.      ],
       [100.      ],
       [103.      ],
       [ 98.      ],
       [ 99.      ],
       [ 98.      ],
       [102.      ],
       [101.      ],
       [101.      ],
       [103.      ],
       [ 99.      ],
       [ 99.      ],
       [100.      ],
       [101.      ],
       [100.      ],
       [103.      ],
       [104.      ],
       [104.      ],
       [104.      ],
       [ 98.      ],
       [ 99.      ],
       [100.      ],
       [ 98.      ],
       [ 98.      ],
       [102.      ],
       [104.      ],
       [100.      ],
       [102.      ],
       [ 98.      ],
       [ 98.      ],
       [101.      ],
       [ 99.      ],
       [ 98.      ],
       [100.      ],
       [104.      ],
       [101.      ],
       [104.      ],
       [ 98.      ],
       [100.765625],
       [100.      ],
       [104.      ],
       [100.      ],
       [104. 

In [14]:
#Ordinal Encoding - Ordinal categorical column
OE = OrdinalEncoder(categories=[['Mild','Strong']])
OE.fit(X_train[['cough']])
X_train_cough = OE.transform(X_train[['cough']])
X_test_cough = OE.transform(X_test[['cough']])
X_train_cough.shape

(70, 1)

In [15]:
#One Hot Encoding - Nominal Categorical Data
OHE = OneHotEncoder(drop = 'first',sparse_output=False)
OHE.fit(X_train[['city','gender']])
X_train_city_gender = OHE.transform(X_train[['city','gender']])
X_test_city_gender = OHE.transform(X_test[['city','gender']])

In [16]:
X_train_city_gender.shape

(70, 4)

In [17]:
#Extrating the Age
X_train_age = X_train.drop(columns = ['gender','fever','cough','city']).values
X_test_age = X_test.drop(columns = ['gender','fever','cough','city']).values

In [18]:
X_train_transformed = np.concatenate((X_train_age,X_train_fever,X_train_city_gender,X_train_cough),axis = 1)
X_test_transformed = np.concatenate((X_test_age,X_test_fever,X_test_city_gender,X_test_cough),axis = 1)

In [19]:
X_train_transformed

array([[ 24.      , 102.      ,   0.      ,   0.      ,   0.      ,
          0.      ,   1.      ],
       [ 25.      ,  99.      ,   0.      ,   1.      ,   0.      ,
          0.      ,   1.      ],
       [ 65.      ,  98.      ,   0.      ,   0.      ,   1.      ,
          0.      ,   0.      ],
       [ 83.      , 101.      ,   0.      ,   1.      ,   0.      ,
          0.      ,   0.      ],
       [ 23.      , 100.765625,   0.      ,   0.      ,   1.      ,
          1.      ,   0.      ],
       [  5.      ,  98.      ,   0.      ,   0.      ,   1.      ,
          0.      ,   1.      ],
       [ 55.      , 100.      ,   0.      ,   1.      ,   0.      ,
          1.      ,   0.      ],
       [ 60.      , 103.      ,   0.      ,   1.      ,   0.      ,
          1.      ,   0.      ],
       [ 24.      ,  98.      ,   0.      ,   1.      ,   0.      ,
          1.      ,   0.      ],
       [ 14.      ,  99.      ,   0.      ,   0.      ,   1.      ,
          0.      ,   0

Mentos Zindgi

In [20]:
from sklearn.compose import ColumnTransformer

In [21]:
transformer = ColumnTransformer(transformers=[('tnf1',SimpleImputer(),['fever']),
                                   ('tnf2',OrdinalEncoder(categories=[['Mild','Strong']]),['cough']),
                                   ('tnf3',OneHotEncoder(sparse_output=False,drop = 'first'),['gender','city'])],
                                   remainder='passthrough')

In [22]:
transformer.fit(X_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [24]:
transformer.transform(X_train).shape
transformer.transform(X_test).shape

(30, 7)