In [198]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [148]:
# Load the toy COVID dataset and preview the first rows.
# The path is relative, so covid_toy.csv must sit in the working directory.
df = pd.read_csv("covid_toy.csv")
df.head()

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No


In [149]:
# Column dtypes and non-null counts; a quick way to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   age        100 non-null    int64  
 1   gender     100 non-null    object 
 2   fever      90 non-null     float64
 3   cough      100 non-null    object 
 4   city       100 non-null    object 
 5   has_covid  100 non-null    object 
dtypes: float64(1), int64(1), object(4)
memory usage: 4.8+ KB


In [150]:
# Missing-value count per column (isna is the same check as isnull)
df.isna().sum()

age           0
gender        0
fever        10
cough         0
city          0
has_covid     0
dtype: int64

#### Understanding the columns
age, fever = numerical columns with fever having 10 missing values<br>
gender, city = Nominal categorical columns (OneHotEncoder)<br>
cough = Ordinal categorical column (OrdinalEncoder)<br>
has_covid = Target label, binary categorical column (LabelEncoder)

In [152]:
# Separate the feature columns from the target column
X = df.drop(columns=["has_covid"])
y = df["has_covid"]

In [153]:
# Preview the feature frame to confirm the target column is no longer present.
X.head()

Unnamed: 0,age,gender,fever,cough,city
0,60,Male,103.0,Mild,Kolkata
1,27,Male,100.0,Mild,Delhi
2,42,Male,101.0,Mild,Delhi
3,31,Female,98.0,Mild,Kolkata
4,65,Female,101.0,Mild,Mumbai


In [154]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [155]:
# Check the size of the held-out split (rows, feature columns).
X_test.shape

(20, 5)

<h3 style='color: red'>Performing Preprocessing for Individual columns</h3>

In [156]:
# Mean-impute fever: learn the mean on the training split only,
# then apply that same value to both splits
imputer = SimpleImputer(strategy="mean")
imputer.fit(X_train[["fever"]])
imputed_X_train_fever = imputer.transform(X_train[["fever"]])
imputed_X_test_fever = imputer.transform(X_test[["fever"]])

In [157]:
# Distinct cough levels; needed to spell out the category order for the ordinal encoder
df["cough"].unique()

array(['Mild', 'Strong'], dtype=object)

In [158]:
# Encode cough as an ordered category using the explicit order Mild < Strong
oe = OrdinalEncoder(categories=[["Mild", "Strong"]])
oe.fit(X_train[["cough"]])
imputed_X_train_cough = oe.transform(X_train[["cough"]])
imputed_X_test_cough = oe.transform(X_test[["cough"]])

In [159]:
# One-hot encode the nominal columns; drop='first' removes one dummy per column
ohe = OneHotEncoder(drop='first', sparse_output=False, dtype='int32')
ohe.fit(X_train[["gender", "city"]])
imputed_X_train_gender_city = ohe.transform(X_train[["gender", "city"]])
imputed_X_test_gender_city = ohe.transform(X_test[["gender", "city"]])

In [160]:
# Shape check on the one-hot encoded gender/city block for the test split.
imputed_X_test_gender_city.shape

(20, 4)

In [166]:
# age needs no preprocessing; select it as a one-column frame so the result
# stays 2-D and can be concatenated with the other blocks
X_train_age = X_train[["age"]].values
X_test_age = X_test[["age"]].values

In [174]:
# Confirm the age block is 2-D (rows, 1) before it is concatenated column-wise.
X_train_age.shape

(80, 1)

In [163]:
# Confirm the one-hot block has the same number of dimensions as the other pieces.
imputed_X_train_gender_city.ndim

2

In [192]:
# Stitch the individually preprocessed pieces back into one feature matrix per split.
# Column order: age, one-hot gender/city, fever, cough.
preprocessed_X_train = np.concatenate(
    (X_train_age, imputed_X_train_gender_city, imputed_X_train_fever, imputed_X_train_cough),
    axis=1,
)
# Assemble the test split the same way so its column layout matches the training
# matrix; without this the transformed test pieces are never combined.
preprocessed_X_test = np.concatenate(
    (X_test_age, imputed_X_test_gender_city, imputed_X_test_fever, imputed_X_test_cough),
    axis=1,
)

In [194]:
# Inspect the assembled training matrix.
# Column order follows the concatenation: age, one-hot gender/city, fever, cough.
preprocessed_X_train

array([[ 22.        ,   0.        ,   0.        ,   0.        ,
          0.        ,  99.        ,   0.        ],
       [ 56.        ,   0.        ,   0.        ,   0.        ,
          0.        , 104.        ,   1.        ],
       [ 31.        ,   0.        ,   0.        ,   1.        ,
          0.        ,  98.        ,   0.        ],
       [ 75.        ,   0.        ,   1.        ,   0.        ,
          0.        , 104.        ,   1.        ],
       [ 72.        ,   1.        ,   0.        ,   0.        ,
          0.        ,  99.        ,   0.        ],
       [ 66.        ,   1.        ,   0.        ,   0.        ,
          0.        ,  99.        ,   1.        ],
       [ 14.        ,   1.        ,   0.        ,   0.        ,
          0.        , 101.        ,   1.        ],
       [ 10.        ,   0.        ,   0.        ,   1.        ,
          0.        ,  98.        ,   1.        ],
       [ 24.        ,   1.        ,   0.        ,   1.        ,
          0.    

<h3 style='color: red'>Using Column Transformer</h3>

In [202]:
# The same preprocessing declared in one object: each entry is
# (name, transformer, columns it applies to). Columns that are not listed
# are passed through unchanged because of remainder='passthrough'.
transformers = ColumnTransformer(
    remainder='passthrough',
    transformers=[
        ('transformer1', SimpleImputer(), ["fever"]),
        ('transformer2', OrdinalEncoder(categories=[['Mild', 'Strong']]), ["cough"]),
        (
            'transformer3',
            OneHotEncoder(drop='first', dtype='int32', sparse_output=False),
            ["gender", "city"],
        ),
    ],
)

In [None]:
# Fit the column transformer on the training split only, then reuse the fitted
# transformers on the test split so no test statistics leak into preprocessing.
# transform() requires the data to transform; calling it with no argument raises TypeError.
transformed_X_train = transformers.fit_transform(X_train)
transformed_X_test = transformers.transform(X_test)

# Both splits should have the same number of columns.
transformed_X_train.shape, transformed_X_test.shape