# Importing library

In [1]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

# Importing Dataset 

In [2]:
df = pd.read_csv('covid_toy.csv')
df.head()
                 

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No


Gender and City are Nominal data wo se can apply OneHotEncodind scheme to them 

cough is ordinal data so we can apply ordinal encoding scheme to it 

has_covid is output column so we can apply label encoding to it 

# 1.train_test_split

In [3]:
from sklearn.model_selection import train_test_split

In [5]:
X_train,X_test,y_train,y_test = train_test_split(df.drop('has_covid',axis=1),
                                                df['has_covid'],test_size=0.2)

In [6]:
X_train.head()   # input variables 

Unnamed: 0,age,gender,fever,cough,city
86,25,Male,104.0,Mild,Bangalore
38,49,Female,101.0,Mild,Delhi
84,69,Female,98.0,Strong,Mumbai
14,51,Male,104.0,Mild,Bangalore
73,34,Male,98.0,Strong,Kolkata


In [7]:
y_train.head()    # output Variables 

86    Yes
38    Yes
84     No
14     No
73    Yes
Name: has_covid, dtype: object

# 2.Applying Encoding Schemes Without Colum Trasformation

### 2.1 Filling Null values that are present in fever by SimpleImputer

In [9]:
X_train.isna().sum()   # 8 null values in fever colum of X train 

age       0
gender    0
fever     8
cough     0
city      0
dtype: int64

In [10]:
X_test.isna().sum()   #there are 2 null values in the X test data 

age       0
gender    0
fever     2
cough     0
city      0
dtype: int64

In [16]:
# filing the values By simple Imputer (replace by mean) in X_train colum
si=SimpleImputer()
X_train_fever=si.fit_transform(X_train[['fever']])

## filing the values By simple Imputer (replace by mean) in X_test colum
X_test_fever=si.fit_transform(X_test[['fever']])

In [18]:
X_train_fever.shape   # has 80 values 20 values in test and one colum of fever 

(80, 1)

### 2.2 Ordinal Encoding 

Ordinal encoading on cough colum

In [20]:
df.cough.unique()   # 2 unique values Strong is higher than Mild 

array(['Mild', 'Strong'], dtype=object)

In [24]:
#  Encoding X train Cough  
oe=OrdinalEncoder(categories=[['Mild','Strong']])  # making object of ordinal encoder  with precedence
X_train_cough=oe.fit_transform(X_train[['cough']])
                              
#  Encoding X test Cough                                
X_test_cough=oe.fit_transform(X_test[['cough']])                             

### 2.3 OneHotEncoding 

onehotencoding on gender and city 

In [26]:
# onehotencoding on gender and city On X_train 
ohe=OneHotEncoder(drop="first", sparse=False )# making an object of OnehotEncoder 
X_train_gender_city=ohe.fit_transform(X_train[['gender','city']])

# onehotencoding on gender and city On X_test 
X_test_gender_city=ohe.fit_transform(X_test[['gender','city']])



In [32]:
X_train_gender_city   # ecoded succesfully 

array([[1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 0., 1.],
       [1., 0., 0., 0.],
       [1., 0., 1., 0.],
       [1., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [1., 1., 0., 0.],
       [0., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 0.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.],
       [0., 0., 1., 0.],
       [1., 1., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 1.],
       [1., 1., 0., 0.],
       [1., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 1., 0., 0.],
       [0., 0., 0., 0.],
       [1., 1., 0., 0.],
       [0., 0., 1., 0.],
       [1., 0., 0., 1.],
       [0., 0., 0., 0.],
       [0., 0., 0., 1.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 0., 1.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 0.],
       [1., 0., 0., 0.],


### 2.4Extract Remaining Colum  

The age colums are already in numeric so we extraxt it as it is in an array  

In [30]:
#Extracting age in X_train 
X_train_age=X_train.drop(columns=['gender','fever','cough','city']).values  # .values returns an array 

#Extracting age in X_test
X_test_age=X_test.drop(columns=['gender','fever','cough','city']).values  # .values returns an array 

In [31]:
X_train_age.shape   # 

(80, 1)

All the Input colums Are encoaded 

### 2.5 Concate all the Encoded columns into one array 

In [33]:
#Concate the Train data 
X_train_transformed = np.concatenate((X_train_age,X_train_fever,X_train_gender_city,X_train_cough),axis=1)
# also the test data
X_test_transformed = np.concatenate((X_test_age,X_test_fever,X_test_gender_city,X_test_cough),axis=1)

X_train_transformed.shape  # total we have 7 columns in Xtrain 

(80, 7)

# 3.Encoding With Column Transformation

In [35]:
from sklearn.compose import ColumnTransformer  # inporting column Transformer

In [36]:
transformer = ColumnTransformer(transformers=[
    ('tnf1',SimpleImputer(),['fever']),  #  tnf1 is the name  of trasnformer 
    ('tnf2',OrdinalEncoder(categories=[['Mild','Strong']]),['cough']),
    ('tnf3',OneHotEncoder(sparse=False,drop='first'),['gender','city'])
],remainder='passthrough')    # remainder is defined as remaining columns it have 2 values 'drop' means drop remaining columns and 'passthrough' means renain that columns as it is  

In [39]:
# Fiting the trasformer model on X train 
X_train_transformed =transformer.fit_transform(X_train)

# Fiting the trasformer model on X test
X_test_transformed =transformer.fit_transform(X_test)




In [41]:
X_train_transformed.shape   # exactly same shape of matrix 

(80, 7)