In [2]:
import pandas as pd
import numpy as np

In [3]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder

In [13]:
df = pd.read_csv("covid_toy.csv")
df

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No
...,...,...,...,...,...,...
95,12,Female,104.0,Mild,Bangalore,No
96,51,Female,101.0,Strong,Kolkata,Yes
97,20,Female,101.0,Mild,Bangalore,No
98,5,Female,98.0,Strong,Mumbai,No


In [14]:
df.shape

(100, 6)

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   age        100 non-null    int64  
 1   gender     100 non-null    object 
 2   fever      90 non-null     float64
 3   cough      100 non-null    object 
 4   city       100 non-null    object 
 5   has_covid  100 non-null    object 
dtypes: float64(1), int64(1), object(4)
memory usage: 4.8+ KB


In [16]:
df.head()

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No


In [17]:
df.tail()

Unnamed: 0,age,gender,fever,cough,city,has_covid
95,12,Female,104.0,Mild,Bangalore,No
96,51,Female,101.0,Strong,Kolkata,Yes
97,20,Female,101.0,Mild,Bangalore,No
98,5,Female,98.0,Strong,Mumbai,No
99,10,Female,98.0,Strong,Kolkata,Yes


In [19]:
df.sample()

Unnamed: 0,age,gender,fever,cough,city,has_covid
90,59,Female,99.0,Strong,Delhi,No


In [20]:
df.describe()

Unnamed: 0,age,fever
count,100.0,90.0
mean,44.22,100.844444
std,24.878931,2.054926
min,5.0,98.0
25%,20.0,99.0
50%,45.0,101.0
75%,66.5,102.75
max,84.0,104.0


In [21]:
df.isnull().sum()

age           0
gender        0
fever        10
cough         0
city          0
has_covid     0
dtype: int64

In [54]:
x = df.iloc[:, 0:5]
y = df.iloc[:, -1:]

In [55]:
x

Unnamed: 0,age,gender,fever,cough,city
0,60,Male,103.0,Mild,Kolkata
1,27,Male,100.0,Mild,Delhi
2,42,Male,101.0,Mild,Delhi
3,31,Female,98.0,Mild,Kolkata
4,65,Female,101.0,Mild,Mumbai
...,...,...,...,...,...
95,12,Female,104.0,Mild,Bangalore
96,51,Female,101.0,Strong,Kolkata
97,20,Female,101.0,Mild,Bangalore
98,5,Female,98.0,Strong,Mumbai


In [56]:
y

Unnamed: 0,has_covid
0,No
1,Yes
2,No
3,No
4,No
...,...
95,No
96,Yes
97,No
98,No


In [25]:
from sklearn.model_selection import train_test_split

In [26]:
x_train,x_test,y_train,y_test = train_test_split (x,y, test_size = 0.2)

In [27]:
x_train

Unnamed: 0,gender,fever,cough,city,has_covid
97,Female,101.0,Mild,Bangalore,No
86,Male,104.0,Mild,Bangalore,Yes
32,Female,101.0,Strong,Delhi,Yes
39,Female,103.0,Mild,Kolkata,No
0,Male,103.0,Mild,Kolkata,No
...,...,...,...,...,...
14,Male,104.0,Mild,Bangalore,No
19,Female,,Strong,Bangalore,Yes
78,Male,100.0,Mild,Bangalore,Yes
9,Female,101.0,Mild,Delhi,No


In [28]:
x_test

Unnamed: 0,gender,fever,cough,city,has_covid
91,Male,,Mild,Delhi,Yes
31,Male,103.0,Mild,Kolkata,No
25,Male,,Mild,Mumbai,No
47,Female,104.0,Mild,Bangalore,No
58,Male,98.0,Strong,Mumbai,Yes
66,Male,104.0,Mild,Kolkata,No
4,Female,101.0,Mild,Mumbai,No
70,Female,101.0,Strong,Delhi,No
57,Female,99.0,Strong,Bangalore,No
49,Male,104.0,Mild,Mumbai,No


In [29]:
y_train

Unnamed: 0,age
97,20
86,25
32,34
39,50
0,60
...,...
14,51
19,42
78,11
9,64


In [30]:
y_test

Unnamed: 0,age
91,38
31,83
25,23
47,18
58,23
66,51
4,65
70,68
57,49
49,44


In [31]:
ct = SimpleImputer()
x_train_fever = ct.fit_transform(x_train[['fever']])

# also the test data
x_test_fever = ct.fit_transform(x_test[['fever']])
                                 
x_train_fever.shape


(80, 1)

In [32]:
x_train_fever

array([[101.        ],
       [104.        ],
       [101.        ],
       [103.        ],
       [103.        ],
       [103.        ],
       [ 99.        ],
       [100.        ],
       [100.        ],
       [100.        ],
       [101.        ],
       [100.73611111],
       [100.        ],
       [103.        ],
       [ 98.        ],
       [100.        ],
       [100.73611111],
       [104.        ],
       [ 98.        ],
       [104.        ],
       [100.        ],
       [ 98.        ],
       [101.        ],
       [101.        ],
       [ 99.        ],
       [102.        ],
       [101.        ],
       [100.        ],
       [100.        ],
       [101.        ],
       [100.73611111],
       [ 98.        ],
       [104.        ],
       [ 98.        ],
       [103.        ],
       [101.        ],
       [ 98.        ],
       [ 98.        ],
       [100.        ],
       [104.        ],
       [103.        ],
       [104.        ],
       [ 99.        ],
       [100

In [33]:
oe = OrdinalEncoder(categories=[['Mild','Strong']])
x_train_cough = oe.fit_transform(x_train[['cough']])
x_test_cough = oe.fit_transform(x_test[['cough']])
x_train_cough.shape

(80, 1)

In [34]:
x_train_cough

array([[0.],
       [0.],
       [1.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [1.],
       [1.],
       [0.],
       [0.],
       [1.],
       [1.],
       [1.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [0.],
       [1.],
       [1.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [1.],
       [1.],
       [0.],
       [1.],
       [1.],
       [1.],
       [0.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [0.],
       [1.],
       [1.],
       [0.],
       [0.],
       [1.],
       [0.],
       [1.],
       [1.],
       [0.],
       [0.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [0.],
       [1.],
       [0.],
       [1.],
       [0.],
       [1.],
       [0.],
       [1.],
       [0.],
       [1.],
       [0.],
       [1.],

In [36]:
ohe = OneHotEncoder(drop='first',sparse_output=False)
x_train_gender_city = ohe.fit_transform(x_train[['gender','city']])
x_test_gender_city = ohe.fit_transform(x_test[['gender','city']])

x_train_gender_city.shape

(80, 4)

In [37]:
x_train_gender_city

array([[0., 0., 0., 0.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [1., 0., 1., 0.],
       [1., 0., 1., 0.],
       [1., 1., 0., 0.],
       [1., 1., 0., 0.],
       [1., 0., 1., 0.],
       [1., 1., 0., 0.],
       [0., 0., 0., 1.],
       [0., 0., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 0.],
       [1., 0., 1., 0.],
       [0., 0., 1., 0.],
       [1., 1., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 1., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 0., 0.],
       [1., 1., 0., 0.],
       [1., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 1., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],


In [65]:
x_train_age = x_train.drop(columns=['gender','fever','cough','city']).values

x_test_age = x_test.drop(columns=['gender','fever','cough','city']).values

x_train_age.shape

(80, 1)

In [59]:
from sklearn.compose import ColumnTransformer

In [66]:
transformers = ColumnTransformer(transformers = [
    ('tnf1' = SimpleImputer(), ['fever']),
    ('tnf2' = OrdinalEncoder(),(categories= [['Mild', 'Strong']],['cough']),
    ('tnf3' = OneHotEncoder(), (sparse_output = False,drop = 'first'),['gender','city'])],
remainder='passthrough')

SyntaxError: closing parenthesis ']' does not match opening parenthesis '(' on line 3 (66628583.py, line 4)

In [67]:
transformer = ColumnTransformer(transformers=[
    ('tnf1',SimpleImputer(),['fever']),
    ('tnf2',OrdinalEncoder(categories=[['Mild','Strong']]),['cough']),
    ('tnf3',OneHotEncoder(sparse_output=False,drop='first'),['gender','city'])
],remainder='passthrough')

In [68]:
transformer

In [70]:
transformer.fit_transform(x_train).shape

(80, 7)

In [73]:
transformer.transform(x_test).shape

(20, 7)