In [38]:
import numpy as np
import pandas as pd

In [39]:
df = pd.read_csv("covid_toy.csv")

In [40]:
df.head()

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No


In [41]:
df.isnull().sum()

age           0
gender        0
fever        10
cough         0
city          0
has_covid     0
dtype: int64

In [42]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

In [43]:
X_train,X_test,y_train,y_test = train_test_split(df.drop(columns=['has_covid']),df['has_covid'])

In [44]:
X_train.head()

Unnamed: 0,age,gender,fever,cough,city
1,27,Male,100.0,Mild,Delhi
0,60,Male,103.0,Mild,Kolkata
43,22,Female,99.0,Mild,Bangalore
27,33,Female,102.0,Strong,Delhi
71,75,Female,104.0,Strong,Delhi


# 1. Aam Zindagi

In [45]:
si = SimpleImputer()

X_train_imputed = si.fit_transform(X_train[['fever']])
X_test_imputed = si.transform(X_test[['fever']])

X_train_imputed = pd.DataFrame(X_train_imputed, columns=['fever'])
X_test_imputed = pd.DataFrame(X_test_imputed, columns=['fever'])

In [47]:
oe = OrdinalEncoder(categories=[['Mild','Strong']])
X_train_ord = pd.DataFrame(oe.fit_transform(X_train[['cough']]),columns=['cough'])
X_test_ord = pd.DataFrame(oe.transform(X_test[['cough']]),columns = ['cough'])


In [50]:
ohe = OneHotEncoder(drop='first',sparse_output=False)

X_train_ohe = pd.DataFrame(ohe.fit_transform(X_train[['gender','city']]),columns=ohe.get_feature_names_out(['gender','city']))
X_test_ohe = pd.DataFrame(ohe.transform(X_test[['gender','city']]),columns=ohe.get_feature_names_out(['gender','city']))



In [54]:
X_train_final = pd.concat([X_train_imputed,X_train_ord,X_train_ohe],axis=1)
X_test_final = pd.concat([X_test_imputed,X_test_ord,X_test_ohe],axis=1)

In [56]:
X_train_final

Unnamed: 0,fever,cough,gender_Male,city_Delhi,city_Kolkata,city_Mumbai
0,100.000000,0.0,1.0,1.0,0.0,0.0
1,103.000000,0.0,1.0,0.0,1.0,0.0
2,99.000000,0.0,0.0,0.0,0.0,0.0
3,102.000000,1.0,0.0,1.0,0.0,0.0
4,104.000000,1.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...
70,99.000000,0.0,0.0,0.0,0.0,1.0
71,100.884058,0.0,1.0,0.0,0.0,1.0
72,98.000000,0.0,0.0,0.0,0.0,1.0
73,102.000000,0.0,1.0,0.0,1.0,0.0


# Mentos Zindagi

In [61]:
from sklearn.compose import ColumnTransformer

In [62]:
df = pd.read_csv('covid_toy.csv')

In [63]:
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=["has_covid"]), df["has_covid"]
)

In [64]:
X_train.head()

Unnamed: 0,age,gender,fever,cough,city
78,11,Male,100.0,Mild,Bangalore
27,33,Female,102.0,Strong,Delhi
15,70,Male,103.0,Strong,Kolkata
43,22,Female,99.0,Mild,Bangalore
20,12,Male,98.0,Strong,Bangalore


In [68]:
transformer = ColumnTransformer(
    transformers=[
        ("tnf1", SimpleImputer(), ["fever"]),
        ("tnf2", OrdinalEncoder(categories=[["Mild", "Strong"]]), ["cough"]),
        ("tnf3", OneHotEncoder(sparse_output=False, drop="first"), ["gender", "city"]),
    ],
    remainder="passthrough",
)

In [76]:
X_train_ct = pd.DataFrame(transformer.fit_transform(X_train),columns = transformer.get_feature_names_out())
X_test_ct = pd.DataFrame(transformer.fit_transform(X_test),columns = transformer.get_feature_names_out())

In [73]:
X_train.columns

Index(['age', 'gender', 'fever', 'cough', 'city'], dtype='object')

In [77]:
X_test_ct

Unnamed: 0,tnf1__fever,tnf2__cough,tnf3__gender_Male,tnf3__city_Delhi,tnf3__city_Kolkata,tnf3__city_Mumbai,remainder__age
0,98.0,1.0,0.0,0.0,0.0,1.0,69.0
1,101.0,0.0,0.0,1.0,0.0,0.0,49.0
2,100.0,0.0,0.0,0.0,1.0,0.0,19.0
3,103.0,0.0,0.0,0.0,1.0,0.0,50.0
4,98.0,1.0,0.0,0.0,0.0,1.0,5.0
5,104.0,1.0,0.0,0.0,1.0,0.0,54.0
6,100.0,0.0,1.0,0.0,1.0,0.0,55.0
7,101.0,0.0,1.0,1.0,0.0,0.0,15.0
8,98.0,0.0,1.0,0.0,1.0,0.0,24.0
9,100.0,0.0,1.0,0.0,0.0,0.0,80.0
