In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
pd.set_option?


[1;31mSignature:[0m   [0mpd[0m[1;33m.[0m[0mset_option[0m[1;33m([0m[1;33m*[0m[0margs[0m[1;33m,[0m [1;33m**[0m[0mkwds[0m[1;33m)[0m [1;33m->[0m [1;34m'T'[0m[1;33m[0m[1;33m[0m[0m
[1;31mType:[0m        CallableDynamicDoc
[1;31mString form:[0m <pandas._config.config.CallableDynamicDoc object at 0x000002D3F3BB8F50>
[1;31mFile:[0m        c:\users\dsingh\appdata\local\programs\python\python313\lib\site-packages\pandas\_config\config.py
[1;31mDocstring:[0m  
set_option(pat, value)

Sets the value of the specified option.

Available options:

- compute.[use_bottleneck, use_numba, use_numexpr]
- display.[chop_threshold, colheader_justify, date_dayfirst, date_yearfirst,
  encoding, expand_frame_repr, float_format]
- display.html.[border, table_schema, use_mathjax]
- display.[large_repr, max_categories, max_columns, max_colwidth, max_dir_items,
  max_info_columns, max_info_rows, max_rows, max_seq_items, memory_usage,
  min_rows, multi_sparse, notebook_repr_html

In [5]:
# Keep rendered frames and printed arrays short so cell outputs stay readable.
for option_name, option_value in {'display.max_rows': 20, 'display.max_columns': 10}.items():
    pd.set_option(option_name, option_value)
np.set_printoptions(precision=3, suppress=True, threshold=20)


In [6]:
# Load the toy COVID dataset from the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='covid_toy.csv')

In [7]:
df

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No
...,...,...,...,...,...,...
95,12,Female,104.0,Mild,Bangalore,No
96,51,Female,101.0,Strong,Kolkata,Yes
97,20,Female,101.0,Mild,Bangalore,No
98,5,Female,98.0,Strong,Mumbai,No


In [9]:
# Nominal (unordered) categorical columns: print how often each category occurs.
for nominal_col in ('gender', 'city'):
    print(df[nominal_col].value_counts())

gender
Female    59
Male      41
Name: count, dtype: int64
city
Kolkata      32
Bangalore    30
Delhi        22
Mumbai       16
Name: count, dtype: int64


In [10]:
# 'cough' is the ordinal column; count how many rows fall in each level.
df.loc[:, 'cough'].value_counts()

cough
Mild      62
Strong    38
Name: count, dtype: int64

In [11]:
# Number of missing values in each column.
df.isna().sum()

age           0
gender        0
fever        10
cough         0
city          0
has_covid     0
dtype: int64

In [13]:
from sklearn.model_selection import train_test_split

# Every column except the last is a feature; the last column is the target.
features = df.iloc[:, :-1]
target = df.iloc[:, -1]

# Hold out 20% of the rows; random_state fixes the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=0,
)

In [14]:
X_train

Unnamed: 0,age,gender,fever,cough,city
43,22,Female,99.0,Mild,Bangalore
62,56,Female,104.0,Strong,Bangalore
3,31,Female,98.0,Mild,Kolkata
71,75,Female,104.0,Strong,Delhi
45,72,Male,99.0,Mild,Bangalore
...,...,...,...,...,...
96,51,Female,101.0,Strong,Kolkata
67,65,Male,99.0,Mild,Bangalore
64,42,Male,104.0,Mild,Mumbai
47,18,Female,104.0,Mild,Bangalore


## **1. We are applying the previous day's steps for transforming**

In [15]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder 

In [16]:
# Fill the missing values in the 'fever' column with a SimpleImputer.
si = SimpleImputer()

# Learn the fill value from the training split only.
X_train_fever = si.fit_transform(X_train[['fever']])
# Reuse the already-fitted imputer on the test split. Calling fit_transform here
# would re-learn the fill value from the test rows themselves (data leakage) and
# make the test preprocessing inconsistent with the training preprocessing.
X_test_fever = si.transform(X_test[['fever']])

In [17]:
X_train_fever

array([[ 99.],
       [104.],
       [ 98.],
       ...,
       [104.],
       [104.],
       [102.]], shape=(80, 1))

**ordinal encoding -> cough column**

In [18]:
df['cough'].value_counts()

cough
Mild      62
Strong    38
Name: count, dtype: int64

In [19]:

# Explicit level order for the encoder: 'Mild' comes before 'Strong'.
cough_levels = ['Mild', 'Strong']
oe = OrdinalEncoder(categories=[cough_levels])

In [20]:
# Fit the ordinal encoder on the training split only.
X_train_cough = oe.fit_transform(X_train[['cough']])
# Encode the test split with the already-fitted encoder (transform, not
# fit_transform) so that test data never influences the fitted state.
X_test_cough = oe.transform(X_test[['cough']])

In [21]:
X_train_cough

array([[0.],
       [1.],
       [0.],
       ...,
       [0.],
       [0.],
       [1.]], shape=(80, 1))

**One-hot encoding on the city and gender columns**

In [22]:
# One-hot encode the nominal columns. drop='first' omits the first category of
# each feature; sparse_output=False returns a dense NumPy array.
ohe = OneHotEncoder(sparse_output=False, drop='first')

# Learn the category set from the training split only.
X_train_gender_city = ohe.fit_transform(X_train[['gender', 'city']])
# Reuse the fitted categories on the test split so both matrices share the same
# column layout. Refitting on the test split can change the number and meaning
# of the output columns whenever the test rows contain a different category set.
X_test_gender_city = ohe.transform(X_test[['gender', 'city']])

In [23]:
X_train_gender_city

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 1., 0.],
       ...,
       [1., 0., 0., 1.],
       [0., 0., 0., 0.],
       [1., 1., 0., 0.]], shape=(80, 4))

In [26]:
# These columns get their own imputed/encoded arrays; whatever remains after
# dropping them is kept as a raw NumPy array.
separately_handled = ['fever', 'cough', 'gender', 'city']
X_train_age = X_train.drop(columns=separately_handled).to_numpy()
X_test_age = X_test.drop(columns=separately_handled).to_numpy()

In [27]:
X_train_age

array([[22],
       [56],
       [31],
       ...,
       [42],
       [18],
       [20]], shape=(80, 1))

In [28]:
# Stack the per-column arrays side by side: age, fever, one-hot gender/city, cough.
train_parts = [X_train_age, X_train_fever, X_train_gender_city, X_train_cough]
X_train_transformed = np.hstack(train_parts)

# Same column order for the test split.
test_parts = [X_test_age, X_test_fever, X_test_gender_city, X_test_cough]
X_test_transformed = np.hstack(test_parts)

In [29]:
X_train_transformed

array([[ 22.,  99.,   0., ...,   0.,   0.,   0.],
       [ 56., 104.,   0., ...,   0.,   0.,   1.],
       [ 31.,  98.,   0., ...,   1.,   0.,   0.],
       ...,
       [ 42., 104.,   1., ...,   0.,   1.,   0.],
       [ 18., 104.,   0., ...,   0.,   0.,   0.],
       [ 20., 102.,   1., ...,   0.,   0.,   1.]], shape=(80, 7))

### 2. **We are applying the ColumnTransformer, which does all the steps in one shot**

In [31]:
from sklearn.compose import ColumnTransformer

In [32]:
# One (name, transformer, columns) entry per preprocessing step.
column_steps = [
    ('tnf1', SimpleImputer(), ['fever']),
    ('tnf2', OneHotEncoder(drop='first', sparse_output=False), ['gender', 'city']),
    ('tnf3', OrdinalEncoder(categories=[['Mild', 'Strong']]), ['cough']),
]

# Columns not named in any step are passed through unchanged.
ct = ColumnTransformer(transformers=column_steps, remainder='passthrough')

In [33]:
# Fit every step on the training split and transform it in the same call.
transformed_X_train = ct.fit_transform(X=X_train)

In [34]:
transformed_X_train

array([[ 99.,   0.,   0., ...,   0.,   0.,  22.],
       [104.,   0.,   0., ...,   0.,   1.,  56.],
       [ 98.,   0.,   0., ...,   0.,   0.,  31.],
       ...,
       [104.,   1.,   0., ...,   1.,   0.,  42.],
       [104.,   0.,   0., ...,   0.,   0.,  18.],
       [102.,   1.,   1., ...,   0.,   1.,  20.]], shape=(80, 7))

In [35]:
# Apply the transformer that was fitted on the training split. Calling
# fit_transform here would refit the imputer and encoders on the test data,
# leaking test statistics and possibly changing the output column layout.
transformed_X_test = ct.transform(X_test)

In [36]:
transformed_X_test

array([[100.,   0.,   0., ...,   0.,   0.,  19.],
       [104.,   1.,   0., ...,   0.,   0.,  25.],
       [101.,   1.,   1., ...,   0.,   0.,  42.],
       ...,
       [100.,   0.,   0., ...,   0.,   1.,  13.],
       [ 98.,   0.,   0., ...,   0.,   0.,  26.],
       [100.,   0.,   0., ...,   0.,   1.,  19.]], shape=(20, 7))

In [None]:
###from sklearn.compose import ColumnTransformer, make_column_selector
###from sklearn.impute import SimpleImputer
###from sklearn.preprocessing import OneHotEncoder,   OrdinalEncoder
###
###ct = ColumnTransformer(
###    transformers=[
###        ('num', SimpleImputer(), make_column_selector(dtype_include='number')),
###        ('cat', OneHotEncoder(sparse_output=False, drop='first'), make_column_selector(dtype_include='object'))
###    ],
###    remainder='passthrough'
###)
