In [2]:
# Step 4: Import the required packages and read the data from patient.csv into
# a DataFrame

import pandas as pd
import numpy as np

# Load the patient records from the CSV file in the working directory,
# then preview the first five rows.
data = pd.read_csv(filepath_or_buffer='patient.csv')
data.head(n=5)

Unnamed: 0,id,sex,birth_year,country,region,group,infection_reason,infection_order,infected_by,contact_number,confirmed_date,released_date,deceased_date,state
0,1,female,1984.0,China,filtered at airport,,visit to Wuhan,1.0,,45.0,2020-01-20,2020-02-06,,released
1,2,male,1964.0,Korea,filtered at airport,,visit to Wuhan,1.0,,75.0,2020-01-24,2020-02-05,,released
2,3,male,1966.0,Korea,capital area,,visit to Wuhan,1.0,,16.0,2020-01-26,2020-02-12,,released
3,4,male,1964.0,Korea,capital area,,visit to Wuhan,1.0,,95.0,2020-01-27,2020-02-09,,released
4,5,male,1987.0,Korea,capital area,,visit to Wuhan,1.0,,31.0,2020-01-30,,,isolated


In [3]:
# Step 5: Take a look at the fraction (0-1) of null values in each column

# The mean of the boolean null mask is the per-column count of nulls divided
# by the number of rows.
data.isnull().mean()

id                  0.000000
sex                 0.924501
birth_year          0.930674
country             0.000000
region              0.927588
group               0.981956
infection_reason    0.969136
infection_order     0.991690
infected_by         0.985280
contact_number      0.992403
confirmed_date      0.000000
released_date       0.993352
deceased_date       0.996914
state               0.000000
dtype: float64

In [4]:
# Step 6: Replace every occurrence of '0', empty string, blank and 'NULL' with np.nan
#
# '' covers a truly empty string and ' ' a single blank; the original list only
# held ' ', so an empty string was never matched. The result is rebound to
# `data` instead of being mutated in place, so re-running the cell is safe.
# NOTE(review): numeric zeros are not in the list -- confirm whether a numeric 0
# should also be treated as missing.

data = data.replace(to_replace=['0', '', ' ', 'NULL'], value=np.nan)

In [5]:
# Step 7: Extract all numeric data and check the amount of null values

# Select by what is wanted (numeric dtypes) rather than by excluding 'object':
# excluding only 'object' would also let non-numeric dtypes such as bool,
# datetime or category through.
numeric_data = data.select_dtypes(include='number')

# Count of null values per numeric column.
numeric_data.isnull().sum()

id                    0
birth_year         3920
infection_order    4177
infected_by        4150
contact_number     4180
dtype: int64

In [6]:
# Step 8: Drop every row with null values and check the shape of data after that

# Keep only the rows in which no column is null.
not_na_data = numeric_data.dropna(axis='index', how='any')
not_na_data.shape

(15, 5)

In [7]:
# Step 9: Drop every column with null values and check the shape of data after that

# Nothing is assigned here; only the shape of the column-pruned copy is shown.
numeric_data.dropna(axis='columns', how='any').shape

(4212, 1)

In [8]:
# Step 10: Fill every null value with 0 and take a look at the head of data

# fillna returns a new frame; `numeric_data` itself is left unchanged.
numeric_data.fillna(value=0).head(n=5)

Unnamed: 0,id,birth_year,infection_order,infected_by,contact_number
0,1,1984.0,1.0,0.0,45.0
1,2,1964.0,1.0,0.0,75.0
2,3,1966.0,1.0,0.0,16.0
3,4,1964.0,1.0,0.0,95.0
4,5,1987.0,1.0,0.0,31.0


In [9]:
# Step 11: Fill every null value with mean of that column and take a look at the
# number of null values after that

# The Series of per-column means is aligned to the frame by column label, so
# each column's nulls are filled with that column's own mean.
mean_filled = numeric_data.fillna(value=numeric_data.mean(axis=0))

# Remaining null count per column.
mean_filled.isna().sum()

id                 0
birth_year         0
infection_order    0
infected_by        0
contact_number     0
dtype: int64