# **Preprocessing Notebook**

This notebook handles:


 1. Load and clean the dataset

 2. Handle missing values

 3. Drop unnecessary columns




# 1. Load and Clean Dataset




Import Libraries

In [1]:
import pandas as pd
import numpy as np

Load Dataset

In [2]:
# Read the raw COVID-19 patient table from its relative path.
# NOTE(review): recent pandas versions parse the literal string "None" as a
# missing value by default; if the raw file uses "None" as a real category,
# it will appear as NaN below. Confirm against the CSV.
df = pd.read_csv(filepath_or_buffer="../data/Covid-19 Dataset.csv")
# Preview the first 15 rows.
df.head(n=15)


Unnamed: 0,Patient_ID,Age,Gender,Region,Preexisting_Condition,Date_of_Infection,COVID_Strain,Symptoms,Severity,Hospitalized,...,Date_of_Reinfection,Vaccination_Status,Vaccine_Type,Doses_Received,Date_of_Last_Dose,Long_COVID_Symptoms,Occupation,Smoking_Status,BMI,Recovery_Classification
0,1,69,Male,Hovedstaden,Obesity,2022-06-21,Delta,Mild,Moderate,Yes,...,2022-12-15,Yes,,1,2022-09-22,,Healthcare,Never,27.7,Delayed Recovery
1,2,38,Male,Sjælland,Asthma,2024-02-02,XBB.1.5,Mild,Moderate,No,...,2024-06-08,No,,0,2023-08-21,,Healthcare,Never,21.9,Typical Recovery
2,3,41,Female,Syddanmark,Hypertension,2023-05-28,Beta,Mild,High,Yes,...,2023-12-19,Yes,Janssen,3,2024-05-14,,Unemployed,Never,22.7,Delayed Recovery
3,4,81,Female,Hovedstaden,Asthma,2023-08-13,Delta,Severe,High,No,...,2024-08-24,Yes,AstraZeneca,1,2024-10-31,,Office Worker,Never,27.7,Delayed Recovery
4,5,50,Female,Syddanmark,Cardiovascular,2023-03-10,Delta,Mild,High,No,...,2023-09-08,Yes,,2,2023-07-05,,Student,Never,11.9,Delayed Recovery
5,6,66,Male,Sjælland,Cardiovascular,2022-07-04,Omicron,Moderate,Moderate,No,...,2023-02-22,Yes,AstraZeneca,3,2025-03-07,,Healthcare,Never,29.8,Delayed Recovery
6,7,76,Female,Sjælland,Obesity,2023-04-30,Omicron,Moderate,Critical,Yes,...,2023-08-04,No,,0,2022-10-12,,Unemployed,Former,22.3,Delayed Recovery
7,8,77,Female,Sjælland,Diabetes,2024-03-25,XBB.1.5,Moderate,Low,No,...,2024-09-02,Yes,Janssen,3,2025-05-08,,Driver,Former,24.4,Delayed Recovery
8,9,79,Female,Nordjylland,Hypertension,2023-11-03,XBB.1.5,Mild,Low,No,...,2024-05-13,No,,0,2023-09-01,,Healthcare,Former,26.1,Typical Recovery
9,10,72,Female,Sjælland,Cardiovascular,2023-01-11,Alpha,Severe,High,No,...,2023-06-13,Yes,AstraZeneca,1,2024-08-21,,Unemployed,Current,21.2,Delayed Recovery


Inspect the dataset's structure (column names, non-null counts, and dtypes)

In [3]:
# Print each column's non-null count and dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 27 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Patient_ID               3000 non-null   int64  
 1   Age                      3000 non-null   int64  
 2   Gender                   3000 non-null   object 
 3   Region                   3000 non-null   object 
 4   Preexisting_Condition    2531 non-null   object 
 5   Date_of_Infection        3000 non-null   object 
 6   COVID_Strain             3000 non-null   object 
 7   Symptoms                 3000 non-null   object 
 8   Severity                 3000 non-null   object 
 9   Hospitalized             3000 non-null   object 
 10  Hospital_Admission_Date  2850 non-null   object 
 11  Hospital_Discharge_Date  2850 non-null   object 
 12  ICU_Admission            3000 non-null   object 
 13  Ventilator_Support       3000 non-null   object 
 14  Recovered               

In [4]:
# Show the column labels of the loaded frame.
df.columns

Index(['Patient_ID', 'Age', 'Gender', 'Region', 'Preexisting_Condition',
       'Date_of_Infection', 'COVID_Strain', 'Symptoms', 'Severity',
       'Hospitalized', 'Hospital_Admission_Date', 'Hospital_Discharge_Date',
       'ICU_Admission', 'Ventilator_Support', 'Recovered', 'Date_of_Recovery',
       'Reinfection', 'Date_of_Reinfection', 'Vaccination_Status',
       'Vaccine_Type', 'Doses_Received', 'Date_of_Last_Dose',
       'Long_COVID_Symptoms', 'Occupation', 'Smoking_Status', 'BMI',
       'Recovery_Classification'],
      dtype='object')

In [5]:
# (number of rows, number of columns)
df.shape

(3000, 27)

In [6]:
# Summary statistics; with default arguments only the numeric columns are described.
df.describe()

Unnamed: 0,Patient_ID,Age,Doses_Received,BMI
count,3000.0,3000.0,3000.0,3000.0
mean,1500.5,53.944,0.981667,25.0965
std,866.169729,20.872919,1.154025,4.898435
min,1.0,18.0,0.0,10.2
25%,750.75,36.0,0.0,21.8
50%,1500.5,54.0,0.0,25.1
75%,2250.25,72.0,2.0,28.5
max,3000.0,89.0,3.0,44.6


Check for duplicate rows

In [7]:
# Count rows that exactly repeat an earlier row (all columns are compared).
# NOTE(review): Patient_ID takes part in the comparison; if every row has its own
# ID, this check can never flag a repeated record. Consider excluding it via
# `subset=` — confirm intent.
df.duplicated().sum()


0

# 2. Handle missing values

In [8]:
# Number of missing entries in each column.
df.isna().sum()

Patient_ID                    0
Age                           0
Gender                        0
Region                        0
Preexisting_Condition       469
Date_of_Infection             0
COVID_Strain                  0
Symptoms                      0
Severity                      0
Hospitalized                  0
Hospital_Admission_Date     150
Hospital_Discharge_Date     150
ICU_Admission                 0
Ventilator_Support            0
Recovered                     0
Date_of_Recovery            150
Reinfection                   0
Date_of_Reinfection         150
Vaccination_Status            0
Vaccine_Type               1809
Doses_Received                0
Date_of_Last_Dose           150
Long_COVID_Symptoms        2780
Occupation                    0
Smoking_Status                0
BMI                           0
Recovery_Classification     150
dtype: int64

In [9]:
# Drop the two columns with the most missing entries in the count above
# (1809 and 2780 of 3000 rows).
# Reassigning instead of using `inplace=True`, together with errors='ignore',
# keeps this cell re-runnable: executing it a second time no longer raises
# KeyError for columns that are already gone.
df = df.drop(columns=['Vaccine_Type', 'Long_COVID_Symptoms'], errors='ignore')


In [10]:
# Re-count missing entries per column after the drop.
df.isna().sum()

Patient_ID                   0
Age                          0
Gender                       0
Region                       0
Preexisting_Condition      469
Date_of_Infection            0
COVID_Strain                 0
Symptoms                     0
Severity                     0
Hospitalized                 0
Hospital_Admission_Date    150
Hospital_Discharge_Date    150
ICU_Admission                0
Ventilator_Support           0
Recovered                    0
Date_of_Recovery           150
Reinfection                  0
Date_of_Reinfection        150
Vaccination_Status           0
Doses_Received               0
Date_of_Last_Dose          150
Occupation                   0
Smoking_Status               0
BMI                          0
Recovery_Classification    150
dtype: int64

In [11]:
# Columns that still report missing entries in the count above.
cols = ['Preexisting_Condition','Hospital_Admission_Date','Hospital_Discharge_Date','Date_of_Recovery','Date_of_Reinfection','Date_of_Last_Dose','Recovery_Classification']


def _fill_with_mode(col):
    """Return `col` with missing entries replaced by its most frequent value.

    A column whose entries are all missing has no mode; it is returned
    unchanged instead of failing on the empty `mode()` result.
    """
    modes = col.mode()
    if modes.empty:
        return col
    # When several values tie, the first one returned by mode() is used.
    return col.fillna(modes.iloc[0])


# NOTE(review): the Date_* columns imputed here are removed again in section 3,
# and mode-filling Recovery_Classification invents values for that column —
# if it is later used as a label, dropping those rows may be safer. Confirm.
df[cols] = df[cols].apply(_fill_with_mode)


In [12]:
# Verify that no missing entries remain after imputation.
df.isna().sum()

Patient_ID                 0
Age                        0
Gender                     0
Region                     0
Preexisting_Condition      0
Date_of_Infection          0
COVID_Strain               0
Symptoms                   0
Severity                   0
Hospitalized               0
Hospital_Admission_Date    0
Hospital_Discharge_Date    0
ICU_Admission              0
Ventilator_Support         0
Recovered                  0
Date_of_Recovery           0
Reinfection                0
Date_of_Reinfection        0
Vaccination_Status         0
Doses_Received             0
Date_of_Last_Dose          0
Occupation                 0
Smoking_Status             0
BMI                        0
Recovery_Classification    0
dtype: int64

# 3. Drop unnecessary columns

In [13]:
# Gather the labels of all columns whose name contains 'Date'
# (iterating a DataFrame yields its column labels).
date_columns = [label for label in df if 'Date' in label]


In [14]:
# Remove the date columns listed in `date_columns` together with the Patient_ID column.
# `axis=1` was redundant next to `columns=` and has been removed; errors='ignore'
# lets the cell be re-run after the columns are already gone instead of raising KeyError.
df = df.drop(columns=[*date_columns, 'Patient_ID'], errors='ignore')


In [15]:
# Write the cleaned frame to CSV without the index column.
df.to_csv(path_or_buf="../data/Cleaned_Data.csv", index=False)