# Analysis of COVID-19 confirmed cases in South Korea
### Goal: determine which age group and gender were most affected, and check whether there is a clear difference in ‘number of infections’ or ‘death rate’ between age groups or genders.

In [1]:
# Dependencies

import os

import numpy as np
import pandas as pd
from sqlalchemy import create_engine, inspect

---------------------------------------------------------------------------------------------------------------

### Store CSV into DataFrame

In [2]:
# Read the raw patient records from the CSV file and preview the first rows
csv_file = "Resources/PatientInfo.csv"
patient_info_df = pd.read_csv(filepath_or_buffer=csv_file)
patient_info_df.head(5)

Unnamed: 0,patient_id,sex,age,country,province,city,infection_case,infected_by,contact_number,symptom_onset_date,confirmed_date,released_date,deceased_date,state
0,1000000001,male,50s,Korea,Seoul,Gangseo-gu,overseas inflow,,75,2020-01-22,2020-01-23,2020-02-05,,released
1,1000000002,male,30s,Korea,Seoul,Jungnang-gu,overseas inflow,,31,,2020-01-30,2020-03-02,,released
2,1000000003,male,50s,Korea,Seoul,Jongno-gu,contact with patient,2002000001.0,17,,2020-01-30,2020-02-19,,released
3,1000000004,male,20s,Korea,Seoul,Mapo-gu,overseas inflow,,9,2020-01-26,2020-01-30,2020-02-15,,released
4,1000000005,female,20s,Korea,Seoul,Seongbuk-gu,contact with patient,1000000002.0,2,,2020-01-31,2020-02-24,,released


---------------------------------------------------------------------------------------------------------------

### Create a new DataFrame with selected columns

In [3]:
# Inspect the column names available in the raw DataFrame
patient_info_df.columns

Index(['patient_id', 'sex', 'age', 'country', 'province', 'city',
       'infection_case', 'infected_by', 'contact_number', 'symptom_onset_date',
       'confirmed_date', 'released_date', 'deceased_date', 'state'],
      dtype='object')

In [4]:
# Build an independent copy of the raw data restricted to the columns below
selected_columns = [
    'patient_id', 'sex', 'age', 'province', 'city',
    'confirmed_date', 'released_date', 'state',
]

new_patient_info_df = patient_info_df[selected_columns].copy()
new_patient_info_df

Unnamed: 0,patient_id,sex,age,province,city,confirmed_date,released_date,state
0,1000000001,male,50s,Seoul,Gangseo-gu,2020-01-23,2020-02-05,released
1,1000000002,male,30s,Seoul,Jungnang-gu,2020-01-30,2020-03-02,released
2,1000000003,male,50s,Seoul,Jongno-gu,2020-01-30,2020-02-19,released
3,1000000004,male,20s,Seoul,Mapo-gu,2020-01-30,2020-02-15,released
4,1000000005,female,20s,Seoul,Seongbuk-gu,2020-01-31,2020-02-24,released
...,...,...,...,...,...,...,...,...
5160,7000000015,female,30s,Jeju-do,Jeju-do,2020-05-30,2020-06-13,released
5161,7000000016,,,Jeju-do,Jeju-do,2020-06-16,2020-06-24,released
5162,7000000017,,,Jeju-do,Jeju-do,2020-06-18,,isolated
5163,7000000018,,,Jeju-do,Jeju-do,2020-06-18,,isolated


---------------------------------------------------------------------------------------------------------------

### Clean DataFrame

In [5]:
# Replace every NaN value with 0.
# NOTE(review): the fill is applied to all columns, so text and date columns
# such as sex, age and released_date end up holding the integer 0 - confirm
# this is what the target table expects.
new_patient_info_df = new_patient_info_df.fillna(0)

In [6]:
# Checking for null values in DataFrame
# (info() prints the non-null count and dtype of every column)
new_patient_info_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5165 entries, 0 to 5164
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   patient_id      5165 non-null   int64 
 1   sex             5165 non-null   object
 2   age             5165 non-null   object
 3   province        5165 non-null   object
 4   city            5165 non-null   object
 5   confirmed_date  5165 non-null   object
 6   released_date   5165 non-null   object
 7   state           5165 non-null   object
dtypes: int64(1), object(7)
memory usage: 322.9+ KB


In [7]:
# Flag rows whose patient_id repeats an earlier row; the first occurrence of
# each id is not flagged, only the later repeats are shown
repeated_id_mask = new_patient_info_df["patient_id"].duplicated()
new_patient_info_df[repeated_id_mask]

Unnamed: 0,patient_id,sex,age,province,city,confirmed_date,released_date,state
1555,1200012238,female,20s,Daegu,Nam-gu,2020-06-17,0,isolated


In [8]:
# Find original and duplicate record.
# patient_id is an int64 column, so compare it directly for an exact match
# instead of casting every id to text and running a substring search.
find = new_patient_info_df[new_patient_info_df['patient_id'] == 1200012238]
find

Unnamed: 0,patient_id,sex,age,province,city,confirmed_date,released_date,state
1547,1200012238,female,20s,Daegu,Icheon-dong,2020-06-17,0,isolated
1555,1200012238,female,20s,Daegu,Nam-gu,2020-06-17,0,isolated


In [9]:
# Exclude every row whose patient_id appears more than once.
# keep=False marks all occurrences, so both of the conflicting records for
# the repeated id are left out rather than keeping one of them.
shared_id_mask = new_patient_info_df.duplicated(subset="patient_id", keep=False)
cleaned_info = new_patient_info_df[~shared_id_mask]
cleaned_info

Unnamed: 0,patient_id,sex,age,province,city,confirmed_date,released_date,state
0,1000000001,male,50s,Seoul,Gangseo-gu,2020-01-23,2020-02-05,released
1,1000000002,male,30s,Seoul,Jungnang-gu,2020-01-30,2020-03-02,released
2,1000000003,male,50s,Seoul,Jongno-gu,2020-01-30,2020-02-19,released
3,1000000004,male,20s,Seoul,Mapo-gu,2020-01-30,2020-02-15,released
4,1000000005,female,20s,Seoul,Seongbuk-gu,2020-01-31,2020-02-24,released
...,...,...,...,...,...,...,...,...
5160,7000000015,female,30s,Jeju-do,Jeju-do,2020-05-30,2020-06-13,released
5161,7000000016,0,0,Jeju-do,Jeju-do,2020-06-16,2020-06-24,released
5162,7000000017,0,0,Jeju-do,Jeju-do,2020-06-18,0,isolated
5163,7000000018,0,0,Jeju-do,Jeju-do,2020-06-18,0,isolated


In [None]:
# Replacing 'Nan' values with '0'

# new_patient_info_df.replace(np.nan, 0, inplace=True)
# new_patient_info_df.head(15)

---------------------------------------------------------------------------------------------------------------

In [None]:
# grouped_patients = new_patient_info_df.groupby(['sex', 'age'])
# grouped_patients.count()

---------------------------------------------------------------------------------------------------------------

### Connect to local database

In [10]:
# Read the connection target from the ETL_DB_CONNECTION environment variable
# (format: "user:password@host:port/database") so real credentials never have
# to be written into the notebook; the fallback is the local development
# database this notebook was originally run against.
connection_string = os.environ.get(
    "ETL_DB_CONNECTION", "postgres:postgres@localhost:5432/etl_project"
)
engine = create_engine(f'postgresql://{connection_string}')

In [11]:
# Column names of the cleaned DataFrame that is written to the database below
cleaned_info.columns

Index(['patient_id', 'sex', 'age', 'province', 'city', 'confirmed_date',
       'released_date', 'state'],
      dtype='object')

In [12]:
# Confirm table(s) in local database.
# Engine.table_names() is deprecated in SQLAlchemy 1.4 and removed in 2.0;
# the Inspector API returns the same list of table names.
inspect(engine).get_table_names()

['patient_info']

---------------------------------------------------------------------------------------------------------------

### Load DataFrame(s) into database

In [13]:
# Write the cleaned rows into the patient_info table without the index column.
# if_exists='append' adds to whatever the table already holds, so running
# this cell again attempts to insert the same rows a second time.
cleaned_info.to_sql(
    name='patient_info',
    con=engine,
    if_exists='append',
    index=False,
)

---------------------------------------------------------------------------------------------------------------