In [16]:
# Dependencies

import pandas as pd
import numpy as np
from sqlalchemy import create_engine

---------------------------------------------------------------------------------------------------------------

### Store CSV into DataFrame

In [17]:
# Store CSV into DataFrame
csv_file = "Resources/PatientInfo.csv"
patient_info_df = pd.read_csv(csv_file)
patient_info_df.head()

Unnamed: 0,patient_id,sex,age,country,province,city,infection_case,infected_by,contact_number,symptom_onset_date,confirmed_date,released_date,deceased_date,state
0,1000000001,male,50s,Korea,Seoul,Gangseo-gu,overseas inflow,,75,2020-01-22,2020-01-23,2020-02-05,,released
1,1000000002,male,30s,Korea,Seoul,Jungnang-gu,overseas inflow,,31,,2020-01-30,2020-03-02,,released
2,1000000003,male,50s,Korea,Seoul,Jongno-gu,contact with patient,2002000001.0,17,,2020-01-30,2020-02-19,,released
3,1000000004,male,20s,Korea,Seoul,Mapo-gu,overseas inflow,,9,2020-01-26,2020-01-30,2020-02-15,,released
4,1000000005,female,20s,Korea,Seoul,Seongbuk-gu,contact with patient,1000000002.0,2,,2020-01-31,2020-02-24,,released


---------------------------------------------------------------------------------------------------------------

### Create new data with select columns

In [18]:
patient_info_df.columns

Index(['patient_id', 'sex', 'age', 'country', 'province', 'city',
       'infection_case', 'infected_by', 'contact_number', 'symptom_onset_date',
       'confirmed_date', 'released_date', 'deceased_date', 'state'],
      dtype='object')

In [19]:
# new DF with select columns

new_patient_info_df = patient_info_df[['patient_id', 'sex', 'age', 'province', 'city', \
                                       'confirmed_date', 'released_date', 'state']].copy()
new_patient_info_df.head(15)

Unnamed: 0,patient_id,sex,age,province,city,confirmed_date,released_date,state
0,1000000001,male,50s,Seoul,Gangseo-gu,2020-01-23,2020-02-05,released
1,1000000002,male,30s,Seoul,Jungnang-gu,2020-01-30,2020-03-02,released
2,1000000003,male,50s,Seoul,Jongno-gu,2020-01-30,2020-02-19,released
3,1000000004,male,20s,Seoul,Mapo-gu,2020-01-30,2020-02-15,released
4,1000000005,female,20s,Seoul,Seongbuk-gu,2020-01-31,2020-02-24,released
5,1000000006,female,50s,Seoul,Jongno-gu,2020-01-31,2020-02-19,released
6,1000000007,male,20s,Seoul,Jongno-gu,2020-01-31,2020-02-10,released
7,1000000008,male,20s,Seoul,etc,2020-02-02,2020-02-24,released
8,1000000009,male,30s,Seoul,Songpa-gu,2020-02-05,2020-02-21,released
9,1000000010,female,60s,Seoul,Seongbuk-gu,2020-02-05,2020-02-29,released


---------------------------------------------------------------------------------------------------------------

### Clean DataFrame

In [20]:
# Checking for null values in DataFrame
new_patient_info_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5165 entries, 0 to 5164
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   patient_id      5165 non-null   int64 
 1   sex             4043 non-null   object
 2   age             3785 non-null   object
 3   province        5165 non-null   object
 4   city            5071 non-null   object
 5   confirmed_date  5162 non-null   object
 6   released_date   1587 non-null   object
 7   state           5165 non-null   object
dtypes: int64(1), object(7)
memory usage: 322.9+ KB


In [21]:
# Replacing 'Nan' values with '0'
new_patient_info_df.replace(np.nan, 0, inplace=True)
new_patient_info_df.head(15)

Unnamed: 0,patient_id,sex,age,province,city,confirmed_date,released_date,state
0,1000000001,male,50s,Seoul,Gangseo-gu,2020-01-23,2020-02-05,released
1,1000000002,male,30s,Seoul,Jungnang-gu,2020-01-30,2020-03-02,released
2,1000000003,male,50s,Seoul,Jongno-gu,2020-01-30,2020-02-19,released
3,1000000004,male,20s,Seoul,Mapo-gu,2020-01-30,2020-02-15,released
4,1000000005,female,20s,Seoul,Seongbuk-gu,2020-01-31,2020-02-24,released
5,1000000006,female,50s,Seoul,Jongno-gu,2020-01-31,2020-02-19,released
6,1000000007,male,20s,Seoul,Jongno-gu,2020-01-31,2020-02-10,released
7,1000000008,male,20s,Seoul,etc,2020-02-02,2020-02-24,released
8,1000000009,male,30s,Seoul,Songpa-gu,2020-02-05,2020-02-21,released
9,1000000010,female,60s,Seoul,Seongbuk-gu,2020-02-05,2020-02-29,released


---------------------------------------------------------------------------------------------------------------

In [22]:
# grouped_patients = new_patient_info_df.groupby(['sex', 'age'])
# grouped_patients.count()

---------------------------------------------------------------------------------------------------------------

### Connect to local database

In [23]:
connection_string = "postgres:postgres@localhost:5432/etl_project"
engine = create_engine(f'postgresql://{connection_string}')

In [24]:
new_patient_info_df.columns

Index(['patient_id', 'sex', 'age', 'province', 'city', 'confirmed_date',
       'released_date', 'state'],
      dtype='object')

In [25]:
# Confirm table(s) in local database
engine.table_names()

['patient_info']

---------------------------------------------------------------------------------------------------------------

### Load DataFrame(s) into database

In [26]:
new_patient_info_df.to_sql(name='patient_info', con=engine, if_exists='append', index=True)

ProgrammingError: (psycopg2.errors.UndefinedColumn) column "index" of relation "patient_info" does not exist
LINE 1: INSERT INTO patient_info (index, patient_id, sex, age, provi...
                                  ^

[SQL: INSERT INTO patient_info (index, patient_id, sex, age, province, city, confirmed_date, released_date, state) VALUES (%(index)s, %(patient_id)s, %(sex)s, %(age)s, %(province)s, %(city)s, %(confirmed_date)s, %(released_date)s, %(state)s)]
[parameters: ({'index': 0, 'patient_id': 1000000001, 'sex': 'male', 'age': '50s', 'province': 'Seoul', 'city': 'Gangseo-gu', 'confirmed_date': '2020-01-23', 'released_date': '2020-02-05', 'state': 'released'}, {'index': 1, 'patient_id': 1000000002, 'sex': 'male', 'age': '30s', 'province': 'Seoul', 'city': 'Jungnang-gu', 'confirmed_date': '2020-01-30', 'released_date': '2020-03-02', 'state': 'released'}, {'index': 2, 'patient_id': 1000000003, 'sex': 'male', 'age': '50s', 'province': 'Seoul', 'city': 'Jongno-gu', 'confirmed_date': '2020-01-30', 'released_date': '2020-02-19', 'state': 'released'}, {'index': 3, 'patient_id': 1000000004, 'sex': 'male', 'age': '20s', 'province': 'Seoul', 'city': 'Mapo-gu', 'confirmed_date': '2020-01-30', 'released_date': '2020-02-15', 'state': 'released'}, {'index': 4, 'patient_id': 1000000005, 'sex': 'female', 'age': '20s', 'province': 'Seoul', 'city': 'Seongbuk-gu', 'confirmed_date': '2020-01-31', 'released_date': '2020-02-24', 'state': 'released'}, {'index': 5, 'patient_id': 1000000006, 'sex': 'female', 'age': '50s', 'province': 'Seoul', 'city': 'Jongno-gu', 'confirmed_date': '2020-01-31', 'released_date': '2020-02-19', 'state': 'released'}, {'index': 6, 'patient_id': 1000000007, 'sex': 'male', 'age': '20s', 'province': 'Seoul', 'city': 'Jongno-gu', 'confirmed_date': '2020-01-31', 'released_date': '2020-02-10', 'state': 'released'}, {'index': 7, 'patient_id': 1000000008, 'sex': 'male', 'age': '20s', 'province': 'Seoul', 'city': 'etc', 'confirmed_date': '2020-02-02', 'released_date': '2020-02-24', 'state': 'released'}  ... displaying 10 of 5165 total bound parameter sets ...  {'index': 5163, 'patient_id': 7000000018, 'sex': 0, 'age': 0, 'province': 'Jeju-do', 'city': 'Jeju-do', 'confirmed_date': '2020-06-18', 'released_date': 0, 'state': 'isolated'}, {'index': 5164, 'patient_id': 7000000019, 'sex': 0, 'age': 0, 'province': 'Jeju-do', 'city': 'Jeju-do', 'confirmed_date': '2020-06-18', 'released_date': 0, 'state': 'isolated'})]
(Background on this error at: http://sqlalche.me/e/f405)