## COVID Data ETL

In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

dataFolder = './data/'

# Load CSV data from Netezza export
#d1 = dataFolder+'conposcovidloc.csv'
d1 = dataFolder+'conposcovidloc.csv'

In [2]:
covidDF = pd.read_csv(d1,sep=',',header=0,dtype=object)
covidDF.shape

(306997, 18)

In [3]:
covidDF.head()

Unnamed: 0,Row_ID,Accurate_Episode_Date,Case_Reported_Date,Test_Reported_Date,Specimen_Date,Age_Group,Client_Gender,Case_AcquisitionInfo,Outcome1,Outbreak_Related,Reporting_PHU_ID,Reporting_PHU,Reporting_PHU_Address,Reporting_PHU_City,Reporting_PHU_Postal_Code,Reporting_PHU_Website,Reporting_PHU_Latitude,Reporting_PHU_Longitude
0,1,2020-06-04,2020-06-06,2020-06-06,2020-06-04,50s,FEMALE,CC,Resolved,,2253,Peel Public Health,7120 Hurontario Street,Mississauga,L5W 1N4,www.peelregion.ca/health/,43.6474713,-79.7088933
1,2,2020-06-01,2020-06-03,2020-06-03,2020-06-02,30s,MALE,CC,Resolved,,2253,Peel Public Health,7120 Hurontario Street,Mississauga,L5W 1N4,www.peelregion.ca/health/,43.6474713,-79.7088933
2,3,2020-05-30,2020-06-02,2020-06-02,2020-05-30,20s,MALE,CC,Resolved,,2253,Peel Public Health,7120 Hurontario Street,Mississauga,L5W 1N4,www.peelregion.ca/health/,43.6474713,-79.7088933
3,4,2020-04-15,2020-04-17,2020-04-17,2020-04-16,80s,FEMALE,OB,Resolved,Yes,2253,Peel Public Health,7120 Hurontario Street,Mississauga,L5W 1N4,www.peelregion.ca/health/,43.6474713,-79.7088933
4,5,2020-04-18,2020-04-21,2020-04-21,2020-04-18,20s,MALE,OB,Resolved,Yes,2253,Peel Public Health,7120 Hurontario Street,Mississauga,L5W 1N4,www.peelregion.ca/health/,43.6474713,-79.7088933


In [4]:
# Convert date column to date type
c = 'Accurate_Episode_Date'
#t = 'Case_Reported_Date'
#covidDF[t]= pd.to_datetime(covidDF[t])
covidDF[c]= pd.to_datetime(covidDF[c])
covidDF.dtypes

Row_ID                               object
Accurate_Episode_Date        datetime64[ns]
Case_Reported_Date                   object
Test_Reported_Date                   object
Specimen_Date                        object
Age_Group                            object
Client_Gender                        object
Case_AcquisitionInfo                 object
Outcome1                             object
Outbreak_Related                     object
Reporting_PHU_ID                     object
Reporting_PHU                        object
Reporting_PHU_Address                object
Reporting_PHU_City                   object
Reporting_PHU_Postal_Code            object
Reporting_PHU_Website                object
Reporting_PHU_Latitude               object
Reporting_PHU_Longitude              object
dtype: object

### Create a data frame for database import

In [5]:
#Create a new data frame

dtDFCols = ['row_id','day','month', 'day_of_week','week_in_year','weekend','holiday','season']

dtDF = pd.DataFrame(columns = dtDFCols) 

dtDF.shape

dtDF.dtypes

row_id          object
day             object
month           object
day_of_week     object
week_in_year    object
weekend         object
holiday         object
season          object
dtype: object

In [6]:
dtDF['row_id'] = covidDF['Row_ID']
dtDF['day'] = covidDF[c].dt.day
dtDF['month'] = covidDF[c].dt.month_name()
dtDF['day_of_week'] = covidDF[c].dt.day_name()
dtDF['week_in_year'] = covidDF[c].dt.week

dtDF.shape

(306997, 8)

In [7]:
# Map Season
seasons = {
             1: 'Winter',
             2: 'Spring',
             3: 'Summer',
             4: 'Autumn'
}
dtDF['season_id'] = covidDF[c].dt.month%12 // 3 + 1
#dtDF['season_id'].fillna(covidDF[t].dt.month%12 // 3 + 1, inplace = True) 

dtDF['season'] = dtDF['season_id'].map(seasons)

In [8]:
del dtDF['season_id']

In [9]:
dtDF['day_id'] = covidDF[c].dt.dayofweek
dtDF['weekend'] = 'no'

dtDF.loc[dtDF['day_id'] > 4, 'weekend'] = 'yes'

del dtDF['day_id']

In [10]:
# holiday
dtDF['holiday'] = 'no'

#Christmas
dtDF.loc[(dtDF['day'] == 25) & (dtDF['month'] == 12), 'holiday'] = 'yes'
#New Year
dtDF.loc[(dtDF['day'] == 1) & (dtDF['month'] == 1), 'holiday'] = 'yes'
#Canada Day
dtDF.loc[(dtDF['day'] == 1) & (dtDF['month'] == 7), 'holiday'] = 'yes'

dtDF.dtypes

row_id          object
day              int64
month           object
day_of_week     object
week_in_year     int64
weekend         object
holiday         object
season          object
dtype: object

In [11]:
dtDF.head(50)

Unnamed: 0,row_id,day,month,day_of_week,week_in_year,weekend,holiday,season
0,1,4,June,Thursday,23,no,no,Summer
1,2,1,June,Monday,23,no,no,Summer
2,3,30,May,Saturday,22,yes,no,Spring
3,4,15,April,Wednesday,16,no,no,Spring
4,5,18,April,Saturday,16,yes,no,Spring
5,6,13,May,Wednesday,20,no,no,Spring
6,7,13,April,Monday,16,no,no,Spring
7,8,18,April,Saturday,16,yes,no,Spring
8,9,25,May,Monday,22,no,no,Spring
9,10,17,June,Wednesday,25,no,no,Summer


In [12]:
# Save Data
dtDF.to_csv(dataFolder+'Onset Date_dimension_CSV.csv',encoding="utf-8",index=False)