In [1]:
import pandas as pd
import os
from datetime import datetime, timedelta
from sqlalchemy import create_engine

# Setup Steps

In [2]:
# Set where .csv files needs to be saved
base_output_path = "../../data"

# Create a POSTGRES database with the name 'COVID19_db'
# Replace username:password if it's not set to postgres:postgres
DATABASE_URI = os.environ.get('DATABASE_URL', '') or "postgresql://postgres:postgres@localhost:5432/COVID19_db"
print(DATABASE_URI)

engine = create_engine(DATABASE_URI)

postgresql://postgres:postgres@localhost:5432/COVID19_db


In [3]:
# Set URL's

# REALTIME DOWNLOADS
url_covid = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports/'
url_covid_states = 'https://raw.githubusercontent.com/COVID19Tracking/covid-tracking-data/master/data/states_daily_4pm_et.csv'

# Download latest Kaggle data manually: https://www.kaggle.com/sudalairajkumar/covid19-in-usa#us_states_covid19_daily.csv
url_tests_and_hospital = os.path.join(base_output_path, 'us_states_covid19_daily.csv')

In [4]:
# Find last time covid_data was loaded
query_str = open('sql/max_db_date.sql')
query_text = ""
for text in query_str:
    query_text = query_text + text

rs = engine.execute(query_text)

for i in rs:
    last_db_date = (i[0])
print(last_db_date)

2020-04-01


# Number of tests performed and people hospitalized per state

In [48]:
# Test data statistics
df_covid_states = pd.DataFrame()
df_covid_states = pd.read_csv(url_covid_states).fillna(0)

df_covid_states['date'] = pd.to_datetime(df_covid_states['date'],format='%Y%m%d')
df_covid_states['dateChecked'] = pd.to_datetime(df_covid_states['dateChecked']).dt.date
df_covid_states = df_covid_states.drop(['hash'], axis=1)
df_covid_states = df_covid_states.rename(columns = {
        'hospitalizedCurrently':'hospitalized_curr','hospitalizedCumulative':'hospitalized_cum',
        'inIcuCurrently':'in_ICU_curr','inIcuCumulative':'in_ICU_cum',
        'onVentilatorCurrently':'on_vent_curr','onVentilatorCumulative':'on_vent_cum',
        'total':'dontuse_total','dateChecked':'check_date','totalTestResults':'total_tests','posNeg':'pos_neg','deathIncrease':'death_inc',
        'hospitalizedIncrease':'hospital_inc','negativeIncrease':'neg_inc','positiveIncrease':'pos_inc',
        'totalTestResultsIncrease':'tot_tests_inc'})
df_covid_states.head()

Unnamed: 0,date,state,positive,negative,pending,hospitalized_curr,hospitalized_cum,in_ICU_curr,in_ICU_cum,on_vent_curr,...,hospitalized,dontuse_total,total_tests,pos_neg,fips,death_inc,hospital_inc,neg_inc,pos_inc,tot_tests_inc
0,2020-04-02,AK,143.0,4879.0,0.0,0.0,9.0,0.0,0.0,0.0,...,9.0,5022,5022,5022,2,0.0,0.0,409.0,10.0,419.0
1,2020-04-02,AL,1233.0,7503.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,8736,8736,8736,1,6.0,0.0,806.0,156.0,962.0
2,2020-04-02,AR,643.0,7880.0,0.0,66.0,0.0,0.0,0.0,23.0,...,0.0,8523,8523,8523,5,2.0,0.0,526.0,59.0,585.0
3,2020-04-02,AS,0.0,20.0,6.0,0.0,0.0,0.0,0.0,0.0,...,0.0,26,20,20,60,0.0,0.0,0.0,0.0,0.0
4,2020-04-02,AZ,1598.0,21111.0,0.0,0.0,228.0,0.0,83.0,0.0,...,228.0,22709,22709,22709,4,3.0,79.0,1466.0,185.0,1651.0


In [49]:
df_covid_states.columns

Index(['date', 'state', 'positive', 'negative', 'pending', 'hospitalized_curr',
       'hospitalized_cum', 'in_ICU_curr', 'in_ICU_cum', 'on_vent_curr',
       'on_vent_cum', 'recovered', 'check_date', 'death', 'hospitalized',
       'dontuse_total', 'total_tests', 'pos_neg', 'fips', 'death_inc',
       'hospital_inc', 'neg_inc', 'pos_inc', 'tot_tests_inc'],
      dtype='object')

In [50]:
engine.execute( '''TRUNCATE TABLE covid_data_states''' )
df_covid_states.to_sql(name='covid_data_states', con=engine, if_exists='append', index=False)

# COVID-19 Data from Johns Hopkins

In [None]:
# Read data into Dataframes
df_covid_phase0 = pd.DataFrame()
df_covid_phase1 = pd.DataFrame()
df_covid_phase2 = pd.DataFrame()
df_covid_phase3 = pd.DataFrame()
df_covid_phase4 = pd.DataFrame() 

# Covid data starts at 01-22-2020.csv save in format 2020-02-01 - LOOP over all / just new ones
# Date format has changed 3 times so date ranges to be processed:
# Phase 0: 2020-01-22 - 2020-01-31 (Country/Region = Country)
# Phase 1: 2020-02-01 - 2020-02-29 (Country/Region = Country, Province/State=US County)
# Phase 2: 2020-03-01 - 2020-03-09 (Country/Region = Country, Province/State=US County, NEW: Lat Long)
# Phase 3: 2020-03-10 - 2020-03-21 (Country/Region = Country, Province/State=STATE!!!!! - no more county level)
# Phase 4: 2020-03-22 - onwards (NEW: Admin2 (=County), Active)

# Reset last_db_date only for inital load. Then leave it to setting at start
# last_db_date = datetime.strptime('01-22-2020',"%m-%d-%Y").date()
yesterday_date = datetime.today().date() + timedelta(days=-1)
loop_date = last_db_date + timedelta(days=+1)

print("Last Date in DB:", last_db_date)
print("Yesterday's Date:", yesterday_date)

while loop_date <= yesterday_date:
    
    # Change to date format used in .csv files
    file_date = datetime.strftime(loop_date, "%m-%d-%Y")
    
    # Create URL and get data
    url_covid_file = url_covid + file_date + ".csv"
    print("Now processing: " + url_covid_file)
    df_covid_loop = pd.read_csv(url_covid_file)
    df_covid_loop['short_date'] = loop_date
    
    # Create output path for each file and save file locally then append to overall dataframe
    output_path = os.path.join(base_output_path, "df_covid_" + file_date + ".csv")
    df_covid_loop.to_csv(output_path)
    
    # Split 5 different formats based on date
    if loop_date < datetime.strptime('02-01-2020',"%m-%d-%Y").date(): 
        df_covid_phase0 = df_covid_phase0.append(df_covid_loop)
    elif loop_date <= datetime.strptime('02-29-2020',"%m-%d-%Y").date(): 
        df_covid_phase1 = df_covid_phase1.append(df_covid_loop)
    elif loop_date < datetime.strptime('03-10-2020',"%m-%d-%Y").date():
        df_covid_phase2 = df_covid_phase2.append(df_covid_loop)
    elif loop_date < datetime.strptime('03-22-2020',"%m-%d-%Y").date():
        df_covid_phase3 = df_covid_phase3.append(df_covid_loop)
    else:
        df_covid_phase4 = df_covid_phase4.append(df_covid_loop)
        
        
    # Get the next file
    loop_date = loop_date + timedelta(days=1)

In [None]:
df_covid_phase4.head()

In [None]:
df_covid_phase4.fillna(0, inplace=True)
df_covid_phase4['Confirmed'] = df_covid_phase4['Confirmed'].astype(int)
df_covid_phase4['Deaths'] = df_covid_phase4['Deaths'].astype(int)
df_covid_phase4['Recovered'] = df_covid_phase4['Recovered'].astype(int)
df_covid_phase4['Active'] = df_covid_phase4['Active'].astype(int)
df_covid_phase4 = df_covid_phase4.rename(columns = {'Province_State':'province_state','Country_Region':'country_region', 'Confirmed':'confirmed','Deaths':'deaths','Active':'active','Recovered':'recovered','Lat':'latitude','Long_':'longitude','Admin2':'us_county'})

df_covid_4 = df_covid_phase4[['province_state','country_region','confirmed','deaths','recovered','active','short_date','latitude','longitude','us_county']]

df_covid_4.head()     

In [None]:
# engine.execute( '''TRUNCATE TABLE covid_data_phase4''' )        
df_covid_4.to_sql(name='covid_data_4', con=engine, if_exists='append', index=False) 