In [1]:
import pandas as pd
import os
from datetime import datetime, timedelta
from sqlalchemy import create_engine

# Setup Steps

In [2]:
# Directory where the downloaded .csv files are saved
base_output_path = "../../data"

# Create a POSTGRES database with the name 'COVID19_db'
# Replace username:password if it's not set to postgres:postgres
# The DATABASE_URL environment variable takes precedence when it is set and non-empty.
DATABASE_URI = os.environ.get('DATABASE_URL', '') or "postgresql://postgres:postgres@localhost:5432/COVID19_db"

engine = create_engine(DATABASE_URI)

# Show which database is targeted without writing the password into the saved
# notebook output: the repr of the engine URL renders the password as '***'.
print(repr(engine.url))

postgresql://postgres:postgres@localhost:5432/COVID19_db


In [3]:
# Data source locations

# REALTIME DOWNLOADS
# Johns Hopkins CSSE daily reports: directory prefix only, a "<MM-DD-YYYY>.csv" file name is appended later
url_covid = (
    'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/'
    'csse_covid_19_data/csse_covid_19_daily_reports/'
)
# COVID Tracking Project: one file with the daily figures for every state
url_covid_states = (
    'https://raw.githubusercontent.com/COVID19Tracking/covid-tracking-data/master/'
    'data/states_daily_4pm_et.csv'
)

# Download latest Kaggle data manually: https://www.kaggle.com/sudalairajkumar/covid19-in-usa#us_states_covid19_daily.csv
# (this one is a local file path below base_output_path, not a web address)
url_tests_and_hospital = os.path.join(base_output_path, 'us_states_covid19_daily.csv')

In [4]:
# Find last time covid_data was loaded.
# Read the query through a context manager so the file handle is closed immediately.
with open('sql/max_db_date.sql') as query_file:
    query_text = query_file.read()

rs = engine.execute(query_text)

# Keep the first column of the last row returned. Starting from None means an
# empty result set leaves last_db_date defined instead of raising NameError below.
last_db_date = None
for row in rs:
    last_db_date = row[0]
print(last_db_date)

2020-04-06


# Number of tests performed and people hospitalized per state

In [5]:
# Test data statistics

# Source column name -> shorter name used for the rest of the notebook
state_column_names = {
    'hospitalizedCurrently': 'hospitalized_curr',
    'hospitalizedCumulative': 'hospitalized_cum',
    'inIcuCurrently': 'in_ICU_curr',
    'inIcuCumulative': 'in_ICU_cum',
    'onVentilatorCurrently': 'on_vent_curr',
    'onVentilatorCumulative': 'on_vent_cum',
    'total': 'dontuse_total',
    'dateChecked': 'check_date',
    'totalTestResults': 'total_tests',
    'posNeg': 'pos_neg',
    'deathIncrease': 'death_inc',
    'hospitalizedIncrease': 'hospital_inc',
    'negativeIncrease': 'neg_inc',
    'positiveIncrease': 'pos_inc',
    'totalTestResultsIncrease': 'tot_tests_inc',
}

# Download, fill gaps with 0, parse both date columns, drop 'hash', then rename.
# NOTE(review): fillna(0) runs before the dates are parsed, so a missing
# 'dateChecked' value would be parsed from 0 rather than stay empty - confirm this is intended.
df_covid_states = (
    pd.read_csv(url_covid_states)
    .fillna(0)
    .assign(
        date=lambda frame: pd.to_datetime(frame['date'], format='%Y%m%d'),
        dateChecked=lambda frame: pd.to_datetime(frame['dateChecked']).dt.date,
    )
    .drop(columns=['hash'])
    .rename(columns=state_column_names)
)
df_covid_states.head()

Unnamed: 0,date,state,positive,negative,pending,hospitalized_curr,hospitalized_cum,in_ICU_curr,in_ICU_cum,on_vent_curr,...,hospitalized,dontuse_total,total_tests,pos_neg,fips,death_inc,hospital_inc,neg_inc,pos_inc,tot_tests_inc
0,2020-04-07,AK,213.0,6700.0,0.0,0.0,23.0,0.0,0.0,0.0,...,23.0,6913,6913,6913,2,0.0,0.0,8.0,22.0,30.0
1,2020-04-07,AL,2119.0,12797.0,0.0,0.0,271.0,0.0,0.0,0.0,...,271.0,14916,14916,14916,1,6.0,31.0,0.0,151.0,151.0
2,2020-04-07,AR,946.0,12692.0,0.0,74.0,130.0,0.0,43.0,26.0,...,130.0,13638,13638,13638,5,0.0,0.0,722.0,71.0,793.0
3,2020-04-07,AS,0.0,20.0,11.0,0.0,0.0,0.0,0.0,0.0,...,0.0,31,20,20,60,0.0,0.0,0.0,0.0,0.0
4,2020-04-07,AZ,2575.0,30800.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,33375,33375,33375,4,8.0,0.0,722.0,119.0,841.0


In [6]:
# Display the column labels of df_covid_states
df_covid_states.columns

Index(['date', 'state', 'positive', 'negative', 'pending', 'hospitalized_curr',
       'hospitalized_cum', 'in_ICU_curr', 'in_ICU_cum', 'on_vent_curr',
       'on_vent_cum', 'recovered', 'check_date', 'death', 'hospitalized',
       'dontuse_total', 'total_tests', 'pos_neg', 'fips', 'death_inc',
       'hospital_inc', 'neg_inc', 'pos_inc', 'tot_tests_inc'],
      dtype='object')

In [7]:
# Replace the contents of covid_data_states with the freshly downloaded frame.
# Both steps run inside ONE transaction: if the insert fails, the TRUNCATE is
# rolled back and the previous rows are kept instead of leaving the table empty.
with engine.begin() as connection:
    connection.execute('''TRUNCATE TABLE covid_data_states''')
    df_covid_states.to_sql(name='covid_data_states', con=connection, if_exists='append', index=False)

# COVID-19 Data from Johns Hopkins

In [8]:
# Read data into Dataframes.
# The daily frames are collected in one list per phase and concatenated once after
# the loop: DataFrame.append copies the whole frame on every call and is no longer
# available in current pandas releases.
phase_frames = [[], [], [], [], []]

# Covid data starts at 01-22-2020.csv save in format 2020-02-01 - LOOP over all / just new ones
# The file layout changed several times, so the date ranges to be processed are:
# Phase 0: 2020-01-22 - 2020-01-31 (Country/Region = Country)
# Phase 1: 2020-02-01 - 2020-02-29 (Country/Region = Country, Province/State=US County)
# Phase 2: 2020-03-01 - 2020-03-09 (Country/Region = Country, Province/State=US County, NEW: Lat Long)
# Phase 3: 2020-03-10 - 2020-03-21 (Country/Region = Country, Province/State=STATE!!!!! - no more county level)
# Phase 4: 2020-03-22 - onwards (NEW: Admin2 (=County), Active)

# First day of phases 1 to 4, computed once instead of on every loop iteration
phase_start_dates = [
    datetime(2020, 2, 1).date(),
    datetime(2020, 3, 1).date(),
    datetime(2020, 3, 10).date(),
    datetime(2020, 3, 22).date(),
]

# Reset last_db_date only for initial load. Then leave it to setting at start
# last_db_date = datetime.strptime('01-22-2020',"%m-%d-%Y").date()
if last_db_date is None:
    # Without this check the date arithmetic below fails with an unhelpful TypeError
    raise ValueError("last_db_date is not set - for an initial load, enable the reset line above this check")

yesterday_date = datetime.today().date() + timedelta(days=-1)
loop_date = last_db_date + timedelta(days=+1)

print("Last Date in DB:", last_db_date)
print("Yesterday's Date:", yesterday_date)

while loop_date <= yesterday_date:

    # Change to date format used in .csv files
    file_date = datetime.strftime(loop_date, "%m-%d-%Y")

    # Create URL and get data
    url_covid_file = url_covid + file_date + ".csv"
    print("Now processing: " + url_covid_file)
    df_covid_loop = pd.read_csv(url_covid_file)
    df_covid_loop['short_date'] = loop_date

    # Save a local copy of each daily file
    output_path = os.path.join(base_output_path, "df_covid_" + file_date + ".csv")
    df_covid_loop.to_csv(output_path)

    # Phase number = how many phase start dates have been reached (0 to 4)
    phase = sum(loop_date >= start_date for start_date in phase_start_dates)
    phase_frames[phase].append(df_covid_loop)

    # Get the next file
    loop_date = loop_date + timedelta(days=1)

# One DataFrame per phase; a phase without any file stays an empty DataFrame
# (pd.concat raises on an empty list, hence the explicit fallback).
df_covid_phase0, df_covid_phase1, df_covid_phase2, df_covid_phase3, df_covid_phase4 = [
    pd.concat(frames) if frames else pd.DataFrame()
    for frames in phase_frames
]

Last Date in DB: 2020-04-06
Yesterday's Date: 2020-04-07
Now processing: https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports/04-07-2020.csv


In [9]:
# Preview the first rows of df_covid_phase4
df_covid_phase4.head()

Unnamed: 0,FIPS,Admin2,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key,short_date
0,45001.0,Abbeville,South Carolina,US,2020-04-07 23:04:49,34.223334,-82.461707,5,0,0,0,"Abbeville, South Carolina, US",2020-04-07
1,22001.0,Acadia,Louisiana,US,2020-04-07 23:04:49,30.295065,-92.414197,82,2,0,0,"Acadia, Louisiana, US",2020-04-07
2,51001.0,Accomack,Virginia,US,2020-04-07 23:04:49,37.767072,-75.632346,11,0,0,0,"Accomack, Virginia, US",2020-04-07
3,16001.0,Ada,Idaho,US,2020-04-07 23:04:49,43.452658,-116.241552,419,3,0,0,"Ada, Idaho, US",2020-04-07
4,19001.0,Adair,Iowa,US,2020-04-07 23:04:49,41.330756,-94.471059,1,0,0,0,"Adair, Iowa, US",2020-04-07


In [10]:
# Replace every missing value in the frame with 0 (modifies df_covid_phase4 in place)
df_covid_phase4.fillna(0, inplace=True)

# Cast the four count columns to int
for count_column in ('Confirmed', 'Deaths', 'Recovered', 'Active'):
    df_covid_phase4[count_column] = df_covid_phase4[count_column].astype(int)

# CSV header -> lower-case column name used from here on
phase4_column_names = {
    'Province_State': 'province_state',
    'Country_Region': 'country_region',
    'Confirmed': 'confirmed',
    'Deaths': 'deaths',
    'Active': 'active',
    'Recovered': 'recovered',
    'Lat': 'latitude',
    'Long_': 'longitude',
    'Admin2': 'us_county',
}
df_covid_phase4 = df_covid_phase4.rename(columns=phase4_column_names)

# Columns carried over into df_covid_4, in this order
phase4_output_columns = [
    'province_state',
    'country_region',
    'confirmed',
    'deaths',
    'recovered',
    'active',
    'short_date',
    'latitude',
    'longitude',
    'us_county',
]
df_covid_4 = df_covid_phase4[phase4_output_columns]

df_covid_4.head()

Unnamed: 0,province_state,country_region,confirmed,deaths,recovered,active,short_date,latitude,longitude,us_county
0,South Carolina,US,5,0,0,0,2020-04-07,34.223334,-82.461707,Abbeville
1,Louisiana,US,82,2,0,0,2020-04-07,30.295065,-92.414197,Acadia
2,Virginia,US,11,0,0,0,2020-04-07,37.767072,-75.632346,Accomack
3,Idaho,US,419,3,0,0,2020-04-07,43.452658,-116.241552,Ada
4,Iowa,US,1,0,0,0,2020-04-07,41.330756,-94.471059,Adair


In [11]:
# Append the rows of df_covid_4 to the covid_data_4 table; the DataFrame index is not written.
# NOTE(review): with if_exists='append', running this cell again inserts the same rows a second time.
# NOTE(review): the disabled statement below names covid_data_phase4, not the covid_data_4 table
# written here - confirm the table name before enabling it.
# engine.execute( '''TRUNCATE TABLE covid_data_phase4''' )        
df_covid_4.to_sql(name='covid_data_4', con=engine, if_exists='append', index=False) 