In [1]:
import pandas as pd
import numpy as np

import wget
import os, datetime
import shutil

import requests
from bs4 import BeautifulSoup

import glob

In [2]:
def scrape_data(search_url, download_url):

    response = requests.get(search_url, timeout=10)
    soup = BeautifulSoup(response.content, 'html.parser')
    #print(soup.prettify())
    
    table = soup.find_all('span')
    header = [th.text.rstrip() for th in soup.find_all('span')]
    download_urls = []
    for row in header:
        if 'csv' in row and len(row) < 15:
            download_urls.append(download_url + row.rstrip())

    return download_urls

In [3]:
# create dir
def createDir(currDir):
    isdir = os.path.isdir(currDir) 

    if isdir:
        try:
            shutil.rmtree(currDir, ignore_errors=True)
        except OSError:
            print ("Deletion of the directory %s failed" % currDir)

    try:
        os.mkdir(currDir)
    except OSError:
        print ("Creation of the directory %s failed" % currDir)
    else:
        print ("Successfully created the directory %s " % currDir)

In [4]:
# download files
def downloadFiles(urls, currDir):
    for url in urls:
        wget.download(url, currDir)
    print ("Successfully downloaded files")

In [5]:
def transformFiles(currDir):
    # get data file names
    filenames = glob.glob(currDir  + "/*.csv")

    dfs = []
    for filename in filenames:
        tmp = pd.read_csv(filename)
        tmp['Last_Update'] = filename[40:50]
        tmp['Last_Update'] = pd.to_datetime(tmp.Last_Update)
        dfs.append(tmp)

    # Concatenate all data into one DataFrame
    us_state_table = pd.concat(dfs, ignore_index=True)
    
    # Replace Null values
    us_state_table = us_state_table.replace(np.nan, '', regex=True)
    us_state_table = us_state_table[us_state_table['Province_State'].str.contains('Recovered')!=True]
    print("Table Shape: ", us_state_table.shape)
    us_state_table = us_state_table.sort_values(by='Last_Update')
    return us_state_table

In [13]:
# Save to csv file
def saveFiletoCSV(usa_state_table, currDir):
    usa_state_table.to_csv(currDir + '/covid_19_us_states_complete.csv', index=False)
    print("File Saved at %s" % currDir)

In [15]:
#urls for github folder
search_url = "https://github.com/CSSEGISandData/COVID-19/tree/master/csse_covid_19_data/csse_covid_19_daily_reports_us"
download_url = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports_us/"
download_urls = scrape_data(search_url, download_url)

currDir = "../../DataStore/COVID-19-data-state-USA"

#createDir(currDir)
#downloadFiles(download_urls, currDir)
usa_state_table = transformFiles(currDir)
saveFiletoCSV(usa_state_table, currDir)

Table Shape:  (13398, 20)
File Saved at ../../DataStore/COVID-19-data-state-USA


In [9]:
usa_state_table

Unnamed: 0,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,FIPS,Incident_Rate,People_Tested,People_Hospitalized,Mortality_Rate,UID,ISO3,Testing_Rate,Hospitalization_Rate,Total_Test_Results,Case_Fatality_Ratio
4358,Alabama,US,2020-04-12,32.3182,-86.9023,3667,93,,3470,1,75.988,21583,437,2.61016,84000001.0,USA,460.3,12.2649,,
4390,New Hampshire,US,2020-04-12,43.4525,-71.5639,929,23,236,906,33,69.947,10925,146,2.47578,84000033.0,USA,822.574,15.7158,,
4391,New Jersey,US,2020-04-12,40.2989,-74.521,61850,2350,,59500,34,696.337,126735,7604,3.79951,84000034.0,USA,1426.84,12.2943,,
4392,New Mexico,US,2020-04-12,34.8405,-106.249,1245,26,235,1219,35,74.6612,28692,78,2.08835,84000035.0,USA,1720.63,6.26506,,
4393,New York,US,2020-04-12,42.1657,-74.9481,189033,9385,23887,179648,36,1121.12,461601,42594,4.96474,84000036.0,USA,2737.68,22.5326,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8044,Michigan,US,2020-11-28,43.3266,-84.5361,378152,9467,165269,203416,26,3786.5,,,,84000026.0,USA,65830.9,,6.57444e+06,2.50349
8045,Minnesota,US,2020-11-28,45.6945,-93.9002,304023,3580,257485,42958,27,5390.83,,,,84000027.0,USA,71409.9,,4.02726e+06,1.17754
8046,Mississippi,US,2020-11-28,32.7416,-89.6787,149940,3779,121637,24524,28,5038.05,,,,84000028.0,USA,36866.8,,1.09721e+06,2.52034
8048,Montana,US,2020-11-28,46.9219,-110.454,60845,669,44100,16076,30,5692.95,,,,84000030.0,USA,60386.1,,645393,1.09952
