# Data Retrieval

In [1]:
# Import dependencies
import csv
import datetime as dt
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from splinter import Browser
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
# Get Vaccinations Table from CDC website
# Launches a visible (non-headless) Chrome session through splinter, opens the
# CDC tracker, clicks the table toggle and keeps the resulting page HTML.
executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', **executable_path, headless=False)
try:
    browser.visit("https://covid.cdc.gov/covid-data-tracker/#vaccinations")
    # HTML snapshot taken before the table toggle is clicked.
    vaccine_html = browser.html
    browser.click_link_by_id("vaccinations-table-toggle")
    # NOTE(review): the snapshot is taken immediately after the click with no
    # explicit wait -- confirm the table is always rendered by this point.
    cdc_html = browser.html
finally:
    # Always shut the browser down, even when the visit or click fails,
    # so a failed run does not leave a Chrome/chromedriver process behind.
    browser.quit()
cdc_parsed = bs(cdc_html, "html.parser")
# Every <table> element found in the post-click page.
table = cdc_parsed.select("table")

[WDM] - Current google-chrome version is 88.0.4324
[WDM] - Get LATEST driver version for 88.0.4324
[WDM] - Driver [C:\Users\coled\.wdm\drivers\chromedriver\win32\88.0.4324.96\chromedriver.exe] found in cache


 


In [3]:
# Read the CDC Vaccine HTML Table
# `table` holds the <table> elements selected from the scraped page; stop with
# a clear message when the scrape found none.
if not table:
    raise ValueError("No <table> element found in the CDC vaccinations page HTML")
# Wrap the markup in StringIO: newer pandas deprecates passing literal HTML
# strings to read_html, while file-like input works on old and new versions.
vaccines_df = pd.read_html(StringIO(str(table)))[0]

In [4]:
# Relabel the "New York State" row as "New York" so it matches the other datasets when merging
entity_col = "State/Territory/Federal Entity"
is_ny_state = vaccines_df[entity_col] == "New York State"
vaccines_df.loc[is_ny_state, entity_col] = "New York"

In [5]:
# Use "state" as the name of the entity column so it can serve as the merge key
column_renames = {"State/Territory/Federal Entity": "state"}
vaccines_df = vaccines_df.rename(columns=column_renames)

In [6]:
# Replace the stored vaccine CSV with the freshly scraped table
vaccines_df.to_csv(path_or_buf="../data/cdc-vaccine-data.csv")

In [7]:
# Read Daily COVID-19 CSV from AWS S3 Bucket - Rearc / NY Times Data
# A timeout keeps the cell from hanging forever on a stalled connection.
# stream=True was dropped: the whole body is read through `.content` anyway.
us_covid_cases_data = requests.get(
    "https://covid19-lake.s3.us-east-2.amazonaws.com/rearc-covid-19-nyt-data-in-usa/csv/us-states/us-states.csv",
    timeout=60,
)
# Fail here on an HTTP error instead of letting an error page be saved as the CSV.
us_covid_cases_data.raise_for_status()

In [8]:
# Replace the local US COVID CSV with the bytes that were just downloaded
with open("../data/us-covid-data.csv", mode="wb") as csv_out:
    csv_out.write(us_covid_cases_data.content)

In [9]:
# Load the saved US COVID CSV into a DataFrame for merging and cleaning
us_covid_cases_df = pd.read_csv(filepath_or_buffer="../data/us-covid-data.csv")

# Clean and Create State Overview Dataframe

In [10]:
# Yesterday's date as a YYYY-MM-DD string; used as the date of the latest data
one_day = dt.timedelta(days=1)
today = dt.date.today()
yesterday = today - one_day
latest_data = yesterday.isoformat()

In [11]:
# Get latest total cases by state for latest date
# Built from us_covid_cases_df on every run (no in-place mutation), so the
# cell can be re-executed safely.
cases_by_date_state = (
    us_covid_cases_df.groupby(["date", "state"]).sum().reset_index()
)

# Prefer `latest_data` (yesterday). If the feed has no rows for that date yet,
# use the newest date that is present instead of returning an empty frame,
# which would silently empty every merge that follows.
snapshot_date = latest_data
if not (cases_by_date_state["date"] == snapshot_date).any():
    snapshot_date = cases_by_date_state["date"].max()
    print(f"No rows for {latest_data}; using most recent date in data: {snapshot_date}")

us_total_cases_to_date_df = cases_by_date_state.loc[
    cases_by_date_state["date"] == snapshot_date
]

In [12]:
# Load the state lat/lon CSV and switch to the short column names used for merging
latlon_renames = {"State": "abbr", "Latitude": "lat", "Longitude": "lon", "City": "state"}
state_latlons_df = pd.read_csv("../data/statelatlong.csv").rename(columns=latlon_renames)

In [13]:
# Load the cleaned state population CSV for merging
state_pop_df = pd.read_csv(filepath_or_buffer="../data/state_populations_cleaned.csv")

In [14]:
# Inner-join population with lat/lon on "state" and keep only the listed columns
overview_columns = ["state", "abbr", "lat", "lon", "population"]
state_overview_master_df = state_pop_df.merge(
    state_latlons_df, how="inner", on="state"
)[overview_columns]

In [15]:
# Inner-join the latest case totals on "state" and keep only the listed columns
overview_with_cases_columns = [
    "date", "state", "abbr", "lat", "lon", "population", "fips", "cases", "deaths",
]
state_overview_master_df = state_overview_master_df.merge(
    us_total_cases_to_date_df, how="inner", on="state"
)[overview_with_cases_columns]

In [16]:
# Inner-join the vaccination table on "state", shorten the two dose column names,
# and keep only the listed columns
vaccine_renames = {
    "Total Doses Administered by State where Administered": "total_doses_administered",
    "Doses Administered per 100k by State where Administered": "doses_administered_per_100k",
}
final_overview_columns = [
    "date", "state", "abbr", "lat", "lon", "population", "fips", "cases", "deaths",
    "total_doses_administered", "doses_administered_per_100k",
]
state_overview_master_df = (
    state_overview_master_df
    .merge(vaccines_df, how="inner", on="state")
    .rename(columns=vaccine_renames)
)[final_overview_columns]

In [17]:
# Create the "percent_vaccinated" column, filled with empty strings as a placeholder
state_overview_master_df = state_overview_master_df.assign(percent_vaccinated="")

In [18]:
# Calculation for percent vaccinated by state
# Vectorised replacement for the row-by-row loop: doses administered divided by
# population, as a percentage rounded to 2 decimals. The result is a float
# column, so no empty-string placeholder values remain.
# NOTE(review): the numerator is total doses administered, not people
# vaccinated -- confirm this is the intended meaning of "percent_vaccinated".
state_overview_master_df["percent_vaccinated"] = (
    state_overview_master_df["total_doses_administered"]
    / state_overview_master_df["population"]
    * 100
).round(2)

In [19]:
# Make sure "percent_vaccinated" is stored as float64
state_overview_master_df["percent_vaccinated"] = (
    state_overview_master_df["percent_vaccinated"].astype("float64")
)

In [20]:
# Display the merged per-state overview table (cases, deaths, doses, percent_vaccinated).
state_overview_master_df

Unnamed: 0,date,state,abbr,lat,lon,population,fips,cases,deaths,total_doses_administered,doses_administered_per_100k,percent_vaccinated
0,2021-02-27,Alabama,AL,32.601011,-86.680736,4903185,1,492683,9930,942502,19222.0,19.22
1,2021-02-27,Alaska,AK,61.302501,-158.77502,731545,2,57846,279,264741,36189.0,36.19
2,2021-02-27,Arizona,AZ,34.168219,-111.930907,7278717,4,816334,15967,1792447,24626.0,24.63
3,2021-02-27,Arkansas,AR,34.751928,-92.131378,3017804,5,319195,5417,649311,21516.0,21.52
4,2021-02-27,California,CA,37.271875,-119.270415,39512223,6,3565496,51979,8821044,22325.0,22.32
5,2021-02-27,Colorado,CO,38.997934,-105.550567,5758736,8,430129,6038,1371577,23817.0,23.82
6,2021-02-27,Connecticut,CT,41.518784,-72.757507,3565287,9,279946,7622,1004467,28174.0,28.17
7,2021-02-27,Delaware,DE,39.145251,-75.418921,973764,10,86517,1418,219312,22522.0,22.52
8,2021-02-27,District of Columbia,DC,38.899349,-77.014567,705749,11,40478,1010,194746,27594.0,27.59
9,2021-02-27,Florida,FL,27.975728,-83.833017,21477737,12,1903674,30733,5044420,23487.0,23.49


In [22]:
# Parse the "date" column of the raw case data into pandas datetimes
us_covid_cases_df["date"] = us_covid_cases_df["date"].pipe(pd.to_datetime)

In [23]:
# Display the raw per-state daily case data (columns: date, state, fips, cases, deaths).
us_covid_cases_df

Unnamed: 0,date,state,fips,cases,deaths
0,2020-01-21,Washington,53,1,0
1,2020-01-22,Washington,53,1,0
2,2020-01-23,Washington,53,1,0
3,2020-01-24,Illinois,17,1,0
4,2020-01-24,Washington,53,1,0
...,...,...,...,...,...
19919,2021-02-27,Virginia,51,574314,8382
19920,2021-02-27,Washington,53,342574,5024
19921,2021-02-27,West Virginia,54,131580,2297
19922,2021-02-27,Wisconsin,55,616899,7019


## NIH AWS Data / NYT

### Set up State Master DF

In [None]:
# import datetime as dt

# for index, row in nytimes_nih_covid_df.iterrows():
#     date = row["date"].strftime('%Y-%m-%d')
#     nytimes_nih_covid_df.at[index, "date"] = date

In [None]:
# # Calculation for est percent infected by state
# for index, row in state_master_df.iterrows():
#     population = row["population"]
#     deaths = row["deaths"]
#     infected = row["cases"]
    
#     # take out the number who have passed away to reset estimated population
#     est_population_new = population - deaths 
    
#     # estimate the percent infected
#     percent_infected = round((infected / est_population_new) * 100, 2)
    
#     # insert into percent_infected column
#     state_master_df.at[index, "est_percent_infected_to_date"] = percent_infected

In [None]:
# Create the "est_percent_immune" column (empty-string placeholder) and preview 10 rows
# NOTE(review): state_master_df is not defined in any visible cell -- confirm where it comes from.
state_master_df = state_master_df.assign(est_percent_immune="")
state_master_df.head(10)

In [None]:
# Calculation for estimated immune by state
# Vectorised replacement for the row-by-row loop.
# Estimated immune = cases + doses administered; the denominator is the
# population minus deaths. Result is a percentage rounded to 2 decimals.
# NOTE(review): state_master_df and its "total_administered" column are not
# defined in any visible cell (the overview frame built earlier uses
# "total_doses_administered") -- confirm the intended frame and column.
est_total_immune_by_state = state_master_df["cases"] + state_master_df["total_administered"]
est_population_by_state = state_master_df["population"] - state_master_df["deaths"]

state_master_df["est_percent_immune"] = (
    est_total_immune_by_state / est_population_by_state * 100
).round(2)

In [None]:
# Preview the first 10 rows of state_master_df.
state_master_df.head(10)

### Set up Daily Cases DF

In [None]:
# Start the daily-cases work from the NYT frame. This binds a second name to the
# same DataFrame object (no copy is made), then previews the first rows.
# NOTE(review): nytimes_nih_covid_df is not defined in any visible cell -- confirm its source.
nyt_avg_daily_cases = nytimes_nih_covid_df
nyt_avg_daily_cases.head()

In [None]:
# Total the numeric columns per date.
# Grouping from the source frame, not from nyt_avg_daily_cases itself, keeps
# this cell idempotent when it is re-run.
# numeric_only=True keeps string columns out of the sum; newer pandas would
# otherwise try to add them up as well.
# NOTE(review): presumably the frame has date/state/fips/cases/deaths columns -- confirm.
nyt_avg_daily_cases = nytimes_nih_covid_df.groupby("date").sum(numeric_only=True)
nyt_avg_daily_cases.head()

In [None]:
# Create the "daily_new_cases" column, filled with empty strings as a placeholder
nyt_avg_daily_cases = nyt_avg_daily_cases.assign(daily_new_cases="")

In [None]:
# Preview the first rows of the per-date totals.
nyt_avg_daily_cases.head()

In [None]:
# Running "cases" value of the previous row; starts at 0 so the first row's
# new-case count equals its own "cases" value.
previous_day = 0

In [None]:
# Daily new cases = this row's "cases" minus the previous row's "cases".
# shift(fill_value=0) supplies 0 as the "previous" value for the first row,
# so this no longer depends on a `previous_day` global left by another cell
# and gives the same answer every time it is re-run.
nyt_avg_daily_cases["daily_new_cases"] = (
    nyt_avg_daily_cases["cases"]
    - nyt_avg_daily_cases["cases"].shift(1, fill_value=0)
)

In [None]:
# Create a "date" column, filled with empty strings as a placeholder
nyt_avg_daily_cases = nyt_avg_daily_cases.assign(date="")

In [None]:
# Display the per-date frame.
nyt_avg_daily_cases

In [None]:
# Copy each row's index label into the "date" column in a single assignment
# instead of looping over the rows.
nyt_avg_daily_cases["date"] = nyt_avg_daily_cases.index

In [None]:
# Discard the current index and go back to a default 0..n-1 row index
nyt_avg_daily_cases = nyt_avg_daily_cases.reset_index(drop=True)

In [None]:
# Preview the first 10 rows of the daily-cases frame.
nyt_avg_daily_cases.head(10)

### Set Up NYT Totals Master DF

In [None]:
# Inner-join the NYT rows with state populations on "state", then preview
nyt_master = nytimes_nih_covid_df.merge(state_pop_df, how="inner", on="state")
nyt_master.head()

In [None]:
# Create the "est_percent_infected" column (empty-string placeholder), then preview
nyt_master = nyt_master.assign(est_percent_infected="")
nyt_master.head()

In [None]:
# Calculation for est percent infected by state
# Vectorised replacement for the row-by-row loop over nyt_master.

# population is cast to integer, as the loop did with int(...)
population_by_row = nyt_master["population"].astype("int64")

# take out the number who have passed away to reset estimated population
est_population_new = population_by_row - nyt_master["deaths"]

# estimate the percent infected, rounded to 2 decimals
percent_infected = (nyt_master["cases"] / est_population_new * 100).round(2)

# rows with zero cases are set to 0 explicitly; insert into est_percent_infected column
nyt_master["est_percent_infected"] = percent_infected.where(nyt_master["cases"] != 0, 0)

In [None]:
# Preview the first rows of nyt_master.
nyt_master.head()

## JSON Exports

In [None]:
# Write state_master_df to JSON as a list of row records
state_master_df.to_json(path_or_buf="../data/state-master-data.json", orient="records")

In [None]:
# Write nyt_master to JSON as a list of row records
nyt_master.to_json(path_or_buf="../data/nyt-master.json", orient="records")

In [None]:
# Write the daily new-cases frame to JSON as a list of row records
nyt_avg_daily_cases.to_json(path_or_buf="../data/daily_new_cases.json", orient="records")

### CSV Exports

In [None]:
# Write state_master_df to CSV
state_master_df.to_csv(path_or_buf="../data/state-master-data.csv")

## Amazon RDS Updates

In [None]:
# Config Variables, and SQLalchemy
from config import endpoint, username, password
from sqlalchemy import create_engine

In [None]:
# Open a connection to the AWS-hosted PostgreSQL database
# NOTE(review): the user name "uscovid" is hard-coded although `username` is
# imported from config, and the password is interpolated without URL-escaping
# -- confirm both are intended.
db_url = f'postgresql://uscovid:{password}@{endpoint}/us_covid_db'
engine = create_engine(db_url)
connection = engine.connect()

In [None]:
# 50 States & D.C. data: replace the "master_table" table, without the index column
state_master_df.to_sql(
    name='master_table',
    con=connection,
    if_exists='replace',
    index=False,
)

In [None]:
# Daily cases data: replace the "daily_new_cases" table, without the index column
nyt_avg_daily_cases.to_sql(
    name="daily_new_cases",
    con=connection,
    if_exists='replace',
    index=False,
)

In [None]:
# Raw NYT data: replace the "nyt_table" table, keeping the index as a column
nytimes_nih_covid_df.to_sql(
    name='nyt_table',
    con=connection,
    if_exists='replace',
    index=True,
)