# Data Retrieval

In [10]:
import awswrangler as wr
import requests
import csv

In [11]:
requests.get("https://covid19-lake.s3.us-east-2.amazonaws.com/rearc-covid-19-nyt-data-in-usa/csv/us-states/us-states.csv")

<Response [200]>

In [13]:
with open("us-covid-data.csv",  "wb") as file:
    file.write(pdf.content)

TypeError: write() takes exactly one argument (0 given)

In [None]:
bucket_name = s3.list_objects(Bucket='covid19-lake')

In [None]:
objects = s3.list_objects(Bucket=bucket_name)

In [None]:
# Import Pandas, Config Variables, and SQLalchemy
import pandas as pd
from config import endpoint, username, password
from sqlalchemy import create_engine

In [None]:
# Read CSVs for State Data
state_pop_df = pd.read_csv("../data/state_populations_cleaned.csv")
state_vacc_df = pd.read_csv("../data/state_covid_vaccinations_cleaned.csv")
state_latlons_df = pd.read_csv("../data/statelatlong.csv")

In [None]:
# State Populations
state_pop_df.head()

In [None]:
# State Vaccinations
state_vacc_df.head()

In [None]:
state_latlons_df.head()

In [None]:
state_latlons_df = state_latlons_df.rename(columns={"State": "Abb", "City": "state"})

In [None]:
state_latlons_df.head()

In [None]:
# State Data Merged
state_merged_df = pd.merge(state_pop_df, state_vacc_df, how="inner", on="state")
state_merged_df.head()

In [None]:
state_merged_df = pd.merge(state_merged_df, state_latlons_df, how="left", on="state")

In [None]:
state_merged_df.head()

In [None]:
# Set State as the Index
state_info_df = state_merged_df.set_index("state") 
state_info_df.head()

In [None]:
# Add Empty Column for New Calculation
state_info_df["percent_vaccinated"] = ""
state_info_df.head(10)

In [None]:
# Calcuation for percent vaccinated by state
for index, row in state_info_df.iterrows():
    population = row["population"]
    vaccinated = row["total_administered"]
    percent_vaccinated = round((vaccinated / population) * 100, 2)
    
    state_info_df.at[index, "percent_vaccinated"] = percent_vaccinated

In [None]:
# Check the data types
state_info_df.info()

In [None]:
# Convert percent vaccinated into a float
state_info_df = state_info_df.astype({"percent_vaccinated": 'float64'})
state_info_df.info()

In [None]:
state_info_df.head(10)

## NIH AWS Data / NYT

### Set up State Master DF

In [None]:
# Read NYT / NIH json 
nytimes_nih_covid_df = pd.read_json("../data/daily_covid_stats_by_state_nyt.json", lines=True)

In [None]:
# Get Data for Most Recent Provided Date
most_recent_date = "2021-02-22"
latest_nyt_stateData = nytimes_nih_covid_df[nytimes_nih_covid_df['date'] == f'{most_recent_date}']

In [None]:
latest_nyt_stateData.head(10)

In [None]:
import datetime as dt

for index, row in nytimes_nih_covid_df.iterrows():
    date = row["date"].strftime('%Y-%m-%d')
    nytimes_nih_covid_df.at[index, "date"] = date

In [None]:
nytimes_nih_covid_df.info()

In [None]:
# Merge with State Info DF that contains population and vaccinations
latest_data_merged = pd.merge(state_info_df, latest_nyt_stateData, on="state")
latest_data_merged.head(10)

In [None]:
# Create a Master Dataframe to start working with
state_master_df = latest_data_merged[['state', 'Abb', 'date', 'population', 'Latitude', 'Longitude', 'cases', 'deaths', 'total_distributed', 'total_administered', 'percent_vaccinated' ]]
state_master_df.head(10)

In [None]:
# Create empty column to calculate estimated percent infected to date
state_master_df["est_percent_infected_to_date"] = ""
state_master_df.head(10)

In [None]:
# Calcuation for est percent infected by state
for index, row in state_master_df.iterrows():
    population = row["population"]
    deaths = row["deaths"]
    infected = row["cases"]
    
    # take out the number who have passed away to reset estimated population
    est_population_new = population - deaths 
    
    # estimate the percent infected
    percent_infected = round((infected / est_population_new) * 100, 2)
    
    # insert into percent_infected column
    state_master_df.at[index, "est_percent_infected_to_date"] = percent_infected

In [None]:
state_master_df.head(10)

In [None]:
# Add Estimated Percent Immune Column
state_master_df["est_percent_immune"] = ""
state_master_df.head(10)

In [None]:
# Calcuation for estimated immune by state
for index, row in state_master_df.iterrows():
    population = row["population"]
    infected = row["cases"]
    deaths = row["deaths"]
    vaccinated = row["total_administered"]
    
    est_total_immune = infected + vaccinated
    est_population = population - deaths
    
    percent_immune = round((est_total_immune / est_population) * 100, 2)
    
    state_master_df.at[index, "est_percent_immune"] = percent_immune

In [None]:
state_master_df.head(10)

### Set up Daily Cases DF

In [None]:
nyt_avg_daily_cases = nytimes_nih_covid_df
nyt_avg_daily_cases.head()

In [None]:
nyt_avg_daily_cases = nyt_avg_daily_cases.groupby("date").sum()
nyt_avg_daily_cases.head()

In [None]:
nyt_avg_daily_cases["daily_new_cases"] = ""

In [None]:
nyt_avg_daily_cases.head()

In [None]:
previous_day = 0

In [None]:
for index, row in nyt_avg_daily_cases.iterrows():
    new_cases = row["cases"] - previous_day
    previous_day = row["cases"]
    nyt_avg_daily_cases.at[index, "daily_new_cases"] = new_cases

In [None]:
nyt_avg_daily_cases["date"] = ""

In [None]:
nyt_avg_daily_cases

In [None]:
for index, row in nyt_avg_daily_cases.iterrows():
    date = index
    nyt_avg_daily_cases.at[index, "date"] = date

In [None]:
nyt_avg_daily_cases.reset_index(drop=True, inplace=True)

In [None]:
nyt_avg_daily_cases.head(10)

### Set Up NYT Totals Master DF

In [None]:
nyt_master = pd.merge(nytimes_nih_covid_df, state_pop_df, how ="inner", on="state")
nyt_master.head()

In [None]:
nyt_master["est_percent_infected"] = ""
nyt_master.head()

In [None]:
# Calcuation for est percent infected by state
for index, row in nyt_master.iterrows():
    population = int(row["population"])
    deaths = row["deaths"]
    infected = row["cases"]
    
    # take out the number who have passed away to reset estimated population
    est_population_new = population - deaths 
    
    # estimate the percent infected
    if infected == 0:
        percent_infected = 0
    else: 
        percent_infected = round((infected / est_population_new) * 100, 2)
    
    # insert into percent_infected column
    nyt_master.at[index, "est_percent_infected"] = percent_infected

In [None]:
nyt_master.head()

## JSON Exports

In [None]:
# Master DF to JSON
state_master_df.to_json("../data/state-master-data.json", orient="records")

In [None]:
# NYT Master to JSON 
nyt_master.to_json("../data/nyt-master.json", orient="records")

In [None]:
nyt_avg_daily_cases.to_json("../data/daily_new_cases.json", orient="records")

### CSV Exports

In [None]:
state_master_df.to_csv("../data/state-master-data.csv")

## Amazon RDS Updates

In [None]:
# Connect to AWS Database instance 
engine = create_engine(f'postgresql://uscovid:{password}@{endpoint}/us_covid_db')
connection = engine.connect()

In [None]:
# 50 States & D.C. Data
state_master_df.to_sql('master_table', index=False, if_exists='replace', con=connection)

In [None]:
# Daily Cases Data
nyt_avg_daily_cases.to_sql("daily_new_cases", index=False, if_exists='replace', con=connection)

In [None]:
# Raw NTY Data
nytimes_nih_covid_df.to_sql('nyt_table', index=True, if_exists='replace', con=connection)