In [1]:
# Import Pandas
import pandas as pd
import psycopg2
from config import endpoint, username, password
from sqlalchemy import create_engine

In [2]:
# Read CSVs for State Data
state_pop_df = pd.read_csv("data/state_populations_cleaned.csv")
state_vacc_df = pd.read_csv("data/state_covid_vaccinations_cleaned.csv")

In [3]:
# State Populations
state_pop_df.head()

Unnamed: 0,state,population
0,Alabama,4903185
1,Alaska,731545
2,Arizona,7278717
3,Arkansas,3017804
4,California,39512223


In [4]:
# State Vaccinations
state_vacc_df.head()

Unnamed: 0,state,total_distributed,total_administered,distributed_per_100K,administered_per_100K,people_with_1plus_doses,people_with_1plus_doses_per_100K,people_with_2_doses,people_with_2_doses_per_100K
0,Alaska,154325,71907,21096,9829,59449,8126,12352,1688
1,Alabama,493125,165919,10057,3384,144429,2946,21345,435
2,Arkansas,361550,173312,11981,5743,144792,4798,26541,879
3,Arizona,797550,303899,10957,4175,262120,3601,41613,572
4,California,4379500,1633875,11084,4135,1335886,3381,293834,744


In [5]:
# State Data Merged
state_merged_df = pd.merge(state_pop_df, state_vacc_df, how="left", on="state")
state_merged_df.head()

Unnamed: 0,state,population,total_distributed,total_administered,distributed_per_100K,administered_per_100K,people_with_1plus_doses,people_with_1plus_doses_per_100K,people_with_2_doses,people_with_2_doses_per_100K
0,Alabama,4903185,493125,165919,10057,3384,144429,2946,21345,435
1,Alaska,731545,154325,71907,21096,9829,59449,8126,12352,1688
2,Arizona,7278717,797550,303899,10957,4175,262120,3601,41613,572
3,Arkansas,3017804,361550,173312,11981,5743,144792,4798,26541,879
4,California,39512223,4379500,1633875,11084,4135,1335886,3381,293834,744


In [6]:
# Set State as the Index
state_info_df = state_merged_df.set_index("state") 
state_info_df.head()

Unnamed: 0_level_0,population,total_distributed,total_administered,distributed_per_100K,administered_per_100K,people_with_1plus_doses,people_with_1plus_doses_per_100K,people_with_2_doses,people_with_2_doses_per_100K
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama,4903185,493125,165919,10057,3384,144429,2946,21345,435
Alaska,731545,154325,71907,21096,9829,59449,8126,12352,1688
Arizona,7278717,797550,303899,10957,4175,262120,3601,41613,572
Arkansas,3017804,361550,173312,11981,5743,144792,4798,26541,879
California,39512223,4379500,1633875,11084,4135,1335886,3381,293834,744


In [7]:
# Add Empty Column for New Calculation
state_info_df["percent_vaccinated"] = ""
state_info_df.head(10)

Unnamed: 0_level_0,population,total_distributed,total_administered,distributed_per_100K,administered_per_100K,people_with_1plus_doses,people_with_1plus_doses_per_100K,people_with_2_doses,people_with_2_doses_per_100K,percent_vaccinated
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Alabama,4903185,493125,165919,10057,3384,144429,2946,21345,435,
Alaska,731545,154325,71907,21096,9829,59449,8126,12352,1688,
Arizona,7278717,797550,303899,10957,4175,262120,3601,41613,572,
Arkansas,3017804,361550,173312,11981,5743,144792,4798,26541,879,
California,39512223,4379500,1633875,11084,4135,1335886,3381,293834,744,
Colorado,5758736,657250,353194,11413,6133,296816,5154,54474,946,
Connecticut,3565287,435075,264707,12203,7425,230913,6477,28602,802,
Delaware,973764,106600,49369,10947,5070,41900,4303,7355,755,
District of Columbia,705749,75175,55108,10652,7808,43377,6146,11685,1656,
Florida,21477737,2546050,1239975,11854,5773,1125338,5240,111763,520,


In [9]:
# Calcuation for percent vaccinated by state
for index, row in state_info_df.iterrows():
    population = row["population"]
    vaccinated = row["total_administered"]
    percent_vaccinated = round((vaccinated / population) * 100, 2)
    
    state_info_df.at[index, "percent_vaccinated"] = percent_vaccinated

In [10]:
# Check the data types
state_info_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 51 entries, Alabama to Wyoming
Data columns (total 10 columns):
 #   Column                            Non-Null Count  Dtype 
---  ------                            --------------  ----- 
 0   population                        51 non-null     int64 
 1   total_distributed                 51 non-null     int64 
 2   total_administered                51 non-null     int64 
 3   distributed_per_100K              51 non-null     int64 
 4   administered_per_100K             51 non-null     int64 
 5   people_with_1plus_doses           51 non-null     int64 
 6   people_with_1plus_doses_per_100K  51 non-null     int64 
 7   people_with_2_doses               51 non-null     int64 
 8   people_with_2_doses_per_100K      51 non-null     int64 
 9   percent_vaccinated                51 non-null     object
dtypes: int64(9), object(1)
memory usage: 6.9+ KB


In [12]:
# Convert percent vaccinated into a float
state_info_df = state_info_df.astype({"percent_vaccinated": 'float64'})
state_info_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 51 entries, Alabama to Wyoming
Data columns (total 10 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   population                        51 non-null     int64  
 1   total_distributed                 51 non-null     int64  
 2   total_administered                51 non-null     int64  
 3   distributed_per_100K              51 non-null     int64  
 4   administered_per_100K             51 non-null     int64  
 5   people_with_1plus_doses           51 non-null     int64  
 6   people_with_1plus_doses_per_100K  51 non-null     int64  
 7   people_with_2_doses               51 non-null     int64  
 8   people_with_2_doses_per_100K      51 non-null     int64  
 9   percent_vaccinated                51 non-null     float64
dtypes: float64(1), int64(9)
memory usage: 4.4+ KB


In [13]:
# Export as json
state_info_df.to_json("data/state_data.json")

In [14]:
# Connect to AWS Database instance 
engine = create_engine(f'postgresql://uscovid:{password}@{endpoint}/us_covid_db')
connection = engine.connect()

In [15]:
# Add to Database and replace if the table already exists
state_info_df.to_sql('master_table', index=True, if_exists='replace', con=connection)

## NIH AWS Data / NYT

In [22]:
# Read NYT / NIH json 
nytimes_nih_covid_df = pd.read_json("data/daily_covid_stats_by_state_nyt.json", lines=True)

In [40]:
# Add data to database 
nytimes_nih_covid_df.to_sql('nyt_table', index=False, if_exists='replace', con=connection)