## State Population and Vaccination Data

In [1]:
# Import Pandas, Config Variables, and SQLalchemy
import pandas as pd
from config import endpoint, username, password
from sqlalchemy import create_engine

In [2]:
# Read CSVs for State Data
state_pop_df = pd.read_csv("../data/state_populations_cleaned.csv")
state_vacc_df = pd.read_csv("../data/state_covid_vaccinations_cleaned.csv")

In [3]:
# State Populations
state_pop_df.head()

Unnamed: 0,state,population
0,Alabama,4903185
1,Alaska,731545
2,Arizona,7278717
3,Arkansas,3017804
4,California,39512223


In [4]:
# State Vaccinations
state_vacc_df.head()

Unnamed: 0,state,total_distributed,total_administered,distributed_per_100K,administered_per_100K,people_with_1plus_doses,people_with_1plus_doses_per_100K,people_with_2_doses,people_with_2_doses_per_100K
0,Alaska,154325,71907,21096,9829,59449,8126,12352,1688
1,Alabama,493125,165919,10057,3384,144429,2946,21345,435
2,Arkansas,361550,173312,11981,5743,144792,4798,26541,879
3,Arizona,797550,303899,10957,4175,262120,3601,41613,572
4,California,4379500,1633875,11084,4135,1335886,3381,293834,744


In [5]:
# State Data Merged
state_merged_df = pd.merge(state_pop_df, state_vacc_df, how="left", on="state")
state_merged_df.head()

Unnamed: 0,state,population,total_distributed,total_administered,distributed_per_100K,administered_per_100K,people_with_1plus_doses,people_with_1plus_doses_per_100K,people_with_2_doses,people_with_2_doses_per_100K
0,Alabama,4903185,493125,165919,10057,3384,144429,2946,21345,435
1,Alaska,731545,154325,71907,21096,9829,59449,8126,12352,1688
2,Arizona,7278717,797550,303899,10957,4175,262120,3601,41613,572
3,Arkansas,3017804,361550,173312,11981,5743,144792,4798,26541,879
4,California,39512223,4379500,1633875,11084,4135,1335886,3381,293834,744


In [6]:
# Set State as the Index
state_info_df = state_merged_df.set_index("state") 
state_info_df.head()

Unnamed: 0_level_0,population,total_distributed,total_administered,distributed_per_100K,administered_per_100K,people_with_1plus_doses,people_with_1plus_doses_per_100K,people_with_2_doses,people_with_2_doses_per_100K
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama,4903185,493125,165919,10057,3384,144429,2946,21345,435
Alaska,731545,154325,71907,21096,9829,59449,8126,12352,1688
Arizona,7278717,797550,303899,10957,4175,262120,3601,41613,572
Arkansas,3017804,361550,173312,11981,5743,144792,4798,26541,879
California,39512223,4379500,1633875,11084,4135,1335886,3381,293834,744


In [7]:
# Add Empty Column for New Calculation
state_info_df["percent_vaccinated"] = ""
state_info_df.head(10)

Unnamed: 0_level_0,population,total_distributed,total_administered,distributed_per_100K,administered_per_100K,people_with_1plus_doses,people_with_1plus_doses_per_100K,people_with_2_doses,people_with_2_doses_per_100K,percent_vaccinated
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Alabama,4903185,493125,165919,10057,3384,144429,2946,21345,435,
Alaska,731545,154325,71907,21096,9829,59449,8126,12352,1688,
Arizona,7278717,797550,303899,10957,4175,262120,3601,41613,572,
Arkansas,3017804,361550,173312,11981,5743,144792,4798,26541,879,
California,39512223,4379500,1633875,11084,4135,1335886,3381,293834,744,
Colorado,5758736,657250,353194,11413,6133,296816,5154,54474,946,
Connecticut,3565287,435075,264707,12203,7425,230913,6477,28602,802,
Delaware,973764,106600,49369,10947,5070,41900,4303,7355,755,
District of Columbia,705749,75175,55108,10652,7808,43377,6146,11685,1656,
Florida,21477737,2546050,1239975,11854,5773,1125338,5240,111763,520,


In [8]:
# Calcuation for percent vaccinated by state
for index, row in state_info_df.iterrows():
    population = row["population"]
    vaccinated = row["total_administered"]
    percent_vaccinated = round((vaccinated / population) * 100, 2)
    
    state_info_df.at[index, "percent_vaccinated"] = percent_vaccinated

In [9]:
# Check the data types
state_info_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 51 entries, Alabama to Wyoming
Data columns (total 10 columns):
 #   Column                            Non-Null Count  Dtype 
---  ------                            --------------  ----- 
 0   population                        51 non-null     int64 
 1   total_distributed                 51 non-null     int64 
 2   total_administered                51 non-null     int64 
 3   distributed_per_100K              51 non-null     int64 
 4   administered_per_100K             51 non-null     int64 
 5   people_with_1plus_doses           51 non-null     int64 
 6   people_with_1plus_doses_per_100K  51 non-null     int64 
 7   people_with_2_doses               51 non-null     int64 
 8   people_with_2_doses_per_100K      51 non-null     int64 
 9   percent_vaccinated                51 non-null     object
dtypes: int64(9), object(1)
memory usage: 6.9+ KB


In [10]:
# Convert percent vaccinated into a float
state_info_df = state_info_df.astype({"percent_vaccinated": 'float64'})
state_info_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 51 entries, Alabama to Wyoming
Data columns (total 10 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   population                        51 non-null     int64  
 1   total_distributed                 51 non-null     int64  
 2   total_administered                51 non-null     int64  
 3   distributed_per_100K              51 non-null     int64  
 4   administered_per_100K             51 non-null     int64  
 5   people_with_1plus_doses           51 non-null     int64  
 6   people_with_1plus_doses_per_100K  51 non-null     int64  
 7   people_with_2_doses               51 non-null     int64  
 8   people_with_2_doses_per_100K      51 non-null     int64  
 9   percent_vaccinated                51 non-null     float64
dtypes: float64(1), int64(9)
memory usage: 6.9+ KB


In [11]:
state_info_df.head()

Unnamed: 0_level_0,population,total_distributed,total_administered,distributed_per_100K,administered_per_100K,people_with_1plus_doses,people_with_1plus_doses_per_100K,people_with_2_doses,people_with_2_doses_per_100K,percent_vaccinated
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Alabama,4903185,493125,165919,10057,3384,144429,2946,21345,435,3.38
Alaska,731545,154325,71907,21096,9829,59449,8126,12352,1688,9.83
Arizona,7278717,797550,303899,10957,4175,262120,3601,41613,572,4.18
Arkansas,3017804,361550,173312,11981,5743,144792,4798,26541,879,5.74
California,39512223,4379500,1633875,11084,4135,1335886,3381,293834,744,4.14


## NIH AWS Data / NYT

In [64]:
# Read NYT / NIH json 
nytimes_nih_covid_df = pd.read_json("../data/daily_covid_stats_by_state_nyt.json", lines=True)

In [65]:
# Get Data for Most Recent Provided Date
most_recent_date = "2021-01-28"
latest_nyt_stateData = nytimes_nih_covid_df[nytimes_nih_covid_df['date'] == f'{most_recent_date}']

In [66]:
latest_nyt_stateData.head()

Unnamed: 0,date,state,fips,cases,deaths
18219,2021-01-28,Alabama,1,452734,7340
18220,2021-01-28,Alaska,2,53524,253
18221,2021-01-28,Arizona,4,745976,12861
18222,2021-01-28,Arkansas,5,290856,4784
18223,2021-01-28,California,6,3272349,39527


In [67]:
# Merge with State Info DF that contains population and vaccinations
latest_data_merged = pd.merge(state_info_df, latest_nyt_stateData, on="state")
latest_data_merged.head()

Unnamed: 0,state,population,total_distributed,total_administered,distributed_per_100K,administered_per_100K,people_with_1plus_doses,people_with_1plus_doses_per_100K,people_with_2_doses,people_with_2_doses_per_100K,percent_vaccinated,date,fips,cases,deaths
0,Alabama,4903185,493125,165919,10057,3384,144429,2946,21345,435,3.38,2021-01-28,1,452734,7340
1,Alaska,731545,154325,71907,21096,9829,59449,8126,12352,1688,9.83,2021-01-28,2,53524,253
2,Arizona,7278717,797550,303899,10957,4175,262120,3601,41613,572,4.18,2021-01-28,4,745976,12861
3,Arkansas,3017804,361550,173312,11981,5743,144792,4798,26541,879,5.74,2021-01-28,5,290856,4784
4,California,39512223,4379500,1633875,11084,4135,1335886,3381,293834,744,4.14,2021-01-28,6,3272349,39527


In [68]:
# Create a Master Dataframe to start working with
master_df = latest_data_merged[['state', 'date', 'population', 'cases', 'deaths', 'total_distributed', 'total_administered', 'percent_vaccinated' ]]
master_df.head(10)

Unnamed: 0,state,date,population,cases,deaths,total_distributed,total_administered,percent_vaccinated
0,Alabama,2021-01-28,4903185,452734,7340,493125,165919,3.38
1,Alaska,2021-01-28,731545,53524,253,154325,71907,9.83
2,Arizona,2021-01-28,7278717,745976,12861,797550,303899,4.18
3,Arkansas,2021-01-28,3017804,290856,4784,361550,173312,5.74
4,California,2021-01-28,39512223,3272349,39527,4379500,1633875,4.14
5,Colorado,2021-01-28,5758736,394668,5670,657250,353194,6.13
6,Connecticut,2021-01-28,3565287,248765,7020,435075,264707,7.42
7,Delaware,2021-01-28,973764,76495,1075,106600,49369,5.07
8,District of Columbia,2021-01-28,705749,36132,902,75175,55108,7.81
9,Florida,2021-01-28,21477737,1687586,26034,2546050,1239975,5.77


In [69]:
# Create empty column to calculate estimated percent infected to date
master_df["est_percent_infected_to_date"] = ""
master_df.head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,state,date,population,cases,deaths,total_distributed,total_administered,percent_vaccinated,est_percent_infected_to_date
0,Alabama,2021-01-28,4903185,452734,7340,493125,165919,3.38,
1,Alaska,2021-01-28,731545,53524,253,154325,71907,9.83,
2,Arizona,2021-01-28,7278717,745976,12861,797550,303899,4.18,
3,Arkansas,2021-01-28,3017804,290856,4784,361550,173312,5.74,
4,California,2021-01-28,39512223,3272349,39527,4379500,1633875,4.14,
5,Colorado,2021-01-28,5758736,394668,5670,657250,353194,6.13,
6,Connecticut,2021-01-28,3565287,248765,7020,435075,264707,7.42,
7,Delaware,2021-01-28,973764,76495,1075,106600,49369,5.07,
8,District of Columbia,2021-01-28,705749,36132,902,75175,55108,7.81,
9,Florida,2021-01-28,21477737,1687586,26034,2546050,1239975,5.77,


In [70]:
# Calcuation for est percent infected by state
for index, row in master_df.iterrows():
    population = row["population"]
    deaths = row["deaths"]
    infected = row["cases"]
    
    # take out the number who have passed away to reset estimated population
    est_population_new = population - deaths 
    
    # estimate the percent infected
    percent_infected = round((infected / est_population_new) * 100, 2)
    
    # insert into percent_infected column
    master_df.at[index, "est_percent_infected_to_date"] = percent_infected

In [71]:
master_df.head(10)

Unnamed: 0,state,date,population,cases,deaths,total_distributed,total_administered,percent_vaccinated,est_percent_infected_to_date
0,Alabama,2021-01-28,4903185,452734,7340,493125,165919,3.38,9.25
1,Alaska,2021-01-28,731545,53524,253,154325,71907,9.83,7.32
2,Arizona,2021-01-28,7278717,745976,12861,797550,303899,4.18,10.27
3,Arkansas,2021-01-28,3017804,290856,4784,361550,173312,5.74,9.65
4,California,2021-01-28,39512223,3272349,39527,4379500,1633875,4.14,8.29
5,Colorado,2021-01-28,5758736,394668,5670,657250,353194,6.13,6.86
6,Connecticut,2021-01-28,3565287,248765,7020,435075,264707,7.42,6.99
7,Delaware,2021-01-28,973764,76495,1075,106600,49369,5.07,7.86
8,District of Columbia,2021-01-28,705749,36132,902,75175,55108,7.81,5.13
9,Florida,2021-01-28,21477737,1687586,26034,2546050,1239975,5.77,7.87


In [72]:
# Add Estimated Percent Immune Column
master_df["est_percent_immune"] = ""
master_df.head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,state,date,population,cases,deaths,total_distributed,total_administered,percent_vaccinated,est_percent_infected_to_date,est_percent_immune
0,Alabama,2021-01-28,4903185,452734,7340,493125,165919,3.38,9.25,
1,Alaska,2021-01-28,731545,53524,253,154325,71907,9.83,7.32,
2,Arizona,2021-01-28,7278717,745976,12861,797550,303899,4.18,10.27,
3,Arkansas,2021-01-28,3017804,290856,4784,361550,173312,5.74,9.65,
4,California,2021-01-28,39512223,3272349,39527,4379500,1633875,4.14,8.29,
5,Colorado,2021-01-28,5758736,394668,5670,657250,353194,6.13,6.86,
6,Connecticut,2021-01-28,3565287,248765,7020,435075,264707,7.42,6.99,
7,Delaware,2021-01-28,973764,76495,1075,106600,49369,5.07,7.86,
8,District of Columbia,2021-01-28,705749,36132,902,75175,55108,7.81,5.13,
9,Florida,2021-01-28,21477737,1687586,26034,2546050,1239975,5.77,7.87,


In [73]:
# Calcuation for estimated immune by state
for index, row in master_df.iterrows():
    population = row["population"]
    infected = row["cases"]
    deaths = row["deaths"]
    vaccinated = row["total_administered"]
    
    est_total_immune = infected + vaccinated
    est_population = population - deaths
    
    percent_infected = round((est_total_immune / est_population) * 100, 2)
    
    master_df.at[index, "est_percent_immune"] = percent_infected

In [74]:
master_df.head(10)

Unnamed: 0,state,date,population,cases,deaths,total_distributed,total_administered,percent_vaccinated,est_percent_infected_to_date,est_percent_immune
0,Alabama,2021-01-28,4903185,452734,7340,493125,165919,3.38,9.25,12.64
1,Alaska,2021-01-28,731545,53524,253,154325,71907,9.83,7.32,17.15
2,Arizona,2021-01-28,7278717,745976,12861,797550,303899,4.18,10.27,14.45
3,Arkansas,2021-01-28,3017804,290856,4784,361550,173312,5.74,9.65,15.41
4,California,2021-01-28,39512223,3272349,39527,4379500,1633875,4.14,8.29,12.43
5,Colorado,2021-01-28,5758736,394668,5670,657250,353194,6.13,6.86,13.0
6,Connecticut,2021-01-28,3565287,248765,7020,435075,264707,7.42,6.99,14.43
7,Delaware,2021-01-28,973764,76495,1075,106600,49369,5.07,7.86,12.94
8,District of Columbia,2021-01-28,705749,36132,902,75175,55108,7.81,5.13,12.94
9,Florida,2021-01-28,21477737,1687586,26034,2546050,1239975,5.77,7.87,13.65


In [112]:
nyt_avg_daily_cases = nytimes_nih_covid_df
nyt_avg_daily_cases.head()

Unnamed: 0,date,state,fips,cases,deaths
0,2020-01-21,Washington,53,1,0
1,2020-01-22,Washington,53,1,0
2,2020-01-23,Washington,53,1,0
3,2020-01-24,Illinois,17,1,0
4,2020-01-24,Washington,53,1,0


In [113]:
nyt_avg_daily_cases = nyt_avg_daily_cases.groupby("date").sum()
nyt_avg_daily_cases.head()

Unnamed: 0_level_0,fips,cases,deaths
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-01-21,53,1,0
2020-01-22,53,1,0
2020-01-23,53,1,0
2020-01-24,70,2,0
2020-01-25,76,3,0


In [114]:
nyt_avg_daily_cases["daily_new_cases"] = ""

In [116]:
nyt_avg_daily_cases.head()

Unnamed: 0_level_0,fips,cases,deaths,daily_new_cases
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-01-21,53,1,0,
2020-01-22,53,1,0,
2020-01-23,53,1,0,
2020-01-24,70,2,0,
2020-01-25,76,3,0,


In [120]:
previous_day = 0

In [121]:
for index, row in nyt_avg_daily_cases.iterrows():
    new_cases = row["cases"] - previous_day
    previous_day = row["cases"]
    nyt_avg_daily_cases.at[index, "daily_new_cases"] = new_cases

In [122]:
nyt_avg_daily_cases["date"] = ""

In [123]:
nyt_avg_daily_cases

Unnamed: 0_level_0,fips,cases,deaths,daily_new_cases,date
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-01-21,53,1,0,1,
2020-01-22,53,1,0,0,
2020-01-23,53,1,0,0,
2020-01-24,70,2,0,1,
2020-01-25,76,3,0,1,
...,...,...,...,...,...
2021-01-24,1762,25177522,419207,129527,
2021-01-25,1762,25333204,421114,155682,
2021-01-26,1762,25484820,425211,151616,
2021-01-27,1762,25640449,429312,155629,


In [127]:
for index, row in nyt_avg_daily_cases.iterrows():
    date = index.date()
    print(date)
    
    nyt_avg_daily_cases.at[index, "date"] = date

2020-01-21
2020-01-22
2020-01-23
2020-01-24
2020-01-25
2020-01-26
2020-01-27
2020-01-28
2020-01-29
2020-01-30
2020-01-31
2020-02-01
2020-02-02
2020-02-03
2020-02-04
2020-02-05
2020-02-06
2020-02-07
2020-02-08
2020-02-09
2020-02-10
2020-02-11
2020-02-12
2020-02-13
2020-02-14
2020-02-15
2020-02-16
2020-02-17
2020-02-18
2020-02-19
2020-02-20
2020-02-21
2020-02-22
2020-02-23
2020-02-24
2020-02-25
2020-02-26
2020-02-27
2020-02-28
2020-02-29
2020-03-01
2020-03-02
2020-03-03
2020-03-04
2020-03-05
2020-03-06
2020-03-07
2020-03-08
2020-03-09
2020-03-10
2020-03-11
2020-03-12
2020-03-13
2020-03-14
2020-03-15
2020-03-16
2020-03-17
2020-03-18
2020-03-19
2020-03-20
2020-03-21
2020-03-22
2020-03-23
2020-03-24
2020-03-25
2020-03-26
2020-03-27
2020-03-28
2020-03-29
2020-03-30
2020-03-31
2020-04-01
2020-04-02
2020-04-03
2020-04-04
2020-04-05
2020-04-06
2020-04-07
2020-04-08
2020-04-09
2020-04-10
2020-04-11
2020-04-12
2020-04-13
2020-04-14
2020-04-15
2020-04-16
2020-04-17
2020-04-18
2020-04-19
2020-04-20

In [130]:
nyt_avg_daily_cases = nyt_avg_daily_cases.reset_index()

ValueError: cannot insert date, already exists

In [132]:
nyt_avg_daily_cases.reset_index(drop=True, inplace=True)

In [133]:
nyt_avg_daily_cases

Unnamed: 0,fips,cases,deaths,daily_new_cases,date
0,53,1,0,1,2020-01-21
1,53,1,0,0,2020-01-22
2,53,1,0,0,2020-01-23
3,70,2,0,1,2020-01-24
4,76,3,0,1,2020-01-25
...,...,...,...,...,...
369,1762,25177522,419207,129527,2021-01-24
370,1762,25333204,421114,155682,2021-01-25
371,1762,25484820,425211,151616,2021-01-26
372,1762,25640449,429312,155629,2021-01-27


In [134]:
nyt_avg_daily_cases.to_json("../data/daily_new_cases.json", orient="records")

In [None]:
nyt_master = pd.merge(nyt_master, state_pop_df, how ="inner", on="state")
nyt_master

In [None]:
nyt_master["est_percent_infected"] = ""
nyt_master

In [None]:
# nyt_master = nyt_master.groupby(["state", "date"]).sum()

In [None]:
# nyt_master = nyt_master.fillna(0)

In [None]:
nyt_master[nyt_master["population"] == 0]

In [None]:
# Calcuation for est percent infected by state
for index, row in nyt_master.iterrows():
    population = int(row["population"])
    deaths = row["deaths"]
    infected = row["cases"]
    
    # take out the number who have passed away to reset estimated population
    est_population_new = population - deaths 
    
    # estimate the percent infected
    if infected == 0:
        percent_infected = 0
    else: 
        percent_infected = round((infected / est_population_new) * 100, 2)
    
    # insert into percent_infected column
    nyt_master.at[index, "est_percent_infected"] = percent_infected

In [None]:
nyt_master

In [None]:
nyt_master.to_json("../data/nyt-master.json", orient="records")

## Upload data to AWS RDS

In [None]:
# Connect to AWS Database instance 
engine = create_engine(f'postgresql://uscovid:{password}@{endpoint}/us_covid_db')
connection = engine.connect()

In [None]:
# Add to AWS RDS (PostgreSQL)
master_df.to_sql('master_table', index=False, if_exists='replace', con=connection)

In [None]:
master_df.info()

In [None]:
# Export as json
master_df.to_json("../data/master-data.json", orient="records")

In [None]:
# Add to AWS RDS (PostgreSQL)
nytimes_nih_covid_df.to_sql('nyt_table', index=True, if_exists='replace', con=connection)