## State Population and Vaccination Data

In [1]:
# Import Pandas, Config Variables, and SQLalchemy
import pandas as pd
from config import endpoint, username, password
from sqlalchemy import create_engine

In [2]:
# Read CSVs for State Data
state_pop_df = pd.read_csv("../data/state_populations_cleaned.csv")
state_vacc_df = pd.read_csv("../data/state_covid_vaccinations_cleaned.csv")
state_latlons_df = pd.read_csv("../data/statelatlong.csv")

In [3]:
# State Populations
state_pop_df.head()

Unnamed: 0,state,population
0,Alabama,4903185
1,Alaska,731545
2,Arizona,7278717
3,Arkansas,3017804
4,California,39512223


In [4]:
# State Vaccinations
state_vacc_df.head()

Unnamed: 0,state,total_distributed,total_administered,distributed_per_100K,administered_per_100K,people_with_1plus_doses,people_with_1plus_doses_per_100K,people_with_2_doses,people_with_2_doses_per_100K
0,Alaska,256900,152874,35117.0,20897.0,110936,15165,41698,5700
1,Alabama,823600,490969,16797.0,10013.0,395196,8060,95033,1938
2,Arkansas,580775,404127,19245.0,13391.0,305917,10137,95202,3155
3,American Samoa,29450,12680,52883.0,22769.0,9792,17583,2885,5181
4,Arizona,1265950,863807,17392.0,11868.0,695970,9562,166798,2292


In [5]:
state_latlons_df.head()

Unnamed: 0,State,Latitude,Longitude,City
0,AL,32.601011,-86.680736,Alabama
1,AK,61.302501,-158.77502,Alaska
2,AZ,34.168219,-111.930907,Arizona
3,AR,34.751928,-92.131378,Arkansas
4,CA,37.271875,-119.270415,California


In [6]:
state_latlons_df = state_latlons_df.rename(columns={"State": "Abb", "City": "state"})

In [7]:
state_latlons_df

Unnamed: 0,Abb,Latitude,Longitude,state
0,AL,32.601011,-86.680736,Alabama
1,AK,61.302501,-158.77502,Alaska
2,AZ,34.168219,-111.930907,Arizona
3,AR,34.751928,-92.131378,Arkansas
4,CA,37.271875,-119.270415,California
5,CO,38.997934,-105.550567,Colorado
6,CT,41.518784,-72.757507,Connecticut
7,DE,39.145251,-75.418921,Delaware
8,DC,38.899349,-77.014567,District of Columbia
9,FL,27.975728,-83.833017,Florida


In [8]:
# State Data Merged
state_merged_df = pd.merge(state_pop_df, state_vacc_df, how="inner", on="state")
state_merged_df.head()

Unnamed: 0,state,population,total_distributed,total_administered,distributed_per_100K,administered_per_100K,people_with_1plus_doses,people_with_1plus_doses_per_100K,people_with_2_doses,people_with_2_doses_per_100K
0,Alabama,4903185,823600,490969,16797.0,10013.0,395196,8060,95033,1938
1,Alaska,731545,256900,152874,35117.0,20897.0,110936,15165,41698,5700
2,Arizona,7278717,1265950,863807,17392.0,11868.0,695970,9562,166798,2292
3,Arkansas,3017804,580775,404127,19245.0,13391.0,305917,10137,95202,3155
4,California,39512223,7385225,4784478,18691.0,12109.0,3880253,9820,875340,2215


In [9]:
state_merged_df = pd.merge(state_merged_df, state_latlons_df, how="left", on="state")

In [10]:
state_merged_df

Unnamed: 0,state,population,total_distributed,total_administered,distributed_per_100K,administered_per_100K,people_with_1plus_doses,people_with_1plus_doses_per_100K,people_with_2_doses,people_with_2_doses_per_100K,Abb,Latitude,Longitude
0,Alabama,4903185,823600,490969,16797.0,10013.0,395196,8060,95033,1938,AL,32.601011,-86.680736
1,Alaska,731545,256900,152874,35117.0,20897.0,110936,15165,41698,5700,AK,61.302501,-158.77502
2,Arizona,7278717,1265950,863807,17392.0,11868.0,695970,9562,166798,2292,AZ,34.168219,-111.930907
3,Arkansas,3017804,580775,404127,19245.0,13391.0,305917,10137,95202,3155,AR,34.751928,-92.131378
4,California,39512223,7385225,4784478,18691.0,12109.0,3880253,9820,875340,2215,CA,37.271875,-119.270415
5,Colorado,5758736,1045000,757560,18146.0,13155.0,542165,9415,211328,3670,CO,38.997934,-105.550567
6,Connecticut,3565287,800225,554322,22445.0,15548.0,412601,11573,133196,3736,CT,41.518784,-72.757507
7,Delaware,973764,162525,127523,16690.0,13096.0,101383,10411,25405,2609,DE,39.145251,-75.418921
8,District of Columbia,705749,166950,108475,23656.0,15370.0,75351,10677,26957,3820,DC,38.899349,-77.014567
9,Florida,21477737,4217275,2731300,19636.0,12717.0,2023419,9421,694637,3234,FL,27.975728,-83.833017


In [11]:
# Set State as the Index
state_info_df = state_merged_df.set_index("state") 
state_info_df.head()

Unnamed: 0_level_0,population,total_distributed,total_administered,distributed_per_100K,administered_per_100K,people_with_1plus_doses,people_with_1plus_doses_per_100K,people_with_2_doses,people_with_2_doses_per_100K,Abb,Latitude,Longitude
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Alabama,4903185,823600,490969,16797.0,10013.0,395196,8060,95033,1938,AL,32.601011,-86.680736
Alaska,731545,256900,152874,35117.0,20897.0,110936,15165,41698,5700,AK,61.302501,-158.77502
Arizona,7278717,1265950,863807,17392.0,11868.0,695970,9562,166798,2292,AZ,34.168219,-111.930907
Arkansas,3017804,580775,404127,19245.0,13391.0,305917,10137,95202,3155,AR,34.751928,-92.131378
California,39512223,7385225,4784478,18691.0,12109.0,3880253,9820,875340,2215,CA,37.271875,-119.270415


In [12]:
# Add Empty Column for New Calculation
state_info_df["percent_vaccinated"] = ""
state_info_df.head(10)

Unnamed: 0_level_0,population,total_distributed,total_administered,distributed_per_100K,administered_per_100K,people_with_1plus_doses,people_with_1plus_doses_per_100K,people_with_2_doses,people_with_2_doses_per_100K,Abb,Latitude,Longitude,percent_vaccinated
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Alabama,4903185,823600,490969,16797.0,10013.0,395196,8060,95033,1938,AL,32.601011,-86.680736,
Alaska,731545,256900,152874,35117.0,20897.0,110936,15165,41698,5700,AK,61.302501,-158.77502,
Arizona,7278717,1265950,863807,17392.0,11868.0,695970,9562,166798,2292,AZ,34.168219,-111.930907,
Arkansas,3017804,580775,404127,19245.0,13391.0,305917,10137,95202,3155,AR,34.751928,-92.131378,
California,39512223,7385225,4784478,18691.0,12109.0,3880253,9820,875340,2215,CA,37.271875,-119.270415,
Colorado,5758736,1045000,757560,18146.0,13155.0,542165,9415,211328,3670,CO,38.997934,-105.550567,
Connecticut,3565287,800225,554322,22445.0,15548.0,412601,11573,133196,3736,CT,41.518784,-72.757507,
Delaware,973764,162525,127523,16690.0,13096.0,101383,10411,25405,2609,DE,39.145251,-75.418921,
District of Columbia,705749,166950,108475,23656.0,15370.0,75351,10677,26957,3820,DC,38.899349,-77.014567,
Florida,21477737,4217275,2731300,19636.0,12717.0,2023419,9421,694637,3234,FL,27.975728,-83.833017,


In [13]:
# Calcuation for percent vaccinated by state
for index, row in state_info_df.iterrows():
    population = row["population"]
    vaccinated = row["total_administered"]
    percent_vaccinated = round((vaccinated / population) * 100, 2)
    
    state_info_df.at[index, "percent_vaccinated"] = percent_vaccinated

In [14]:
# Check the data types
state_info_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 51 entries, Alabama to Wyoming
Data columns (total 13 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   population                        51 non-null     int64  
 1   total_distributed                 51 non-null     int64  
 2   total_administered                51 non-null     int64  
 3   distributed_per_100K              51 non-null     float64
 4   administered_per_100K             51 non-null     float64
 5   people_with_1plus_doses           51 non-null     int64  
 6   people_with_1plus_doses_per_100K  51 non-null     int64  
 7   people_with_2_doses               51 non-null     int64  
 8   people_with_2_doses_per_100K      51 non-null     int64  
 9   Abb                               51 non-null     object 
 10  Latitude                          51 non-null     float64
 11  Longitude                         51 non-null     float64
 12  perc

In [15]:
# Convert percent vaccinated into a float
state_info_df = state_info_df.astype({"percent_vaccinated": 'float64'})
state_info_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 51 entries, Alabama to Wyoming
Data columns (total 13 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   population                        51 non-null     int64  
 1   total_distributed                 51 non-null     int64  
 2   total_administered                51 non-null     int64  
 3   distributed_per_100K              51 non-null     float64
 4   administered_per_100K             51 non-null     float64
 5   people_with_1plus_doses           51 non-null     int64  
 6   people_with_1plus_doses_per_100K  51 non-null     int64  
 7   people_with_2_doses               51 non-null     int64  
 8   people_with_2_doses_per_100K      51 non-null     int64  
 9   Abb                               51 non-null     object 
 10  Latitude                          51 non-null     float64
 11  Longitude                         51 non-null     float64
 12  perc

In [16]:
state_info_df.head(10)

Unnamed: 0_level_0,population,total_distributed,total_administered,distributed_per_100K,administered_per_100K,people_with_1plus_doses,people_with_1plus_doses_per_100K,people_with_2_doses,people_with_2_doses_per_100K,Abb,Latitude,Longitude,percent_vaccinated
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Alabama,4903185,823600,490969,16797.0,10013.0,395196,8060,95033,1938,AL,32.601011,-86.680736,10.01
Alaska,731545,256900,152874,35117.0,20897.0,110936,15165,41698,5700,AK,61.302501,-158.77502,20.9
Arizona,7278717,1265950,863807,17392.0,11868.0,695970,9562,166798,2292,AZ,34.168219,-111.930907,11.87
Arkansas,3017804,580775,404127,19245.0,13391.0,305917,10137,95202,3155,AR,34.751928,-92.131378,13.39
California,39512223,7385225,4784478,18691.0,12109.0,3880253,9820,875340,2215,CA,37.271875,-119.270415,12.11
Colorado,5758736,1045000,757560,18146.0,13155.0,542165,9415,211328,3670,CO,38.997934,-105.550567,13.15
Connecticut,3565287,800225,554322,22445.0,15548.0,412601,11573,133196,3736,CT,41.518784,-72.757507,15.55
Delaware,973764,162525,127523,16690.0,13096.0,101383,10411,25405,2609,DE,39.145251,-75.418921,13.1
District of Columbia,705749,166950,108475,23656.0,15370.0,75351,10677,26957,3820,DC,38.899349,-77.014567,15.37
Florida,21477737,4217275,2731300,19636.0,12717.0,2023419,9421,694637,3234,FL,27.975728,-83.833017,12.72


## NIH AWS Data / NYT

### Set up State Master DF

In [20]:
# Read NYT / NIH json 
nytimes_nih_covid_df = pd.read_json("../data/daily_covid_stats_by_state_nyt.json", lines=True)

In [21]:
# Get Data for Most Recent Provided Date
most_recent_date = "2021-02-08"
latest_nyt_stateData = nytimes_nih_covid_df[nytimes_nih_covid_df['date'] == f'{most_recent_date}']

In [22]:
latest_nyt_stateData.head(10)

Unnamed: 0,date,state,fips,cases,deaths
18824,2021-02-08,Alabama,1,473348,8523
18825,2021-02-08,Alaska,2,55374,269
18826,2021-02-08,Arizona,4,783229,14055
18827,2021-02-08,Arkansas,5,307373,5106
18828,2021-02-08,California,6,3430685,44440
18829,2021-02-08,Colorado,8,408793,5832
18830,2021-02-08,Connecticut,9,263739,7282
18831,2021-02-08,Delaware,10,80931,1208
18832,2021-02-08,District of Columbia,11,38136,956
18833,2021-02-08,Florida,12,1783712,27814


In [23]:
# Merge with State Info DF that contains population and vaccinations
latest_data_merged = pd.merge(state_info_df, latest_nyt_stateData, on="state")
latest_data_merged.head(10)

Unnamed: 0,state,population,total_distributed,total_administered,distributed_per_100K,administered_per_100K,people_with_1plus_doses,people_with_1plus_doses_per_100K,people_with_2_doses,people_with_2_doses_per_100K,Abb,Latitude,Longitude,percent_vaccinated,date,fips,cases,deaths
0,Alabama,4903185,823600,490969,16797.0,10013.0,395196,8060,95033,1938,AL,32.601011,-86.680736,10.01,2021-02-08,1,473348,8523
1,Alaska,731545,256900,152874,35117.0,20897.0,110936,15165,41698,5700,AK,61.302501,-158.77502,20.9,2021-02-08,2,55374,269
2,Arizona,7278717,1265950,863807,17392.0,11868.0,695970,9562,166798,2292,AZ,34.168219,-111.930907,11.87,2021-02-08,4,783229,14055
3,Arkansas,3017804,580775,404127,19245.0,13391.0,305917,10137,95202,3155,AR,34.751928,-92.131378,13.39,2021-02-08,5,307373,5106
4,California,39512223,7385225,4784478,18691.0,12109.0,3880253,9820,875340,2215,CA,37.271875,-119.270415,12.11,2021-02-08,6,3430685,44440
5,Colorado,5758736,1045000,757560,18146.0,13155.0,542165,9415,211328,3670,CO,38.997934,-105.550567,13.15,2021-02-08,8,408793,5832
6,Connecticut,3565287,800225,554322,22445.0,15548.0,412601,11573,133196,3736,CT,41.518784,-72.757507,15.55,2021-02-08,9,263739,7282
7,Delaware,973764,162525,127523,16690.0,13096.0,101383,10411,25405,2609,DE,39.145251,-75.418921,13.1,2021-02-08,10,80931,1208
8,District of Columbia,705749,166950,108475,23656.0,15370.0,75351,10677,26957,3820,DC,38.899349,-77.014567,15.37,2021-02-08,11,38136,956
9,Florida,21477737,4217275,2731300,19636.0,12717.0,2023419,9421,694637,3234,FL,27.975728,-83.833017,12.72,2021-02-08,12,1783712,27814


In [24]:
# Create a Master Dataframe to start working with
state_master_df = latest_data_merged[['state', 'Abb', 'date', 'population', 'Latitude', 'Longitude', 'cases', 'deaths', 'total_distributed', 'total_administered', 'percent_vaccinated' ]]
state_master_df.head(10)

Unnamed: 0,state,Abb,date,population,Latitude,Longitude,cases,deaths,total_distributed,total_administered,percent_vaccinated
0,Alabama,AL,2021-02-08,4903185,32.601011,-86.680736,473348,8523,823600,490969,10.01
1,Alaska,AK,2021-02-08,731545,61.302501,-158.77502,55374,269,256900,152874,20.9
2,Arizona,AZ,2021-02-08,7278717,34.168219,-111.930907,783229,14055,1265950,863807,11.87
3,Arkansas,AR,2021-02-08,3017804,34.751928,-92.131378,307373,5106,580775,404127,13.39
4,California,CA,2021-02-08,39512223,37.271875,-119.270415,3430685,44440,7385225,4784478,12.11
5,Colorado,CO,2021-02-08,5758736,38.997934,-105.550567,408793,5832,1045000,757560,13.15
6,Connecticut,CT,2021-02-08,3565287,41.518784,-72.757507,263739,7282,800225,554322,15.55
7,Delaware,DE,2021-02-08,973764,39.145251,-75.418921,80931,1208,162525,127523,13.1
8,District of Columbia,DC,2021-02-08,705749,38.899349,-77.014567,38136,956,166950,108475,15.37
9,Florida,FL,2021-02-08,21477737,27.975728,-83.833017,1783712,27814,4217275,2731300,12.72


In [25]:
# Create empty column to calculate estimated percent infected to date
state_master_df["est_percent_infected_to_date"] = ""
state_master_df.head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,state,Abb,date,population,Latitude,Longitude,cases,deaths,total_distributed,total_administered,percent_vaccinated,est_percent_infected_to_date
0,Alabama,AL,2021-02-08,4903185,32.601011,-86.680736,473348,8523,823600,490969,10.01,
1,Alaska,AK,2021-02-08,731545,61.302501,-158.77502,55374,269,256900,152874,20.9,
2,Arizona,AZ,2021-02-08,7278717,34.168219,-111.930907,783229,14055,1265950,863807,11.87,
3,Arkansas,AR,2021-02-08,3017804,34.751928,-92.131378,307373,5106,580775,404127,13.39,
4,California,CA,2021-02-08,39512223,37.271875,-119.270415,3430685,44440,7385225,4784478,12.11,
5,Colorado,CO,2021-02-08,5758736,38.997934,-105.550567,408793,5832,1045000,757560,13.15,
6,Connecticut,CT,2021-02-08,3565287,41.518784,-72.757507,263739,7282,800225,554322,15.55,
7,Delaware,DE,2021-02-08,973764,39.145251,-75.418921,80931,1208,162525,127523,13.1,
8,District of Columbia,DC,2021-02-08,705749,38.899349,-77.014567,38136,956,166950,108475,15.37,
9,Florida,FL,2021-02-08,21477737,27.975728,-83.833017,1783712,27814,4217275,2731300,12.72,


In [26]:
# Calcuation for est percent infected by state
for index, row in state_master_df.iterrows():
    population = row["population"]
    deaths = row["deaths"]
    infected = row["cases"]
    
    # take out the number who have passed away to reset estimated population
    est_population_new = population - deaths 
    
    # estimate the percent infected
    percent_infected = round((infected / est_population_new) * 100, 2)
    
    # insert into percent_infected column
    state_master_df.at[index, "est_percent_infected_to_date"] = percent_infected

In [27]:
state_master_df.head(10)

Unnamed: 0,state,Abb,date,population,Latitude,Longitude,cases,deaths,total_distributed,total_administered,percent_vaccinated,est_percent_infected_to_date
0,Alabama,AL,2021-02-08,4903185,32.601011,-86.680736,473348,8523,823600,490969,10.01,9.67
1,Alaska,AK,2021-02-08,731545,61.302501,-158.77502,55374,269,256900,152874,20.9,7.57
2,Arizona,AZ,2021-02-08,7278717,34.168219,-111.930907,783229,14055,1265950,863807,11.87,10.78
3,Arkansas,AR,2021-02-08,3017804,34.751928,-92.131378,307373,5106,580775,404127,13.39,10.2
4,California,CA,2021-02-08,39512223,37.271875,-119.270415,3430685,44440,7385225,4784478,12.11,8.69
5,Colorado,CO,2021-02-08,5758736,38.997934,-105.550567,408793,5832,1045000,757560,13.15,7.11
6,Connecticut,CT,2021-02-08,3565287,41.518784,-72.757507,263739,7282,800225,554322,15.55,7.41
7,Delaware,DE,2021-02-08,973764,39.145251,-75.418921,80931,1208,162525,127523,13.1,8.32
8,District of Columbia,DC,2021-02-08,705749,38.899349,-77.014567,38136,956,166950,108475,15.37,5.41
9,Florida,FL,2021-02-08,21477737,27.975728,-83.833017,1783712,27814,4217275,2731300,12.72,8.32


In [28]:
# Add Estimated Percent Immune Column
state_master_df["est_percent_immune"] = ""
state_master_df.head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,state,Abb,date,population,Latitude,Longitude,cases,deaths,total_distributed,total_administered,percent_vaccinated,est_percent_infected_to_date,est_percent_immune
0,Alabama,AL,2021-02-08,4903185,32.601011,-86.680736,473348,8523,823600,490969,10.01,9.67,
1,Alaska,AK,2021-02-08,731545,61.302501,-158.77502,55374,269,256900,152874,20.9,7.57,
2,Arizona,AZ,2021-02-08,7278717,34.168219,-111.930907,783229,14055,1265950,863807,11.87,10.78,
3,Arkansas,AR,2021-02-08,3017804,34.751928,-92.131378,307373,5106,580775,404127,13.39,10.2,
4,California,CA,2021-02-08,39512223,37.271875,-119.270415,3430685,44440,7385225,4784478,12.11,8.69,
5,Colorado,CO,2021-02-08,5758736,38.997934,-105.550567,408793,5832,1045000,757560,13.15,7.11,
6,Connecticut,CT,2021-02-08,3565287,41.518784,-72.757507,263739,7282,800225,554322,15.55,7.41,
7,Delaware,DE,2021-02-08,973764,39.145251,-75.418921,80931,1208,162525,127523,13.1,8.32,
8,District of Columbia,DC,2021-02-08,705749,38.899349,-77.014567,38136,956,166950,108475,15.37,5.41,
9,Florida,FL,2021-02-08,21477737,27.975728,-83.833017,1783712,27814,4217275,2731300,12.72,8.32,


In [29]:
# Calcuation for estimated immune by state
for index, row in state_master_df.iterrows():
    population = row["population"]
    infected = row["cases"]
    deaths = row["deaths"]
    vaccinated = row["total_administered"]
    
    est_total_immune = infected + vaccinated
    est_population = population - deaths
    
    percent_immune = round((est_total_immune / est_population) * 100, 2)
    
    state_master_df.at[index, "est_percent_immune"] = percent_immune

In [30]:
state_master_df.head(10)

Unnamed: 0,state,Abb,date,population,Latitude,Longitude,cases,deaths,total_distributed,total_administered,percent_vaccinated,est_percent_infected_to_date,est_percent_immune
0,Alabama,AL,2021-02-08,4903185,32.601011,-86.680736,473348,8523,823600,490969,10.01,9.67,19.7
1,Alaska,AK,2021-02-08,731545,61.302501,-158.77502,55374,269,256900,152874,20.9,7.57,28.48
2,Arizona,AZ,2021-02-08,7278717,34.168219,-111.930907,783229,14055,1265950,863807,11.87,10.78,22.67
3,Arkansas,AR,2021-02-08,3017804,34.751928,-92.131378,307373,5106,580775,404127,13.39,10.2,23.62
4,California,CA,2021-02-08,39512223,37.271875,-119.270415,3430685,44440,7385225,4784478,12.11,8.69,20.81
5,Colorado,CO,2021-02-08,5758736,38.997934,-105.550567,408793,5832,1045000,757560,13.15,7.11,20.27
6,Connecticut,CT,2021-02-08,3565287,41.518784,-72.757507,263739,7282,800225,554322,15.55,7.41,22.99
7,Delaware,DE,2021-02-08,973764,39.145251,-75.418921,80931,1208,162525,127523,13.1,8.32,21.43
8,District of Columbia,DC,2021-02-08,705749,38.899349,-77.014567,38136,956,166950,108475,15.37,5.41,20.8
9,Florida,FL,2021-02-08,21477737,27.975728,-83.833017,1783712,27814,4217275,2731300,12.72,8.32,21.05


### Set up Daily Cases DF

In [31]:
nyt_avg_daily_cases = nytimes_nih_covid_df
nyt_avg_daily_cases.head()

Unnamed: 0,date,state,fips,cases,deaths
0,2020-01-21,Washington,53,1,0
1,2020-01-22,Washington,53,1,0
2,2020-01-23,Washington,53,1,0
3,2020-01-24,Illinois,17,1,0
4,2020-01-24,Washington,53,1,0


In [32]:
nyt_avg_daily_cases = nyt_avg_daily_cases.groupby("date").sum()
nyt_avg_daily_cases.head()

Unnamed: 0_level_0,fips,cases,deaths
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-01-21,53,1,0
2020-01-22,53,1,0
2020-01-23,53,1,0
2020-01-24,70,2,0
2020-01-25,76,3,0


In [33]:
nyt_avg_daily_cases["daily_new_cases"] = ""

In [34]:
nyt_avg_daily_cases.head()

Unnamed: 0_level_0,fips,cases,deaths,daily_new_cases
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-01-21,53,1,0,
2020-01-22,53,1,0,
2020-01-23,53,1,0,
2020-01-24,70,2,0,
2020-01-25,76,3,0,


In [35]:
previous_day = 0

In [36]:
for index, row in nyt_avg_daily_cases.iterrows():
    new_cases = row["cases"] - previous_day
    previous_day = row["cases"]
    nyt_avg_daily_cases.at[index, "daily_new_cases"] = new_cases

In [37]:
nyt_avg_daily_cases["date"] = ""

In [38]:
nyt_avg_daily_cases

Unnamed: 0_level_0,fips,cases,deaths,daily_new_cases,date
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-01-21,53,1,0,1,
2020-01-22,53,1,0,0,
2020-01-23,53,1,0,0,
2020-01-24,70,2,0,1,
2020-01-25,76,3,0,1,
...,...,...,...,...,...
2021-02-04,1762,26722382,455805,125842,
2021-02-05,1762,26851974,459375,129592,
2021-02-06,1762,26957001,462037,105027,
2021-02-07,1762,27044335,463338,87334,


In [39]:
for index, row in nyt_avg_daily_cases.iterrows():
    date = index.date()
    nyt_avg_daily_cases.at[index, "date"] = date

In [40]:
nyt_avg_daily_cases.reset_index(drop=True, inplace=True)

In [41]:
nyt_avg_daily_cases.head(10)

Unnamed: 0,fips,cases,deaths,daily_new_cases,date
0,53,1,0,1,2020-01-21
1,53,1,0,0,2020-01-22
2,53,1,0,0,2020-01-23
3,70,2,0,1,2020-01-24
4,76,3,0,1,2020-01-25
5,80,5,0,2,2020-01-26
6,80,5,0,0,2020-01-27
7,80,5,0,0,2020-01-28
8,80,5,0,0,2020-01-29
9,80,6,0,1,2020-01-30


### Set Up NYT Totals Master DF

In [42]:
nyt_master = pd.merge(nytimes_nih_covid_df, state_pop_df, how ="inner", on="state")
nyt_master.head()

Unnamed: 0,date,state,fips,cases,deaths,population
0,2020-01-21,Washington,53,1,0,7614893
1,2020-01-22,Washington,53,1,0,7614893
2,2020-01-23,Washington,53,1,0,7614893
3,2020-01-24,Washington,53,1,0,7614893
4,2020-01-25,Washington,53,1,0,7614893


In [43]:
nyt_master["est_percent_infected"] = ""
nyt_master.head()

Unnamed: 0,date,state,fips,cases,deaths,population,est_percent_infected
0,2020-01-21,Washington,53,1,0,7614893,
1,2020-01-22,Washington,53,1,0,7614893,
2,2020-01-23,Washington,53,1,0,7614893,
3,2020-01-24,Washington,53,1,0,7614893,
4,2020-01-25,Washington,53,1,0,7614893,


In [44]:
# Calcuation for est percent infected by state
for index, row in nyt_master.iterrows():
    population = int(row["population"])
    deaths = row["deaths"]
    infected = row["cases"]
    
    # take out the number who have passed away to reset estimated population
    est_population_new = population - deaths 
    
    # estimate the percent infected
    if infected == 0:
        percent_infected = 0
    else: 
        percent_infected = round((infected / est_population_new) * 100, 2)
    
    # insert into percent_infected column
    nyt_master.at[index, "est_percent_infected"] = percent_infected

In [45]:
nyt_master.head()

Unnamed: 0,date,state,fips,cases,deaths,population,est_percent_infected
0,2020-01-21,Washington,53,1,0,7614893,0
1,2020-01-22,Washington,53,1,0,7614893,0
2,2020-01-23,Washington,53,1,0,7614893,0
3,2020-01-24,Washington,53,1,0,7614893,0
4,2020-01-25,Washington,53,1,0,7614893,0


## JSON Exports

In [46]:
# Master DF to JSON
state_master_df.to_json("../data/state-master-data.json", orient="records")

In [47]:
# NYT Master to JSON 
nyt_master.to_json("../data/nyt-master.json", orient="records")

In [48]:
nyt_avg_daily_cases.to_json("../data/daily_new_cases.json", orient="records")

### CSV Exports

In [49]:
state_master_df.to_csv("../data/state-master-data.csv")

## Amazon RDS Updates

In [50]:
# Connect to AWS Database instance 
engine = create_engine(f'postgresql://uscovid:{password}@{endpoint}/us_covid_db')
connection = engine.connect()

In [51]:
# 50 States & D.C. Data
state_master_df.to_sql('master_table', index=False, if_exists='replace', con=connection)

In [52]:
# Daily Cases Data
nyt_avg_daily_cases.to_sql("daily_new_cases", index=False, if_exists='replace', con=connection)

In [None]:
# Raw NTY Data
nytimes_nih_covid_df.to_sql('nyt_table', index=True, if_exists='replace', con=connection)