# A Comparison of State Use of SLFRF Funds for Vaccination Programs and Vaccination Rates in Each State



### Data Sources:
CDC - "COVID-19 Vaccinations in the United States, Jurisdiction"
csv downloaded 5/11/23
https://data.cdc.gov/Vaccinations/COVID-19-Vaccinations-in-the-United-States-Jurisdi/unsk-b7fc

EARN/EPI - "EARN SLFRF Workbook for Q4 2022", compiled by Dave Kamper of the Economic Policy Institute (dkamper@epi.org) from Treasury reports filed by the state and local jurisdictions that received funding, and from other data sources as detailed in the workbook.

In [None]:
# import dependencies and setup
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pprint import pprint
from pathlib import Path


## Production Code (Team: Put your code here after it is complete and ready to go)

## Evan Work Area

In [None]:
# import dependencies and setup
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pprint import pprint
from pathlib import Path

In [4]:
# Load csv file(s)
# Path to the EARN export, relative to the notebook's working directory
all_states_sheet = Path("Resources/EARN_all_states.csv")


# Read csv file(s) as a DataFrame
all_states_df = pd.read_csv(all_states_sheet)


# merge csv files (if applicable)
# (only one csv is loaded in this cell, so there is nothing to merge yet)


# preview the raw DataFrame
all_states_df.head()

In [None]:
# Keywords used to filter the 'Project Description' column so that only
# covid / vaccination related projects are kept.
# Terms are lower-case; they are meant to be matched case-insensitively as
# plain substrings. TODO(team): extend or trim this list after reviewing the
# descriptions that match.
search_term_list = [
    "covid",
    "vaccine",
    "vaccination",
    "immunization",
    "booster",
]


In [None]:
# Filter the dataframe column 'Project Description'
# A project is kept when its description contains ANY term in search_term_list.
# Matching is case-insensitive and literal (regex=False, so characters such as
# '(' or '+' in a term are not treated as regex syntax). Rows whose description
# is missing are compared as the empty string and therefore never match.
descriptions = all_states_df['Project Description'].fillna('').astype(str).str.lower()
is_vaccine_project = pd.Series(False, index=all_states_df.index)
for term in search_term_list:
    is_vaccine_project |= descriptions.str.contains(term.lower(), regex=False)

vaccine_projects_df = all_states_df[is_vaccine_project]

# preview the 'filtered' dataframe:
vaccine_projects_df.head()


In [None]:
# Group the filtered dataframe by state, summing the applicable $ value columns.
# The money columns are converted to numbers first: '$', thousands separators
# and whitespace are stripped, and anything that still is not a number (for
# example the "-" placeholders seen in 'Adopted Budget') becomes NaN, which
# .sum() skips.
# NOTE(review): negative amounts written as "(1,234)" would also become NaN -
# confirm whether the workbook uses that notation.
money_columns = ['Adopted Budget', 'Total Cumulative Obligations', 'Total Cumulative Expenditures']

spending_input_df = vaccine_projects_df.copy()
for money_column in money_columns:
    as_text = spending_input_df[money_column].astype(str).str.replace(r'[$,\s]', '', regex=True)
    spending_input_df[money_column] = pd.to_numeric(as_text, errors='coerce')

# Select the columns on the groupby object; passing a column list to .sum()
# does not select columns.
state_spending_df = spending_input_df.groupby('State/Territory')[money_columns].sum()



## Data Exploration and Cleanup:
- Describe here the group's data sets and how they were cleaned for analysis

# Greg Work Area

### CDC Data

In [None]:
#Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import linregress
import scipy.stats as st
import seaborn as sns

In [None]:
#Import vaccination data from csv
# NOTE(review): the Joanna work area reads this same file from
# 'Resources/COVID-19_Vaccinations_in_the_United_States_Jurisdiction.csv';
# confirm which location is correct so every work area loads the same copy.
vac_df = pd.read_csv('COVID-19_Vaccinations_in_the_United_States_Jurisdiction.csv')

In [None]:
#function formats the CDC dataframe for US jurisdictions
def CDC_format(df,key,date,add_str,drop_values):
    """Return the rows of a CDC vaccination table for a single report date.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing a 'Date' column and the column named by ``key``.
    key : str
        Name of the location-code column (e.g. 'Location').
    date : str
        Value of the 'Date' column to keep (e.g. '12/28/2022').
    add_str : str
        Prefix placed in front of every remaining value in ``key``.
    drop_values : list
        Values of ``key`` (before prefixing) whose rows are removed.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index; the input is not modified.
    """
    df = df.dropna(subset=[key])
    # Build the mask from the frame that was passed in. The previous version
    # read the notebook-level `vac_df` here, so the function only worked when
    # it was called on that exact variable.
    keep = (df['Date'] == date) & ~df[key].isin(drop_values)
    # .copy() so the assignment below writes to an independent frame, not to
    # a slice of the caller's data.
    df = df.loc[keep].copy()
    df[key] = add_str + df[key].astype(str)
    return df.reset_index(drop=True)

In [None]:
# Location codes removed by CDC_format before any analysis.
# NOTE(review): presumably the non-state jurisdictions plus the national 'US'
# row - confirm against the CDC data dictionary.
drop = ['DD2','FM','AS','VI','BP2','IH2','GU','PN','PR','VA2','PW','US','MP','MH']
# Keep only the 12/28/2022 rows and prefix every Location code with 'US_'
vac_df = CDC_format(vac_df,'Location','12/28/2022','US_',drop)

In [None]:
vac_df

In [None]:
### Google vac data

In [None]:
#Import libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import linregress
import scipy.stats as st
import seaborn as sns
import cartopy.crs as ccrs
import geoviews as gv # noqa
import pyproj
import geopandas as gpd
import hvplot.pandas
import plotly.express as px

In [None]:
#Import vaccination data from google api
# Note: this rebinds `vac_df`, replacing the CDC table loaded in the cells above.
vac_df = pd.read_csv('https://storage.googleapis.com/covid19-open-data/v3/latest/vaccinations.csv')

In [None]:
#function formats the google dataframes
def google_format(df,key,filt,length,columns,drop_values):
    """Reduce a Google covid19-open-data table to the wanted location rows.

    key          -> str,  name of the location-key column
    filt         -> str,  pattern the key must contain (handed to str.contains)
    length       -> int,  exact number of characters the key must have
    columns      -> list, columns to keep (must include ``key``)
    drop_values  -> list, key values to discard

    Returns a new DataFrame re-indexed from 0.
    """
    with_key = df.dropna(subset=[key])
    codes = with_key[key]
    # a row survives when its key matches the pattern AND has the right length
    wanted = codes.str.contains(filt) & (codes.str.len() == length)
    trimmed = with_key.loc[wanted, columns]
    trimmed = trimmed.loc[~trimmed[key].isin(drop_values)]
    return trimmed.reset_index(drop=True)

In [None]:
#Input values for vaccination data
# location keys removed after filtering
drop = ['US_AS','US_GU','US_MP','US_PR','US_VI']
# columns kept from the vaccination table
cols = ['date','location_key','cumulative_persons_fully_vaccinated','new_persons_vaccinated','new_persons_fully_vaccinated']
# name of the location column; shared by every google_format call below
loc_key = 'location_key'
# substring a location key must contain to be kept
contains = 'US_'

In [None]:
#formatting vaccination data
# keeps keys that contain 'US_' and are exactly 5 characters long, minus the keys in `drop`
vac_df = google_format(vac_df, loc_key, contains, 5, cols, drop)

In [None]:
# Scratch example of an "any of these keywords" search: the terms are joined
# with '|' so str.contains treats them as regex alternatives.
# The result is only displayed, not stored. 'Orange' and 'Apple' cannot occur
# in the 5-character location keys kept above, so every value is False.
mylist = ['Orange','Apple'] #Keywords search
pattern = '|'.join(mylist)
vac_df.location_key.str.contains(pattern)

In [None]:
#reading demographic data
# NOTE(review): read from a local file in the working directory, unlike the
# vaccination and epidemiology tables, which are fetched by URL - confirm the
# file is committed alongside the notebook.
dem_df = pd.read_csv('demographics.csv')

In [None]:
dem_df

In [None]:
# columns kept from the demographics table
dcols = ['location_key','population']

In [None]:
#formatting demographic data
# same key filter as the vaccination table, so the two can be merged on 'location_key'
dem_df = google_format(dem_df, loc_key, contains, 5, dcols, drop)

In [None]:
#reading epidemiology data
epi_df = pd.read_csv('https://storage.googleapis.com/covid19-open-data/v3/latest/epidemiology.csv')

In [None]:
# columns kept from the epidemiology table
ecols = ['location_key','cumulative_confirmed','cumulative_deceased','cumulative_recovered']

In [None]:
#formatting epidemiology data
epi_df = google_format(epi_df, loc_key, contains, 5, ecols, drop)

In [None]:
# US location lookup table from the Google covid19-open-data bucket.
# Stored under its own name: `loc_key` already holds the column name
# 'location_key' that every google_format(...) call receives, and rebinding it
# to a DataFrame would break those cells whenever they are re-run.
us_locations_df = pd.read_csv('https://storage.googleapis.com/covid19-open-data/v3/location/US.csv')

In [None]:
# Rows of the formatted vaccination table for Alaska only.
# Reads `vac_df` (the table produced by google_format); the previous name
# `US_vac_df` is not defined anywhere in this notebook.
AK_vac_df = vac_df[vac_df['location_key'].str.contains('US_AK')]

In [None]:
#Looking at only one state - this can be skipped
# The column is a running (cumulative) count, so the state total is its
# largest value. Adding the rows together would count the same people more
# than once, and skipping row 0 returned 0 when the table holds a single row.
# NOTE(review): assumes the cumulative count never decreases - confirm.
AK_total = AK_vac_df['cumulative_persons_fully_vaccinated'].max()
AK_total

In [None]:
#we don't need this at the moment, can be skipped
def swap_rows(df, i1, i2): #Keep this!!!
    """Exchange the rows at integer positions i1 and i2.

    The frame is modified in place and also returned for convenience.
    """
    # snapshot both rows first so the second write still sees the old values
    first_row = df.iloc[i1, :].copy()
    second_row = df.iloc[i2, :].copy()
    df.iloc[i1, :] = second_row
    df.iloc[i2, :] = first_row
    return df

In [None]:
#merging dataframes
# inner join: only location keys present in BOTH the vaccination and the
# demographics table are kept
total_df = vac_df.merge(dem_df, how = 'inner',on = 'location_key')

In [None]:

# share of each location's population that is fully vaccinated, in percent
total_df['percent_fully_vaccinated'] = (total_df['cumulative_persons_fully_vaccinated']/total_df['population'])*100
# display only: sort_values returns a sorted copy, total_df keeps its order
total_df.sort_values('percent_fully_vaccinated', ascending = False)

In [None]:
#merging dataframes
# adds the epidemiology columns; the inner join again drops any location key
# missing from either side
total_df = total_df.merge(epi_df, how = 'inner',on = 'location_key')

In [None]:
# deaths as a percentage of confirmed cases
total_df['percent_death_rate_by_case'] = (total_df['cumulative_deceased']/total_df['cumulative_confirmed'])*100

In [None]:
# deaths as a percentage of the total population
total_df['percent_death_rate_per_capita'] = (total_df['cumulative_deceased']/total_df['population'])*100

In [None]:
# confirmed cases as a percentage of the total population
total_df['percent_confirmed'] = (total_df['cumulative_confirmed']/total_df['population'])*100

In [None]:
# strip the 'US_' prefix from the location key; the remainder is stored as 'state_code'
total_df['state_code'] = total_df.location_key.str.replace('US_','') #adding the state code for the plotly function

In [None]:
total_df.sort_values('percent_fully_vaccinated', ascending = False)

In [None]:
#function for regression plots
def reg(df,x,y,x_text,y_text):
    """Scatter-plot df[y] against df[x] and overlay the least-squares line.

    df             -> DataFrame holding both columns
    x, y           -> str, column names for the horizontal / vertical axis
    x_text, y_text -> position handed to plt.text for the fitted-equation label

    Prints the r-value of the fit. Returns nothing; the plot is drawn on the
    axes created by seaborn.
    """
    fit = st.linregress(x = df[x], y = df[y])
    slope, intercept, r_value = fit[0], fit[1], fit[2]
    # fitted y-value for every observed x
    line_df = pd.DataFrame({'x': df[x], 'fitted': slope*df[x] + intercept})
    axes = sns.scatterplot(data = df, x = x, y = y)
    print(f"The r-value is: {r_value}")
    # draw the regression line on the same axes as the scatter
    line_df.plot.line(x = 'x', y = 'fitted', color = 'red', ax=axes, legend = None, xlabel = x)
    plt.text(x_text,y_text,f"y = {'%.2f' %slope}x + {'%.1f' %intercept}", color = 'red', fontsize = 16)

In [None]:
# vaccination rate vs. deaths per confirmed case; 50 and 0.6 position the equation label
reg(total_df,'percent_fully_vaccinated','percent_death_rate_by_case',50,0.6)

In [None]:
# vaccination rate vs. deaths per capita; 50 and 0.15 position the equation label
reg(total_df,'percent_fully_vaccinated','percent_death_rate_per_capita',50,0.15)

In [None]:
# vaccination rate vs. confirmed cases per capita; 50 and 20 position the equation label
reg(total_df,'percent_fully_vaccinated','percent_confirmed',50,20)

In [None]:
# Load a low-resolution world map through geopandas and draw it with hvplot.
# NOTE(review): `gpd.datasets` appears to be unavailable in newer GeoPandas
# releases - confirm against the installed version.
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))

# NOTE(review): `c='country'` presumably names a column to colour by - confirm
# that `world` actually has a column with that name.
world.hvplot(c='country', geo=True)

In [None]:
#generating map of us states - you need to specify the color variable as one of the dataframe columns 
# `locations` uses the 'state_code' column of total_df, read as
# USA state codes because of locationmode="USA-states".
fig = px.choropleth(total_df,
                    locations='state_code', 
                    locationmode="USA-states", 
                    scope="usa",
                    color='percent_death_rate_per_capita',
                    color_continuous_scale="blues" 
                    )
# Optional overlay (disabled): would print each state code as text on the map.
# fig.add_scattergeo(
#     locations=total_df['state_code'],
#     locationmode="USA-states", 
#     text=total_df['state_code'],
#     mode='text',
# )
fig.show()

# Joanna Work Area

In [1]:
#putting Greg's code down here so I can run my area independently of the rest of the sheet without error
#Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
from scipy.stats import linregress
import scipy.stats as st
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

In [2]:
#Import vaccination data from csv
vac_df = pd.read_csv('Resources/COVID-19_Vaccinations_in_the_United_States_Jurisdiction.csv')

  vac_df = pd.read_csv('Resources/COVID-19_Vaccinations_in_the_United_States_Jurisdiction.csv')


In [3]:
#function formats the CDC dataframe for US jurisdictions
def CDC_format(df,key,date,add_str,drop_values):
    """Return the rows of a CDC vaccination table for a single report date.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing a 'Date' column and the column named by ``key``.
    key : str
        Name of the location-code column (e.g. 'Location').
    date : str
        Value of the 'Date' column to keep (e.g. '12/28/2022').
    add_str : str
        Prefix placed in front of every remaining value in ``key``.
    drop_values : list
        Values of ``key`` (before prefixing) whose rows are removed.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index; the input is not modified.
    """
    df = df.dropna(subset=[key])
    # Build the mask from the frame that was passed in. The previous version
    # read the notebook-level `vac_df` here, so the function only worked when
    # it was called on that exact variable.
    keep = (df['Date'] == date) & ~df[key].isin(drop_values)
    # .copy() so the assignment below writes to an independent frame, not to
    # a slice of the caller's data.
    df = df.loc[keep].copy()
    df[key] = add_str + df[key].astype(str)
    return df.reset_index(drop=True)

In [4]:
#drop non-state territories from dataframe, select only rows with 12/28/22 data
drop = ['DD2','FM','AS','VI','BP2','IH2','GU','PN','PR','VA2','PW','US','MP','MH']
vac_df = CDC_format(vac_df,'Location','12/28/2022','US_',drop)

In [5]:
#change location to match state code
# removes the 'US_' prefix that the CDC_format call above added, leaving the bare code
vac_df['Location'] = vac_df['Location'].str.replace('US_', '')

## To do list
Calculate population number they are using for each state and use it to calculate the Pop_Pct for Administered_Bivalent column

Compare Administered to Recip_Administered to see if there are any significant differences in any state

Make some smaller dataframes for viewing:

a) Whole pop with Distrib, Administered, Dose1, Series Complete, Additional Doses, Second Booster, Administered Bivalent

b) Each individual age group with Dose1, Series Complete, Additional Doses, Second Booster, Bivalent Booster

c) Each category (Dose1, Series Complete, Additional Doses, Second Booster, Bivalent Booster) with all age ranges

Identify which states have a high variance from the mean (general/nationwide population) in % vaccinated (looking at all dosage categories and age categories). This will show us which states were the "good vaccinators" and which the "poor vaccinators." We can then use the EARN data to see if this correlates to how much of the federal money they spent, how many vaccination projects they did, etc.


In [6]:
# Narrow vac_df to the columns this analysis uses, in the order listed below.
# NOTE: there is no Pop_Pct column for Administered_Bivalent, and Second_Booster
# exists only for the age breakouts, but both can be extrapolated from the other
# population calculations. For state-level Second_Booster numbers the
# per-manufacturer columns have to be added up because no summed column exists.
# NOTE(review): Series_Complete_5Plus is listed without a matching
# Series_Complete_5PlusPop_Pct, unlike the other age groups - confirm intent.
# A name in this list that is not a column of vac_df comes back as an all-NaN
# column instead of raising an error.

vac_df = vac_df.reindex(columns=[
    "Location",
    "Distributed",
    "Administered",
    "Recip_Administered",
    # first dose
    "Administered_Dose1_Recip",
    "Administered_Dose1_Pop_Pct",
    "Administered_Dose1_Recip_5Plus",
    "Administered_Dose1_Recip_5PlusPop_Pct",
    "Administered_Dose1_Recip_12Plus",
    "Administered_Dose1_Recip_12PlusPop_Pct",
    "Administered_Dose1_Recip_18Plus",
    "Administered_Dose1_Recip_18PlusPop_Pct",
    "Administered_Dose1_Recip_65Plus",
    "Administered_Dose1_Recip_65PlusPop_Pct",
    # series complete
    "Series_Complete_Yes",
    "Series_Complete_Pop_Pct",
    "Series_Complete_5Plus",
    "Series_Complete_12Plus",
    "Series_Complete_12PlusPop_Pct",
    "Series_Complete_18Plus",
    "Series_Complete_18PlusPop_Pct",
    "Series_Complete_65Plus",
    "Series_Complete_65PlusPop_Pct",
    # additional doses
    "Additional_Doses",
    "Additional_Doses_Vax_Pct",
    "Additional_Doses_5Plus",
    "Additional_Doses_5Plus_Vax_Pct",
    "Additional_Doses_12Plus",
    "Additional_Doses_12Plus_Vax_Pct",
    "Additional_Doses_18Plus",
    "Additional_Doses_18Plus_Vax_Pct",
    "Additional_Doses_50Plus",
    "Additional_Doses_50Plus_Vax_Pct",
    "Additional_Doses_65Plus",
    "Additional_Doses_65Plus_Vax_Pct",
    # second booster
    "Second_Booster_50Plus",
    "Second_Booster_50Plus_Vax_Pct",
    "Second_Booster_65Plus",
    "Second_Booster_65Plus_Vax_Pct",
    "Second_Booster_Janssen",
    "Second_Booster_Moderna",
    "Second_Booster_Pfizer",
    "Second_Booster_Unk_Manuf",
    # bivalent booster
    "Administered_Bivalent",
    "Bivalent_Booster_5Plus",
    "Bivalent_Booster_5Plus_Pop_Pct",
    "Bivalent_Booster_12Plus",
    "Bivalent_Booster_12Plus_Pop_Pct",
    "Bivalent_Booster_18Plus",
    "Bivalent_Booster_18Plus_Pop_Pct",
])

In [7]:
# Whole-population view: totals and percentages that are not split by age group
vac_all_ages_df = vac_df.reindex(columns=[
    "Location",
    "Distributed",
    "Administered",
    "Recip_Administered",
    "Administered_Dose1_Recip",
    "Administered_Dose1_Pop_Pct",
    "Series_Complete_Yes",
    "Series_Complete_Pop_Pct",
    "Additional_Doses",
    "Additional_Doses_Vax_Pct",
    "Administered_Bivalent",
])

vac_all_ages_df

Unnamed: 0,Location,Distributed,Administered,Recip_Administered,Administered_Dose1_Recip,Administered_Dose1_Pop_Pct,Series_Complete_Yes,Series_Complete_Pop_Pct,Additional_Doses,Additional_Doses_Vax_Pct,Administered_Bivalent
0,CT,11421135,8883525,8933212,3636096,95.0,2950962,82.8,1628555,55.2,782715
1,NJ,28223715,19503839,20082553,8372496,94.3,7001577,78.8,3611932,51.6,1241078
2,OK,9308930,6660547,6642791,2940863,74.3,2383051,60.2,986011,41.4,399410
3,NE,5229080,3735602,3754048,1414102,73.1,1276735,66.0,703721,55.1,284855
4,DE,3169595,2120412,2094422,853776,87.7,709932,72.9,361595,50.9,171972
5,ME,4718980,3458390,3463706,1302731,95.0,1115904,83.0,675930,60.6,365193
6,HI,4448760,3451416,3486162,1289826,91.1,1150629,81.3,681357,59.2,264138
7,AZ,19039550,14302488,13939670,5610927,77.1,4788266,65.8,2372435,49.5,957105
8,KY,11589155,7392531,7478240,3064611,68.6,2653934,59.4,1278111,48.2,485457
9,VT,2444660,1718359,1689315,617699,95.0,532166,85.3,351797,66.1,184091


In [8]:
# Checking on second booster columns -- these NaN values actually exist in the
# spreadsheet. Is there something going on with the function that was used to
# create the initial dataframe?
vac_secondbooster_df = vac_df.reindex(columns=[
    "Location",
    "Second_Booster_50Plus",
    "Second_Booster_50Plus_Vax_Pct",
    "Second_Booster_65Plus",
    "Second_Booster_65Plus_Vax_Pct",
    "Second_Booster_Janssen",
    "Second_Booster_Moderna",
    "Second_Booster_Pfizer",
    "Second_Booster_Unk_Manuf",
])
vac_secondbooster_df

Unnamed: 0,Location,Second_Booster_50Plus,Second_Booster_50Plus_Vax_Pct,Second_Booster_65Plus,Second_Booster_65Plus_Vax_Pct,Second_Booster_Janssen,Second_Booster_Moderna,Second_Booster_Pfizer,Second_Booster_Unk_Manuf
0,CT,515876,54.0,315493,63.6,116,292804,364453,20
1,NJ,910435,44.9,555822,53.7,906,509117,668184,72
2,OK,281259,43.0,195510,49.9,173,148602,186591,1335
3,NE,229578,55.5,150927,63.4,124,100170,195655,547
4,DE,125388,52.0,85585,60.0,92,61803,85502,28
5,ME,266208,61.9,175073,71.1,117,134894,195018,304
6,HI,228520,58.2,143958,66.3,123,136079,150361,40
7,AZ,749424,51.8,499474,58.9,797,389794,545083,2542
8,KY,395911,47.1,263917,54.6,317,202578,273639,139
9,VT,126163,61.4,79807,70.4,65,59385,109100,2


# Kendal Work Area

In [None]:
#putting Greg's code down here so I can run my area independently of the rest of the sheet without error
#Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
from scipy.stats import linregress
import scipy.stats as st
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

In [None]:
#Import vaccination data from csv
# NOTE(review): the Joanna work area reads this same file from
# 'Resources/COVID-19_Vaccinations_in_the_United_States_Jurisdiction.csv';
# confirm which location is correct so every work area loads the same copy.
vac_df = pd.read_csv('COVID-19_Vaccinations_in_the_United_States_Jurisdiction.csv')

In [None]:
#function formats the CDC dataframe for US jurisdictions
def CDC_format(df,key,date,add_str,drop_values):
    """Return the rows of a CDC vaccination table for a single report date.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing a 'Date' column and the column named by ``key``.
    key : str
        Name of the location-code column (e.g. 'Location').
    date : str
        Value of the 'Date' column to keep (e.g. '12/28/2022').
    add_str : str
        Prefix placed in front of every remaining value in ``key``.
    drop_values : list
        Values of ``key`` (before prefixing) whose rows are removed.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index; the input is not modified.
    """
    df = df.dropna(subset=[key])
    # Build the mask from the frame that was passed in. The previous version
    # read the notebook-level `vac_df` here, so the function only worked when
    # it was called on that exact variable.
    keep = (df['Date'] == date) & ~df[key].isin(drop_values)
    # .copy() so the assignment below writes to an independent frame, not to
    # a slice of the caller's data.
    df = df.loc[keep].copy()
    df[key] = add_str + df[key].astype(str)
    return df.reset_index(drop=True)

In [None]:
#drop non-state territories from dataframe
drop = ['DD2','FM','AS','VI','BP2','IH2','GU','PN','PR','VA2','PW','US','MP','MH']
vac_df = CDC_format(vac_df,'Location','12/28/2022','US_',drop)

In [None]:
#change location to match state code for choropleth maps
# removes the 'US_' prefix that the CDC_format call above added
vac_df['Location'] = vac_df['Location'].str.replace('US_', '')
# Give the plotted columns human-readable names. Every later cell must refer
# to these columns by the NEW names, because the original CDC names no longer
# exist in vac_df after this point.
vac_df = vac_df.rename(columns={'Series_Complete_Pop_Pct':'Percentage of Population Fully Vaccinated',
                            'Administered_Dose1_Pop_Pct':'% of Population with at least 1 dose',
                            'Series_Complete_65PlusPop_Pct': '% of Population Fully Vaccinated (65+)',
                            'Bivalent_Booster_65Plus_Pop_Pct': '% of Population with bivalent booster (65+)'}) 

In [None]:
#Go.choropleth method (https://plotly.com/python/choropleth-maps/)
# Builds the map with the lower-level graph_objects API and shows it immediately.
fig = go.Figure(data=go.Choropleth(
    locations=vac_df['Location'], # Spatial coordinates
    z = vac_df['Percentage of Population Fully Vaccinated'].astype(float), # Data to be color-coded
    locationmode = 'USA-states', # set of locations match entries in `locations`
    colorscale = 'Viridis',
    colorbar_title = "Percentage of Population Fully Vaccinated",
))

fig.update_layout(
    title_text = 'Vaccination Rates by State',
    geo_scope='usa', # limit map scope to USA
)

fig.show()

In [None]:
# State map coloured by the share of the whole population that is fully vaccinated.
# The figure is only built here; it is displayed by a later cell.
fig_complete_total_pop = px.choropleth(vac_df,
                    locations='Location',
                    locationmode="USA-states",
                    scope="usa",
                    color='Percentage of Population Fully Vaccinated',
                    color_continuous_scale="aggrnyl",
                    )

In [None]:
# State map coloured by the share of the population with at least one dose.
# The figure is only built here; it is displayed by a later cell.
fig_at_least_1 = px.choropleth(vac_df,
                    locations='Location',
                    locationmode="USA-states",
                    scope="usa",
                    color='% of Population with at least 1 dose',
                    color_continuous_scale="twilight",
                    )

In [None]:
# State map coloured by the share of the 65+ population that is fully vaccinated.
# The figure is only built here; it is displayed by a later cell.
fig_complete_65_plus = px.choropleth(vac_df,
                    locations='Location',
                    locationmode="USA-states",
                    scope="usa",
                    color='% of Population Fully Vaccinated (65+)',
                    color_continuous_scale="algae",                 
                    )

In [None]:
# State map coloured by the share of the 65+ population with a bivalent booster.
# The figure is only built here; it is displayed by a later cell.
fig_bivalent_booster_65 = px.choropleth(vac_df,
                    locations='Location',
                    locationmode="USA-states",
                    scope="usa",
                    color='% of Population with bivalent booster (65+)',
                    color_continuous_scale="icefire",                  
                    )

In [None]:
fig_complete_total_pop

In [None]:
fig_at_least_1

In [None]:
fig_complete_65_plus

In [None]:
fig_bivalent_booster_65

In [None]:
#Pulling the percent-fully-vaccinated column out as a NumPy array
# (to_numpy returns an array, not a Python list)
percent_fully_vaccinated = vac_df['Percentage of Population Fully Vaccinated'].to_numpy()
print(percent_fully_vaccinated)

In [None]:
#boxplots showing spread of data across all 50 states and DC for selected columns
# The column names are the renamed (human-readable) ones, so this cell must
# run after the rename cell above.
boxplot = vac_df.boxplot(column=['Percentage of Population Fully Vaccinated', 
                                 '% of Population with at least 1 dose', 
                                 '% of Population Fully Vaccinated (65+)', 
                                 '% of Population with bivalent booster (65+)'], 
                         rot=45,
                         grid=True,
                         figsize = (15,10),
                        )
plt.title("Distribution of Vaccination Rates Across U.S. States")

In [None]:
#summary statistics for selected columns (across all 50 states and DC) 
# vac_df was renamed earlier in this work area, so the columns are selected
# by their new names (the same four used for the boxplot); the original CDC
# names would raise a KeyError here.
vac_df_short = vac_df[['Percentage of Population Fully Vaccinated',
                       '% of Population with at least 1 dose',
                       '% of Population Fully Vaccinated (65+)',
                       '% of Population with bivalent booster (65+)']]

vac_df_short.describe()

# Sarah Work Area

# Aaliyah Work Area