# A Comparison of State Use of SLFRF Funds for Vaccination Programs and Vaccination Rates in Each State

### Data Sources:
CDC - "COVID-19 Vaccinations in the United States, Jurisdiction"
csv downloaded 5/11/23
https://data.cdc.gov/Vaccinations/COVID-19-Vaccinations-in-the-United-States-Jurisdi/unsk-b7fc

EARN/EPI - "EARN SLFRF Workbook for Q4 2022" compiled by Dave Kamper of the Economic Policy Institute (dkamper@epi.org) from Treasury reports by states and local jurisdictions who received funding, and other data sources as detailed in the workbook.

# Imports - Aaliyah, Evan, Greg, Joanna, Kendal

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pprint import pprint
from pathlib import Path
from scipy.stats import linregress
import scipy.stats as st
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from sklearn.linear_model import LinearRegression

In [None]:
### Silence pandas' chained-assignment warning (SettingWithCopyWarning)
pd.options.mode.chained_assignment = None  # default='warn'

# Data Cleaning - Mostly Evan & Aaliyah

In [None]:
### Load csv file(s)
all_states_sheet = Path("Resources/EARN_all_states.csv")
state_summary_sheet = Path("Resources/state_summary.csv")

### Read csv file(s) as DataFrames
all_states_df = pd.read_csv(all_states_sheet, skipinitialspace= True, low_memory=False)
state_summary_df = pd.read_csv(state_summary_sheet, skipinitialspace= True)
vac_df = pd.read_csv('Resources/COVID-19_Vaccinations_in_the_United_States_Jurisdiction.csv')
vac_df_2 = pd.read_csv('Resources/COVID-19_Vaccinations_in_the_United_States_Jurisdiction.csv')

#change location to match state code
vac_df['Location'] = vac_df['Location'].str.replace('US_', '')

#remove spaces at the beginning and end of the string
all_states_df.columns = all_states_df.columns.str.strip()


### *EARN - Initial Data Cleaning* - Mostly Evan and Aaliyah

In [None]:
### Drop rows where the column has NaN value in all_states_df
    # source: https://towardsdatascience.com/how-to-drop-rows-in-pandas-dataframes-with-nan-values-in-certain-columns-7613ad1a7f25
    
all_states_df = all_states_df.dropna(subset=['Project Description'], how='all')

In [None]:
### Make the Project Description values all lowercase for value search in all_states_df:
all_states_df['Project Description'] = all_states_df['Project Description'].str.lower()

In [None]:
### Create dictionary in order to add a column of state name abbreviations to any DF:
    # source: https://gist.github.com/rogerallen/1583593

us_state_to_abbrev = {
    "Alabama": "AL",
    "Alaska": "AK",
    "Arizona": "AZ",
    "Arkansas": "AR",
    "California": "CA",
    "Colorado": "CO",
    "Connecticut": "CT",
    "Delaware": "DE",
    "Florida": "FL",
    "Georgia": "GA",
    "Hawaii": "HI",
    "Idaho": "ID",
    "Illinois": "IL",
    "Indiana": "IN",
    "Iowa": "IA",
    "Kansas": "KS",
    "Kentucky": "KY",
    "Louisiana": "LA",
    "Maine": "ME",
    "Maryland": "MD",
    "Massachusetts": "MA",
    "Michigan": "MI",
    "Minnesota": "MN",
    "Mississippi": "MS",
    "Missouri": "MO",
    "Montana": "MT",
    "Nebraska": "NE",
    "Nevada": "NV",
    "New Hampshire": "NH",
    "New Jersey": "NJ",
    "New Mexico": "NM",
    "New York": "NY",
    "North Carolina": "NC",
    "North Dakota": "ND",
    "Ohio": "OH",
    "Oklahoma": "OK",
    "Oregon": "OR",
    "Pennsylvania": "PA",
    "Rhode Island": "RI",
    "South Carolina": "SC",
    "South Dakota": "SD",
    "Tennessee": "TN",
    "Texas": "TX",
    "Utah": "UT",
    "Vermont": "VT",
    "Virginia": "VA",
    "Washington": "WA",
    "West Virginia": "WV",
    "Wisconsin": "WI",
    "Wyoming": "WY",
    "District of Columbia": "DC",
    "American Samoa": "AS",
    "Guam": "GU",
    "Northern Mariana Islands": "MP",
    "Puerto Rico": "PR",
    "United States Minor Outlying Islands": "UM",
    "Virgin Islands": "VI",
}

In [None]:
### Selecting the desired columns in state_summary_df and saving as new, reduced_df
reduced_df = state_summary_df[['State', 'Total state allocation (from the fed)',
                               'total state plus total local federal grant',
                               'Share of state allocation spent', 'Share of state allocation obligated',
                               'Share of state allocation budgeted', 'Total local allocation (from the fed)',
                               'Share of local spent', 'Share of local obligated', 'Share of local budgeted',
                               'Share of state + local spent']]



reduced_df['State'] = reduced_df['State'].replace('_',' ', regex=True)

### add abbreviated state name column and reorder so the abbrev is after full state name column:
reduced_df['Location'] = reduced_df['State'].map(us_state_to_abbrev)


reduced_df.rename(columns = {'State':'State/Territory'}, inplace = True)

In [None]:
### Convert the dollar-amount columns of reduced_df to numeric values.
numeric_cols = ['Total state allocation (from the fed)',
                'total state plus total local federal grant',
                'Total local allocation (from the fed)']

### Strip currency symbols, dashes and spaces first.
### The dollar pattern is a raw string so the backslash reaches the regex engine as an
### escaped literal "$" instead of being an invalid Python string escape.
reduced_df[numeric_cols] = reduced_df[numeric_cols].replace([r'\$', '-', ' '], '', regex=True)

### Remove thousands separators (applied to the whole frame, as before), then convert.
### Cells left empty by the stripping above become NaN.
reduced_df = reduced_df.replace(',', '', regex=True)
reduced_df[numeric_cols] = reduced_df[numeric_cols].apply(pd.to_numeric)

In [None]:
### Now replace percentage string values with a decimal float value dtype:
    # reduced_df[['Share of state allocation spent', 'Share of state allocation obligated', 'Share of state allocation budgeted']] = reduced_df[['Share of state allocation spent', 'Share of state allocation obligated', 'Share of state allocation budgeted']].str.rstrip('%').astype('float') / 100.0

convert_cols = ['Share of state allocation spent', 'Share of state allocation obligated', 'Share of state allocation budgeted', 'Share of local spent', 'Share of local obligated', 'Share of local budgeted', 'Share of state + local spent']

reduced_df = reduced_df.replace('%','', regex=True)

reduced_df[convert_cols] = reduced_df[convert_cols].astype(float)/100

### *CDC Vaccination - Initial Data Cleaning* - Greg's function

In [None]:
#function formats the CDC dataframe for US jurisdictions - see below for input formats
def CDC_format(df, key, date, add_str, drop_values):
    """Return the rows of a CDC jurisdiction dataframe for one reporting date.

    Rows are kept when the ``key`` value is present, the 'Date' column equals
    ``date``, and the ``key`` value is not listed in ``drop_values``. The
    surviving ``key`` values are then prefixed with ``add_str``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the ``key`` column and a 'Date' column.
    key : str
        Name of the column holding the jurisdiction code (e.g. 'Location').
    date : str
        Value of the 'Date' column to keep. It is compared with ``==``, so it
        must be written exactly as the column stores it.
    add_str : str
        Prefix prepended to every remaining ``key`` value.
    drop_values : list
        ``key`` values to exclude.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0-based index; ``df`` itself is not modified.
    """
    # Build the mask from the frame that was passed in rather than from a
    # module-level dataframe, so the function works for any caller and the
    # mask always shares the index of the rows it filters.
    keep = df[key].notna() & (df['Date'] == date) & ~df[key].isin(drop_values)

    # Copy before assigning so the write below never targets a view of `df`.
    out = df.loc[keep].copy()
    out[key] = add_str + out[key].astype(str)
    out.reset_index(drop=True, inplace=True)
    return out

In [None]:
# Jurisdiction codes to exclude; passed to CDC_format as drop_values.
drop = ['DD2','FM','AS','VI','BP2','IH2','GU','PN','PR','VA2','PW','US','MP','MH']
# Keep only the 12/28/2022 rows and prefix each Location with 'US_'.
vac_df = CDC_format(vac_df,'Location','12/28/2022','US_',drop)
# NOTE(review): this passes the vac_df that was just formatted above, not the vac_df_2 read
# from the CSV earlier, so 'US_' is prepended a second time here. The choropleth cell later
# removes every 'US_' from vac_df_2['Location'], which hides this - confirm it is intended.
vac_df_2 = CDC_format(vac_df,'Location','12/28/2022','US_',drop)

### *Creating New DataFrames for Analysis -* -- Aaliyah, Evan, Greg, Joanna, and Kendal (please check this area)

In [None]:
## Keywords used to pick out vaccine-related projects from the 'Project Description' column.
    # The terms are lower-case because the column is lower-cased before this step.
    # They are joined with '|' and handed to str.contains, which treats the pattern as a
    # regular expression by default, so a description matches if ANY term appears in it.
    # Fixed: 'emergency use authorization' was misspelled and could not match correctly
    # spelled descriptions.

search_term_list = ['immunize', 'immunization','access to vaccines', 'spikevax', 'bivalent', 'novavax', 'two-dose', 
                    'single-dose', 'emergency use authorization', 'vaccine coverage', 
                    'vaccine access', 'vaccine distribution', 'distribute vaccines', 'vaccine', 'vaccination', 'vaccinate', 'moderna', 'pfizer', 'johnson & johnson', 'janssen']

covid_projects_df = all_states_df[all_states_df['Project Description'].str.contains('|'.join(search_term_list))]



In [None]:
### Clean up values preventing change of data type to int
covid_projects_df[['Adopted Budget','Total Cumulative Obligations','Total Cumulative Expenditures']] = covid_projects_df[['Adopted Budget','Total Cumulative Obligations', 'Total Cumulative Expenditures']].replace(['-', ' '] ,'', regex=True)


numeric_cols = ['Adopted Budget','Total Cumulative Obligations', 'Total Cumulative Expenditures']


### Convert budget columns to int for summarizing in groupby:
covid_projects_df = covid_projects_df.replace(',','', regex=True)
covid_projects_df[numeric_cols] = covid_projects_df[numeric_cols].apply(pd.to_numeric)


In [None]:
### Group the vaccine-related projects by state and total the dollar columns.
budget_cols = ['Adopted Budget', 'Total Cumulative Obligations', 'Total Cumulative Expenditures']

# Select the columns to total BEFORE calling sum(). GroupBy.sum() does not take a list of
# columns; a list passed positionally is received as its first parameter (numeric_only),
# so the earlier form only worked because a non-empty list is truthy.
covid_sums_df = covid_projects_df.groupby('State/Territory', as_index=False)[budget_cols].sum()

### Add the abbreviated state name and order the columns so it follows the full state name:
covid_sums_df['Location'] = covid_sums_df['State/Territory'].map(us_state_to_abbrev)
covid_sums_df = covid_sums_df[['State/Territory', 'Location'] + budget_cols]

### Count the vaccine-related projects per state (non-null 'Project ID' values):
covid_counts_df = covid_projects_df.groupby(['State/Territory'], as_index=False).count()[['State/Territory', 'Project ID']]

### Merge the per-state project counts onto the per-state totals:
state_spending_df = pd.merge(covid_sums_df, covid_counts_df, how='inner', on='State/Territory')

### Rename the counted 'Project ID' column for clarity:
state_spending_df.rename(columns={'Project ID': 'Count of Vaccine Projects'}, inplace=True)


In [None]:
### merge this data frame with Evan's "state_spending_df". Merge on the state columns.
    # https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.merge.html
    # https://www.geeksforgeeks.org/how-to-join-pandas-dataframes-using-merge/#

### EARN_states combines the three budget columns from "All_US_Projects" sheet with the entire "State Summary Table" sheet.
### The three budget columns are filtered for covid projects, but all dollar value columns in the "State Summary Table" are not filtered by covid projects.

EARN_states = pd.merge(state_spending_df, reduced_df, how ='inner', on =(['State/Territory', 'Location']))


In [None]:
### Add a column with the share of federal money spent on vaccine-related projects per state:
###   [Total Cumulative Expenditures] / [total state plus total local federal grant]
### Both operands come from EARN_states so each state's spending is divided by that same
### state's grant. Taking the numerator from state_spending_df pairs rows by index label,
### and those labels stop lining up with EARN_states as soon as the inner merge drops rows.
EARN_states['Percent Spent on Covid Projects'] = (
    EARN_states['Total Cumulative Expenditures']
    / EARN_states['total state plus total local federal grant']
)

### Sort states from the highest share to the lowest.
EARN_states.sort_values(by=['Percent Spent on Covid Projects'], ascending=False, inplace=True)


In [None]:
### "all_us_projects_df" is for (2) from Joanna's slack message request:
all_us_projects_df = all_states_df[['Recipient Name', 'State/Territory', 'Recipient Type', 
                                    'Completion Status', 'Project Name', 'Expenditure Category Group', 'Expenditure Category', 
                                    'Project Description', 'Adopted Budget', 'Total Cumulative Obligations', 
                                    'Total Cumulative Expenditures']].copy()


all_us_projects_df['State/Territory'] = all_us_projects_df['State/Territory'].map(us_state_to_abbrev)
all_us_projects_df.rename(columns = {'State/Territory':'State'}, inplace = True)

all_us_projects_df[['Adopted Budget','Total Cumulative Obligations',
                   'Total Cumulative Expenditures']] = all_us_projects_df[['Adopted Budget',
       'Total Cumulative Obligations', 'Total Cumulative Expenditures']].replace(['-', ' '] ,'', regex=True)


numeric_cols = ['Adopted Budget',
       'Total Cumulative Obligations', 'Total Cumulative Expenditures']


### convert budget columns to int for summarizing in groupby:
all_us_projects_df = all_us_projects_df.replace(',','', regex=True)
all_us_projects_df[numeric_cols] = all_us_projects_df[numeric_cols].apply(pd.to_numeric)

In [None]:
### "us_covid_projects_df" is for (3) from Joanna's slack message:
us_covid_projects_df = all_us_projects_df[all_us_projects_df['Project Description'].str.contains('|'.join(search_term_list))]

In [None]:
# get all the columns we will be interested in into one dataframe
# NOTE: there is no Pop_Pct column for the administered_bivalent, and second_booster only for the age breakouts
# but we can extrapolate from their other population calculations to calculate these. For second_booster to get state numbers
# we have to add up the vaccines from the different manufacturers because we don't have them already summed.

vac_df = pd.DataFrame(data=vac_df, columns=["Location", "Distributed", "Administered", "Recip_Administered",
                                                   "Administered_Dose1_Recip", "Administered_Dose1_Pop_Pct",
                                                   "Administered_Dose1_Recip_5Plus", "Administered_Dose1_Recip_5PlusPop_Pct",
                                                  "Administered_Dose1_Recip_12Plus", "Administered_Dose1_Recip_12PlusPop_Pct",
                                                  "Administered_Dose1_Recip_18Plus", "Administered_Dose1_Recip_18PlusPop_Pct",
                                                  "Administered_Dose1_Recip_65Plus", "Administered_Dose1_Recip_65PlusPop_Pct",
                                                  "Series_Complete_Yes", "Series_Complete_Pop_Pct", "Series_Complete_5Plus",
                                                "Series_Complete_5PlusPop_Pct", "Series_Complete_12Plus",
                                                "Series_Complete_12PlusPop_Pct", "Series_Complete_18Plus",
                                                "Series_Complete_18PlusPop_Pct", "Series_Complete_65Plus",
                                                "Series_Complete_65PlusPop_Pct", "Additional_Doses",
                                                   "Additional_Doses_Vax_Pct", "Additional_Doses_5Plus",
                                                   "Additional_Doses_5Plus_Vax_Pct", "Additional_Doses_12Plus",
                                                   "Additional_Doses_12Plus_Vax_Pct", "Additional_Doses_18Plus",
                                                   "Additional_Doses_18Plus_Vax_Pct", "Additional_Doses_50Plus",
                                                   "Additional_Doses_50Plus_Vax_Pct", "Additional_Doses_65Plus",
                                                   "Additional_Doses_65Plus_Vax_Pct", "Second_Booster_50Plus",
                                                   "Second_Booster_50Plus_Vax_Pct", "Second_Booster_65Plus",
                                                   "Second_Booster_65Plus_Vax_Pct", "Second_Booster_Janssen",
                                                    "Second_Booster_Moderna", "Second_Booster_Pfizer",
                                                    "Second_Booster_Unk_Manuf", "Administered_Bivalent",
                                                   "Bivalent_Booster_5Plus", "Bivalent_Booster_5Plus_Pop_Pct",
                                                   "Bivalent_Booster_12Plus", "Bivalent_Booster_12Plus_Pop_Pct",
                                                   "Bivalent_Booster_18Plus", "Bivalent_Booster_18Plus_Pop_Pct",
                                                "Bivalent_Booster_65Plus", "Bivalent_Booster_65Plus_Pop_Pct"])

In [None]:
# remove commas from numeric columns
vac_df = vac_df.replace(',', '', regex=True)

# convert numeric columns to correct type
# This is every column selected into vac_df except "Location". Three columns that were
# previously left out of the conversion are now included: "Series_Complete_5PlusPop_Pct",
# "Bivalent_Booster_65Plus" and "Bivalent_Booster_65Plus_Pop_Pct".
numeric_cols = ["Distributed", "Administered", "Recip_Administered", "Administered_Dose1_Recip", "Administered_Dose1_Pop_Pct",
                "Administered_Dose1_Recip_5Plus", "Administered_Dose1_Recip_5PlusPop_Pct", "Administered_Dose1_Recip_12Plus",
                "Administered_Dose1_Recip_12PlusPop_Pct", "Administered_Dose1_Recip_18Plus",
                "Administered_Dose1_Recip_18PlusPop_Pct", "Administered_Dose1_Recip_65Plus",
                "Administered_Dose1_Recip_65PlusPop_Pct", "Series_Complete_Yes", "Series_Complete_Pop_Pct",
                "Series_Complete_5Plus", "Series_Complete_5PlusPop_Pct", "Series_Complete_12Plus",
                "Series_Complete_12PlusPop_Pct", "Series_Complete_18Plus",
                "Series_Complete_18PlusPop_Pct", "Series_Complete_65Plus", "Series_Complete_65PlusPop_Pct", "Additional_Doses",
                "Additional_Doses_Vax_Pct", "Additional_Doses_5Plus", "Additional_Doses_5Plus_Vax_Pct", "Additional_Doses_12Plus",
                "Additional_Doses_12Plus_Vax_Pct", "Additional_Doses_18Plus", "Additional_Doses_18Plus_Vax_Pct",
                "Additional_Doses_50Plus", "Additional_Doses_50Plus_Vax_Pct", "Additional_Doses_65Plus",
                "Additional_Doses_65Plus_Vax_Pct", "Second_Booster_50Plus", "Second_Booster_50Plus_Vax_Pct",
                "Second_Booster_65Plus", "Second_Booster_65Plus_Vax_Pct", "Second_Booster_Janssen",
                "Second_Booster_Moderna", "Second_Booster_Pfizer", "Second_Booster_Unk_Manuf", "Administered_Bivalent",
                "Bivalent_Booster_5Plus", "Bivalent_Booster_5Plus_Pop_Pct", "Bivalent_Booster_12Plus",
                "Bivalent_Booster_12Plus_Pop_Pct", "Bivalent_Booster_18Plus", "Bivalent_Booster_18Plus_Pop_Pct",
                "Bivalent_Booster_65Plus", "Bivalent_Booster_65Plus_Pop_Pct"]
vac_df[numeric_cols] = vac_df[numeric_cols].apply(pd.to_numeric)

In [None]:
#create df with only columns related to choropleth maps

#change location to match state code for choropleth maps
vac_df_2['Location'] = vac_df_2['Location'].str.replace('US_', '')

choropleth_vac_df = vac_df_2[['Location', 
                            'Distributed', 
                            'Administered', 
                            'Administered_Dose1_Pop_Pct', 
                            'Series_Complete_Pop_Pct', 
                            'Series_Complete_5PlusPop_Pct', 
                            'Series_Complete_12PlusPop_Pct', 
                            'Series_Complete_18PlusPop_Pct', 
                            'Series_Complete_65PlusPop_Pct', 
                            'Additional_Doses_Vax_Pct', 
                            'Additional_Doses_65Plus_Vax_Pct', 
                            'Second_Booster_65Plus_Vax_Pct', 
                            'Bivalent_Booster_5Plus_Pop_Pct', 
                            'Bivalent_Booster_12Plus_Pop_Pct', 
                            'Bivalent_Booster_18Plus_Pop_Pct', 
                            'Bivalent_Booster_65Plus_Pop_Pct']]
choropleth_vac_df

# Analysis & Visualization - Mostly Greg, Joanna, and Kendal -- please check that the linear regression stuff matches and that the DFs it is based on are as they should be

### *Linear Regression -*

In [None]:
# calculate totals for second booster
vac_df["Second_Booster_Total"] = (vac_df["Second_Booster_Janssen"] + vac_df["Second_Booster_Moderna"]
                                + vac_df["Second_Booster_Pfizer"] + vac_df["Second_Booster_Unk_Manuf"])
# find their population number... ok this is off. ???
# I don't know why it is appearing they used different population numbers. Something is weird here. We could just use a number
# from the census, or just ignore the second booster.
vac_df["Pop1"] = vac_df["Series_Complete_Yes"] / (vac_df["Series_Complete_Pop_Pct"]/100)
vac_df["Pop2"] = vac_df["Administered_Dose1_Recip"] / (vac_df["Administered_Dose1_Pop_Pct"]/100)

vac_pops_df = pd.DataFrame(data=vac_df, columns=["Location", "Pop1", "Pop2"])


In [None]:
# df with vax data for all ages
vac_all_ages_df = pd.DataFrame(data=vac_df, columns=["Location", "Distributed", "Administered", "Recip_Administered",
                                                   "Administered_Dose1_Recip", "Administered_Dose1_Pop_Pct",
                                                    "Series_Complete_Yes", "Series_Complete_Pop_Pct",
                                                     "Additional_Doses", "Additional_Doses_Vax_Pct", "Administered_Bivalent"])

# add 'Dose Differential' column to track doses administered to nonresidents. Negative number = doses leaving the state
vac_all_ages_df["Dose Differential"] = vac_all_ages_df["Administered"] - vac_all_ages_df["Recip_Administered"]
vac_all_ages_df["Dose Diff. as Pct of Doses Given"] = abs(vac_all_ages_df["Dose Differential"] / vac_all_ages_df["Administered"])
vac_all_ages_df["Dose Diff. as Pct of Residents Vaxxed"] = abs(vac_all_ages_df["Dose Differential"] / vac_all_ages_df["Recip_Administered"])


In [None]:
# df with dose differential between doses administered in a state and doses received by residents of a state
vac_dd_df = pd.DataFrame(data=vac_all_ages_df, columns=["Location", "Distributed", "Administered", "Recip_Administered", "Dose Differential",
                         "Dose Diff. as Pct of Doses Given", "Dose Diff. as Pct of Residents Vaxxed", "Administered_Dose1_Pop_Pct", "Series_Complete_Pop_Pct",
                         "Additional_Doses_Vax_Pct"])

In [None]:
# get rows with negative dose differential (states that administered lots of vaccine to people living elsewhere)
# sort in order of large differentials to small (as percent of total doses given)
# for example: in NM at least 3.8% of the doses were given to people who lived elsewhere
# NOTE(review): "Dose Differential" is computed as Administered - Recip_Administered. If
# Recip_Administered counts doses received by the state's residents (as the vac_dd_df comment
# describes), a negative value means FEWER doses were given in the state than its residents
# received - the opposite of "administered ... to people living elsewhere". Confirm which
# group this frame is meant to hold.
# NOTE: the `< 1` test also keeps rows whose differential is exactly 0.
vac_dd_neg_df = vac_dd_df[vac_dd_df['Dose Differential'] < 1]
vac_dd_neg_df = vac_dd_neg_df.sort_values(by=['Dose Diff. as Pct of Doses Given'], ascending=False)

In [None]:
# get rows with positive dose differential (states with a lot of residents who were vaccinated elsewhere)
# sort in order of large differentials to small (as percent of total doses given)
# for example: in Arizona, at least 2.6% of the vaccinated population received doses elsewhere.
# NOTE(review): "Dose Differential" is computed as Administered - Recip_Administered. If
# Recip_Administered counts doses received by the state's residents (as the vac_dd_df comment
# describes), a positive value means MORE doses were given in the state than its residents
# received - the opposite of "residents who were vaccinated elsewhere". Confirm which group
# this frame is meant to hold.
vac_dd_pos_df = vac_dd_df[vac_dd_df['Dose Differential'] >= 1]
vac_dd_pos_df = vac_dd_pos_df.sort_values(by=['Dose Diff. as Pct of Residents Vaxxed'], ascending=False)

In [None]:
# Checking on second booster columns
vac_secondbooster_df = pd.DataFrame(data=vac_df, columns=["Location", "Second_Booster_50Plus",
                                                   "Second_Booster_50Plus_Vax_Pct", "Second_Booster_65Plus",
                                                   "Second_Booster_65Plus_Vax_Pct", "Second_Booster_Janssen",
                                                    "Second_Booster_Moderna", "Second_Booster_Pfizer",
                                                    "Second_Booster_Unk_Manuf"])

In [None]:
# find how many doses were distributed vs administered
# calculate percent
# sort alphabetically by state
vac_waste_df = pd.DataFrame(data=vac_df, columns=["Location", "Distributed", "Administered"])
vac_waste_df["Pct. Administered"] = vac_waste_df["Administered"] / vac_waste_df["Distributed"]
vac_waste_df.sort_values('Location')

In [None]:
# Show best 10 states in vaccine distribution percentage
vac_waste_best_df = vac_waste_df.sort_values('Pct. Administered', ascending=False)
vac_waste_best_df.head(10)

In [None]:
# Show worst 10 states in vaccine distribution percentage
vac_waste_worst_df = vac_waste_df.sort_values('Pct. Administered', ascending=True)
vac_waste_worst_df.head(10)

In [None]:
# whole pop info
# NOTE: There is no percentage for the entire pop for bivalent so included the 5+ pop. If we have time, will pull in the same
# census data they used to get the correct pct
# second booster only has 50+ and 65+ % -- I'm sure there must be some reason for this, not sure what
vac_whole_pop_df = pd.DataFrame(data=vac_df, columns=["Location", "Distributed", "Administered", "Recip_Administered",
                                                   "Administered_Dose1_Recip", "Administered_Dose1_Pop_Pct",
                                                   "Series_Complete_Yes", "Series_Complete_Pop_Pct", 
                                                    "Additional_Doses", "Additional_Doses_Vax_Pct", "Second_Booster_Total", 
                                                    "Second_Booster_Janssen", "Second_Booster_Moderna", "Second_Booster_Pfizer",
                                                    "Second_Booster_Unk_Manuf", "Administered_Bivalent", "Bivalent_Booster_5Plus", 
                                                      "Bivalent_Booster_5Plus_Pop_Pct"])

In [None]:
vac_5plus_df = pd.DataFrame(data=vac_df, columns=["Location", 
                                                   "Administered_Dose1_Recip_5Plus", "Administered_Dose1_Recip_5PlusPop_Pct",
                                                  "Series_Complete_5Plus", "Series_Complete_5PlusPop_Pct",
                                                   "Additional_Doses_5Plus", "Additional_Doses_5Plus_Vax_Pct", 
                                                   "Bivalent_Booster_5Plus", "Bivalent_Booster_5Plus_Pop_Pct"])

In [None]:
# 12 plus vaccinations
vac_12plus_df = pd.DataFrame(data=vac_df, columns=["Location", "Administered_Dose1_Recip_12Plus",
                                                   "Administered_Dose1_Recip_12PlusPop_Pct", "Series_Complete_12Plus",
                                                   "Series_Complete_12PlusPop_Pct", "Additional_Doses_12Plus",
                                                   "Additional_Doses_12Plus_Vax_Pct", "Bivalent_Booster_12Plus",
                                                   "Bivalent_Booster_12Plus_Pop_Pct"])

In [None]:
# 18 plus vaccinations
vac_18plus_df = pd.DataFrame(data=vac_df, columns=["Location", "Administered_Dose1_Recip_18Plus",
                                                   "Administered_Dose1_Recip_18PlusPop_Pct", "Series_Complete_18Plus",
                                                   "Series_Complete_18PlusPop_Pct", "Additional_Doses_18Plus",
                                                   "Additional_Doses_18Plus_Vax_Pct", "Bivalent_Booster_18Plus",
                                                   "Bivalent_Booster_18Plus_Pop_Pct"])

In [None]:
# 65 plus vaccinations
vac_65plus_df = pd.DataFrame(data=vac_df, columns=["Location", "Administered_Dose1_Recip_65Plus",
                                                   "Administered_Dose1_Recip_65PlusPop_Pct", "Series_Complete_65Plus",
                                                   "Series_Complete_65PlusPop_Pct", "Additional_Doses_65Plus",
                                                   "Additional_Doses_65Plus_Vax_Pct", "Second_Booster_50Plus",
                                                   "Second_Booster_50Plus_Vax_Pct", "Second_Booster_65Plus",
                                                   "Second_Booster_65Plus_Vax_Pct", "Bivalent_Booster_65Plus",
                                                   "Bivalent_Booster_65Plus_Pop_Pct"])

In [None]:
# first dose info

vac_firstdose_df = pd.DataFrame(data=vac_df, columns=["Location", "Administered_Dose1_Recip", "Administered_Dose1_Pop_Pct",
                                                   "Administered_Dose1_Recip_5Plus", "Administered_Dose1_Recip_5PlusPop_Pct",
                                                  "Administered_Dose1_Recip_12Plus", "Administered_Dose1_Recip_12PlusPop_Pct",
                                                  "Administered_Dose1_Recip_18Plus", "Administered_Dose1_Recip_18PlusPop_Pct",
                                                  "Administered_Dose1_Recip_65Plus", "Administered_Dose1_Recip_65PlusPop_Pct"])

In [None]:
# series complete info
vac_series_complete_df = pd.DataFrame(data=vac_df, columns=["Location", "Series_Complete_Yes", "Series_Complete_Pop_Pct",
                                                            "Series_Complete_5Plus", "Series_Complete_5PlusPop_Pct",
                                                            "Series_Complete_12Plus", "Series_Complete_12PlusPop_Pct",
                                                            "Series_Complete_18Plus", "Series_Complete_18PlusPop_Pct",
                                                            "Series_Complete_65Plus", "Series_Complete_65PlusPop_Pct"])

In [None]:
vac_additional_doses_df = pd.DataFrame(data=vac_df, columns=["Location", "Additional_Doses",
                                                   "Additional_Doses_Vax_Pct", "Additional_Doses_5Plus",
                                                   "Additional_Doses_5Plus_Vax_Pct", "Additional_Doses_12Plus",
                                                   "Additional_Doses_12Plus_Vax_Pct", "Additional_Doses_18Plus",
                                                   "Additional_Doses_18Plus_Vax_Pct", "Additional_Doses_50Plus",
                                                   "Additional_Doses_50Plus_Vax_Pct", "Additional_Doses_65Plus",
                                                   "Additional_Doses_65Plus_Vax_Pct"])

In [None]:
vac_second_booster_df = pd.DataFrame(data=vac_df, columns=["Location", "Second_Booster_50Plus",
                                                           "Second_Booster_50Plus_Vax_Pct", "Second_Booster_65Plus",
                                                           "Second_Booster_65Plus_Vax_Pct", "Second_Booster_Total",
                                                           "Second_Booster_Janssen", "Second_Booster_Moderna",
                                                           "Second_Booster_Pfizer", "Second_Booster_Unk_Manuf"])


In [None]:
vac_bivalent_df = pd.DataFrame(data=vac_df, columns=["Location", "Administered_Bivalent", "Bivalent_Booster_5Plus",
                                                     "Bivalent_Booster_5Plus_Pop_Pct", "Bivalent_Booster_12Plus",
                                                     "Bivalent_Booster_12Plus_Pop_Pct", "Bivalent_Booster_18Plus",
                                                     "Bivalent_Booster_18Plus_Pop_Pct", "Bivalent_Booster_65Plus",
                                                     "Bivalent_Booster_65Plus_Pop_Pct"])

In [None]:
#Adding regression analysis for combined financial and vaccination data (5/16/2023 - GM)

combined_df = vac_whole_pop_df.merge(EARN_states, how = 'inner',on = 'Location')
combined_df.columns

In [None]:
combined_df = combined_df.dropna(axis = 0, how = 'any')
combined_df.shape

In [None]:
def pd_reg(df, x, y, x_text, y_text):
    """Plot ``y`` against ``x`` with a fitted line and report the linear fit.

    Draws a seaborn ``lmplot`` of the two columns, fits a scikit-learn
    ``LinearRegression`` of ``y`` on ``x``, prints the R^2 score, and writes
    the fitted equation on the plot before showing it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both columns.
    x : str
        Name of the feature (independent variable) column.
    y : str
        Name of the target (dependent variable) column.
    x_text, y_text : float
        Data coordinates at which the equation text is placed.

    Returns
    -------
    sklearn.linear_model.LinearRegression
        The fitted model.
    """
    sns.lmplot(data=df, x=x, y=y)

    features = df[[x]]  # double brackets keep this a 2-D frame for scikit-learn
    response = df[y]
    model = LinearRegression().fit(features, response)

    print(f"For x = {x} and y = {y}:")
    print(f"The R^2 score is: {model.score(features, response)}")

    # coef_ is a one-element array when there is a single feature. Take the element
    # explicitly: formatting the array itself relies on NumPy's deprecated implicit
    # conversion of a 1-element array to a scalar.
    slope = model.coef_[0]
    plt.text(x_text, y_text, f"y = {slope:.2f}x + {model.intercept_:.1f}", color='black', fontsize=16)
    plt.show()
    return model

In [None]:
#Adding the names of the columns that we want to use as regression features
feat_list = ['Percent Spent on Covid Projects', 'Count of Vaccine Projects', 'total state plus total local federal grant']
target = 'Administered_Dose1_Pop_Pct'

In [None]:
for var in feat_list:
    print(f"The regression feature is: {var}")
    pd_reg(combined_df, var, target, combined_df[var].mean(), combined_df[target].mean())

In [None]:
target = 'Series_Complete_Pop_Pct'

In [None]:
for var in feat_list:
    print(f"The regression feature is: {var}")
    pd_reg(combined_df, var, target, combined_df[var].mean(), combined_df[target].mean())

In [None]:
#Looking at vac_dd_neg_df
vac_dd_neg_fin = vac_dd_neg_df.merge(EARN_states, how = 'inner',on = 'Location')
vac_dd_neg_fin.shape

In [None]:
target = 'Dose Diff. as Pct of Doses Given'

In [None]:
for var in feat_list:
    print(f"The regression feature is: {var}")
    pd_reg(vac_dd_neg_fin, var, target, vac_dd_neg_fin[var].mean(), vac_dd_neg_fin[target].mean())

In [None]:
# Looking at vac_dd_pos_df: attach the EARN financial columns (inner join on `Location`)
# and show the size of the result.
vac_dd_pos_fin = pd.merge(vac_dd_pos_df, EARN_states, on="Location", how="inner")
vac_dd_pos_fin.shape

In [None]:
# Switch the regression target column for the next set of fits.
target = "Dose Diff. as Pct of Residents Vaxxed"

In [None]:
# One regression per feature against the current `target`, on vac_dd_pos_fin;
# the equation label is placed at the point of means of the two columns.
for var in feat_list:
    print(f"The regression feature is: {var}")
    pd_reg(
        df=vac_dd_pos_fin,
        x=var,
        y=target,
        x_text=vac_dd_pos_fin[var].mean(),
        y_text=vac_dd_pos_fin[target].mean(),
    )

### *Choropleth Maps* - Kendal, based on original code that Greg found via the plotly express module

In [None]:
# Work out one colour range that covers all four bivalent-booster percentage
# columns, for use as the continuous colour scale in the bivalent choropleth maps.
choropleth_vac_bivalent = choropleth_vac_df.loc[
    :,
    [
        "Bivalent_Booster_5Plus_Pop_Pct",
        "Bivalent_Booster_12Plus_Pop_Pct",
        "Bivalent_Booster_18Plus_Pop_Pct",
        "Bivalent_Booster_65Plus_Pop_Pct",
    ],
]

# Per-column extremes first, then the extreme of those extremes.
min_bivalent_df = choropleth_vac_bivalent.min()
min_color_range_bivalent = min_bivalent_df.min()
max_bivalent_df = choropleth_vac_bivalent.max()
max_color_range_bivalent = max_bivalent_df.max()

print(
    "The color range for any bivalent vaccination status choropleth map should be "
    f"({min_color_range_bivalent}, {max_color_range_bivalent})."
)



In [None]:
# Work out one colour range that covers the "at least one dose" column and every
# "series complete" percentage column, for use as the continuous colour scale in
# the fully vaccinated and partially-or-fully vaccinated choropleth maps.
choropleth_vac_complete = choropleth_vac_df.loc[
    :,
    [
        "Administered_Dose1_Pop_Pct",
        "Series_Complete_Pop_Pct",
        "Series_Complete_5PlusPop_Pct",
        "Series_Complete_12PlusPop_Pct",
        "Series_Complete_18PlusPop_Pct",
        "Series_Complete_65PlusPop_Pct",
    ],
]

# Per-column extremes first, then the extreme of those extremes.
min_df = choropleth_vac_complete.min()
min_color_range = min_df.min()
max_df = choropleth_vac_complete.max()
max_color_range = max_df.max()

print(
    "The color range for any complete vaccination choropleth map should be "
    f"({min_color_range}, {max_color_range})."
)

In [None]:
# State-level map of `Series_Complete_Pop_Pct`.
# The fixed (52, 95) colour range is the same one used by the other
# fully / partially vaccinated maps, so shades are comparable between figures.
fig_complete_total_pop = px.choropleth(
    choropleth_vac_df,
    title="Vaccination Status by State - Fully Vaccinated",
    color="Series_Complete_Pop_Pct",
    labels={"Series_Complete_Pop_Pct": "% of Population Fully Vaccinated"},
    range_color=(52, 95),
    color_continuous_scale="viridis_r",
    locations="Location",
    locationmode="USA-states",
    scope="usa",
)
fig_complete_total_pop

In [None]:
# State-level map of `Series_Complete_5PlusPop_Pct`.
# The fixed (52, 95) colour range is the same one used by the other
# fully / partially vaccinated maps, so shades are comparable between figures.
fig_complete_5plus = px.choropleth(
    choropleth_vac_df,
    title="Vaccination Status by State - Fully Vaccinated (5+)",
    color="Series_Complete_5PlusPop_Pct",
    labels={"Series_Complete_5PlusPop_Pct": "% of 5+ Population Fully Vaccinated"},
    range_color=(52, 95),
    color_continuous_scale="viridis_r",
    locations="Location",
    locationmode="USA-states",
    scope="usa",
)
fig_complete_5plus

In [None]:
# State-level map of `Series_Complete_12PlusPop_Pct`.
# The fixed (52, 95) colour range is the same one used by the other
# fully / partially vaccinated maps, so shades are comparable between figures.
fig_complete_12plus = px.choropleth(
    choropleth_vac_df,
    title="Vaccination Status by State - Fully Vaccinated (12+)",
    color="Series_Complete_12PlusPop_Pct",
    labels={"Series_Complete_12PlusPop_Pct": "% of 12+ Population Fully Vaccinated"},
    range_color=(52, 95),
    color_continuous_scale="viridis_r",
    locations="Location",
    locationmode="USA-states",
    scope="usa",
)
fig_complete_12plus

In [None]:
# State-level map of `Series_Complete_18PlusPop_Pct`.
# The fixed (52, 95) colour range is the same one used by the other
# fully / partially vaccinated maps, so shades are comparable between figures.
fig_complete_18plus = px.choropleth(
    choropleth_vac_df,
    title="Vaccination Status by State - Fully Vaccinated (18+)",
    color="Series_Complete_18PlusPop_Pct",
    labels={"Series_Complete_18PlusPop_Pct": "% of 18+ Population Fully Vaccinated"},
    range_color=(52, 95),
    color_continuous_scale="viridis_r",
    locations="Location",
    locationmode="USA-states",
    scope="usa",
)
fig_complete_18plus

In [None]:
# State-level map of `Series_Complete_65PlusPop_Pct`.
# The fixed (52, 95) colour range is the same one used by the other
# fully / partially vaccinated maps, so shades are comparable between figures.
fig_complete_65_plus = px.choropleth(
    choropleth_vac_df,
    title="Vaccination Status by State & Age - Fully Vaccinated (65+)",
    color="Series_Complete_65PlusPop_Pct",
    labels={"Series_Complete_65PlusPop_Pct": "% of 65+ Population Fully Vaccinated"},
    range_color=(52, 95),
    color_continuous_scale="viridis_r",
    locations="Location",
    locationmode="USA-states",
    scope="usa",
)
fig_complete_65_plus

In [None]:
# State-level map of `Administered_Dose1_Pop_Pct`.
# The fixed (52, 95) colour range is the same one used by the fully
# vaccinated maps, so shades are comparable between figures.
fig_at_least_1 = px.choropleth(
    choropleth_vac_df,
    title="Vaccination Status by State - Partially or Fully Vaccinated",
    color="Administered_Dose1_Pop_Pct",
    labels={"Administered_Dose1_Pop_Pct": "% of Population Partially or Fully Vaccinated"},
    range_color=(52, 95),
    color_continuous_scale="viridis_r",
    locations="Location",
    locationmode="USA-states",
    scope="usa",
)
fig_at_least_1

In [None]:
# State-level map of `Bivalent_Booster_5Plus_Pop_Pct`.
# The fixed (5, 63) colour range is the same one used by the other
# bivalent booster maps, so shades are comparable between figures.
fig_bivalent_booster_5 = px.choropleth(
    choropleth_vac_df,
    title="Bivalent Booster Status by State - (5+)",
    color="Bivalent_Booster_5Plus_Pop_Pct",
    labels={"Bivalent_Booster_5Plus_Pop_Pct": "% of 5+ Population with Bivalent Booster"},
    range_color=(5, 63),
    color_continuous_scale="magma_r",
    locations="Location",
    locationmode="USA-states",
    scope="usa",
)
fig_bivalent_booster_5

In [None]:
# State-level map of `Bivalent_Booster_12Plus_Pop_Pct`.
# The fixed (5, 63) colour range is the same one used by the other
# bivalent booster maps, so shades are comparable between figures.
fig_bivalent_booster_12 = px.choropleth(
    choropleth_vac_df,
    title="Bivalent Booster Status by State - (12+)",
    color="Bivalent_Booster_12Plus_Pop_Pct",
    labels={"Bivalent_Booster_12Plus_Pop_Pct": "% of 12+ Population with Bivalent Booster"},
    range_color=(5, 63),
    color_continuous_scale="magma_r",
    locations="Location",
    locationmode="USA-states",
    scope="usa",
)
fig_bivalent_booster_12

In [None]:
# State-level map of `Bivalent_Booster_18Plus_Pop_Pct`.
# The fixed (5, 63) colour range is the same one used by the other
# bivalent booster maps, so shades are comparable between figures.
fig_bivalent_booster_18 = px.choropleth(
    choropleth_vac_df,
    title="Bivalent Booster Status by State - (18+)",
    color="Bivalent_Booster_18Plus_Pop_Pct",
    labels={"Bivalent_Booster_18Plus_Pop_Pct": "% of 18+ Population with Bivalent Booster"},
    range_color=(5, 63),
    color_continuous_scale="magma_r",
    locations="Location",
    locationmode="USA-states",
    scope="usa",
)
fig_bivalent_booster_18

In [None]:
# State-level map of `Bivalent_Booster_65Plus_Pop_Pct`.
# The fixed (5, 63) colour range is the same one used by the other
# bivalent booster maps, so shades are comparable between figures.
fig_bivalent_booster_65 = px.choropleth(
    choropleth_vac_df,
    title="Bivalent Booster Status by State - (65+)",
    color="Bivalent_Booster_65Plus_Pop_Pct",
    labels={"Bivalent_Booster_65Plus_Pop_Pct": "% of 65+ Population with Bivalent Booster"},
    range_color=(5, 63),
    color_continuous_scale="magma_r",
    locations="Location",
    locationmode="USA-states",
    scope="usa",
)
fig_bivalent_booster_65

In [None]:
# Box plots showing the spread of data across all 50 states and DC for four
# selected vaccination-rate columns, drawn side by side and saved to disk.
boxplot = vac_df.boxplot(
    figsize=(20, 15),
    grid=True,
    column=[
        "Series_Complete_Pop_Pct",
        "Administered_Dose1_Pop_Pct",
        "Series_Complete_65PlusPop_Pct",
        "Bivalent_Booster_65Plus_Pop_Pct",
    ],
)
plt.title("Distribution of Vaccination Rates Across U.S. States")
# Replace the raw column names on the x axis with readable labels,
# given in the same order as the `column` list above.
plt.xticks(
    [1, 2, 3, 4],
    [
        '% Pop. Fully Vaccinated',
        '% Pop. Partially  or Fully Vaccinated',
        '% Pop. Fully Vaccinated - 65+',
        '% Pop. Bivalent Booster - 65+',
    ],
)
plt.savefig('Resources/boxplot.png')

In [None]:
# Percentage of all projects that fall in each of the 7 expenditure category groups.
# Counting per group gives, for every column, the number of non-null entries;
# the 'State' column's counts are used as the per-group project count.
spending_overall = all_us_projects_df.groupby('Expenditure Category Group').count()
total_projects = spending_overall['State'].sum()
categories_percentage_overall = (
    spending_overall['State']
    .div(total_projects)
    .mul(100)
    .to_frame()
    .rename(columns={'State': '% of Total Projects'})
)
categories_percentage_overall

In [None]:
# Percentage of covid-related projects that fall in each of the 7 expenditure category groups.
# Counting per group gives, for every column, the number of non-null entries;
# the 'Recipient Name' column's counts are used as the per-group project count.
spending_covid_project = covid_projects_df.groupby("Expenditure Category Group").count()
total_projects_covid = spending_covid_project['Recipient Name'].sum()
categories_percentage_covid = (
    spending_covid_project['Recipient Name']
    .div(total_projects_covid)
    .mul(100)
    .to_frame()
    .rename(columns={'Recipient Name': '% of Covid Projects'})
)
# Displayed only: the result is not assigned, so the group stays in the index.
categories_percentage_covid.reset_index()

In [None]:
# Pie chart of expenditure categories - all projects.
overall_spend_group = categories_percentage_overall['% of Total Projects']
# Short legend names, assigned by position.
# NOTE(review): assumes the grouped rows appear in exactly this order - confirm.
categories_percentage_overall['Category'] = [
    'Public Health',
    'Negative Economic Impacts',
    'Public Health - Neg. Ec. Imp.',
    'Premium Pay',
    'Infrastructure',
    'Revenue Replacement',
    'Administrative',
]
plt.pie(overall_spend_group, radius=2, autopct='%1.0f%%')
# The legend is anchored outside the enlarged pie.
plt.legend(
    categories_percentage_overall['Category'],
    title='All Projects',
    loc="center right",
    bbox_to_anchor=(1.6, 0, 0.5, 1),
)
plt.savefig('Resources/category_spending_all', bbox_inches='tight')

In [None]:
# Pie chart of expenditure categories - covid projects.
covid_spend_group = categories_percentage_covid['% of Covid Projects']
# Short legend names, assigned by position on the *overall* table.
# NOTE(review): the legend below takes its labels from categories_percentage_overall,
# not from the covid table - this presumably relies on both tables having the same
# groups in the same order; confirm.
categories_percentage_overall['Category'] = [
    'Public Health',
    'Negative Economic Impacts',
    'Public Health - Neg. Ec. Imp.',
    'Premium Pay',
    'Infrastructure',
    'Revenue Replacement',
    'Administrative',
]
plt.pie(covid_spend_group, radius=2, autopct='%1.0f%%')
# The legend is anchored outside the enlarged pie.
plt.legend(
    categories_percentage_overall['Category'],
    title='Covid Projects',
    loc="center right",
    bbox_to_anchor=(1.6, 0, 0.5, 1),
)
plt.savefig('Resources/category_spending_covid', bbox_inches='tight')
