In [4]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as st
from census import Census

# Census API key and client.
# The key is imported from the `config` module rather than written in the
# notebook; only the Census key is loaded here (no gmaps key is used).
from config import (api_key)
# year=2019 is set on the client; the ACS query further down does not pass a
# year of its own.
c = Census(api_key, year=2019)


In [6]:
# Paths of the five input CSVs: three vaccine-allocation files (Pfizer,
# Moderna, Janssen), the cases-and-deaths time series, and the state-code
# lookup that is later joined on its 'Postal Code' column.
csv_pfe =  'source_data/COVID-19_Vaccine_Distribution_Allocations_by_Jurisdiction_-_Pfizer.csv'
csv_mrna = 'source_data/COVID-19_Vaccine_Distribution_Allocations_by_Jurisdiction_-_Moderna.csv'
csv_jnj =  'source_data/COVID-19_Vaccine_Distribution_Allocations_by_Jurisdiction_-_Janssen.csv'
csv_c_d =  'source_data/United_States_COVID-19_Cases_and_Deaths_by_State_over_Time.csv'
state_code = 'source_data/state_code.csv'

# Read each file into its own DataFrame, in the same order as the paths above.
pfe_df, mrna_df, jnj_df, c_d_df, state_code_df = (
    pd.read_csv(csv_path)
    for csv_path in (csv_pfe, csv_mrna, csv_jnj, csv_c_d, state_code)
)


In [7]:
# Combine the three allocation tables into one wide frame.
# Every merge is an outer join on the same two keys, so a jurisdiction/week
# present in any one file is kept.
merge_keys = ['Jurisdiction', 'Week of Allocations']

# Pfizer + Moderna: both files carry '1st Dose Allocations' and
# '2nd Dose Allocations', so pandas suffixes them with _x (Pfizer) and _y (Moderna).
pfe_mrna = pfe_df.merge(mrna_df, on=merge_keys, how='outer')

# Add Janssen: the rename below expects its '1st Dose Allocations' column to
# come through unsuffixed.
pfe_mrna_jnj = pfe_mrna.merge(jnj_df, on=merge_keys, how='outer')
# pfe_mrna_jnj.to_csv('source_data/new_dataframes/Combined_Vaccine_df.csv')

# Give the dose columns vendor-specific names, then replace every NaN left by
# the outer joins with 0 so the columns can be summed later.
dose_column_names = {
    '1st Dose Allocations_x': 'PFE Dose-1',
    '2nd Dose Allocations_x': 'PFE Dose-2',
    '1st Dose Allocations_y': 'MRNA Dose-1',
    '2nd Dose Allocations_y': 'MRNA Dose-2',
    '1st Dose Allocations': 'JNJ Dose-1',
}
pfe_mrna_jnj_final = pfe_mrna_jnj.rename(columns=dose_column_names).fillna(0)

pfe_mrna_jnj_final

Unnamed: 0,Jurisdiction,Week of Allocations,PFE Dose-1,PFE Dose-2,MRNA Dose-1,MRNA Dose-2,JNJ Dose-1
0,Connecticut,04/19/2021,54990,54990,40400.0,40400.0,0.0
1,Maine,04/19/2021,21060,21060,15400.0,15400.0,0.0
2,Massachusetts,04/19/2021,105300,105300,77700.0,77700.0,0.0
3,New Hampshire,04/19/2021,21060,21060,15500.0,15500.0,0.0
4,Rhode Island,04/19/2021,16380,16380,12100.0,12100.0,0.0
...,...,...,...,...,...,...,...
1192,Virginia,12/14/2020,72150,72150,0.0,0.0,0.0
1193,Washington,12/14/2020,62400,62400,0.0,0.0,0.0
1194,West Virginia,12/14/2020,16575,16575,0.0,0.0,0.0
1195,Wisconsin,12/14/2020,49725,49725,0.0,0.0,0.0


In [8]:
# Attach the State/District name to every cases-and-deaths row by joining the
# `state` code to the lookup's 'Postal Code'. The outer join keeps unmatched
# rows from both sides; their gaps are filled with 0 so the date parsing and
# grouping below can run, and those placeholder rows are filtered out at the end.
m_c_d_df = pd.merge(c_d_df, state_code_df, left_on='state', right_on='Postal Code', how='outer').fillna(0)

# Derive calendar month and year from the submission date (parsed once).
# A submission_date that was filled with 0 shows up as Year == 1970.
submission_dates = pd.DatetimeIndex(m_c_d_df['submission_date'])
m_c_d_df['Month'] = submission_dates.month
m_c_d_df['Year'] = submission_dates.year
# m_c_d_df.to_csv('source_data/new_dataframes/c_d_withstate_df.csv')

# Sum the numeric columns per (Year, Month, State/District).
# numeric_only=True is explicit on purpose: after fillna(0) the text columns
# hold a mix of strings and 0, and pandas >= 2.0 no longer drops such columns
# silently when summing.
# NOTE(review): tot_cases / tot_death look like running totals, so their monthly
# sum is not a monthly figure - confirm before using them; new_case / new_death
# sums are the per-month counts.
group_c_d_df = (
    m_c_d_df
    .groupby(['Year', 'Month', 'State/District'])
    .sum(numeric_only=True)
    .reset_index()
)

# Drop the placeholder groups created by the outer join: rows without a real
# submission date (Year 1970) and rows without a matching state name (0).
# reset_index() keeps the previous row labels in an 'index' column, exactly as
# before, so downstream column layout is unchanged.
has_real_date = group_c_d_df['Year'] != 1970
has_state_name = group_c_d_df['State/District'] != 0
c_d_f_final = group_c_d_df[has_real_date & has_state_name].reset_index()
c_d_f_final



Unnamed: 0,index,Year,Month,State/District,tot_cases,conf_cases,prob_cases,new_case,pnew_case,tot_death,conf_death,prob_death,new_death,pnew_death
0,3,2020,1,Alabama,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4,2020,1,Alaska,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,5,2020,1,American Samoa,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,6,2020,1,Arizona,6.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,7,2020,1,Arkansas,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
907,925,2021,4,Virginia,10779674.0,8405929.0,2373745.0,25331.0,7326.0,177613.0,148968.0,28645.0,312.0,45.0
908,926,2021,4,Washington,6373673.0,0.0,0.0,21063.0,3832.0,90416.0,0.0,0.0,133.0,0.0
909,927,2021,4,West Virginia,2470246.0,1951426.0,518820.0,6779.0,1930.0,46420.0,0.0,0.0,104.0,19.0
910,928,2021,4,Wisconsin,10942905.0,9922905.0,1020000.0,14908.0,2163.0,125312.0,113347.0,11965.0,107.0,18.0


In [9]:
# Prepare the allocation frame for merging with c_d_f_final: tag every weekly
# row with its calendar month and year, then total the doses per
# (Year, Month, Jurisdiction).
# Note: this adds 'Month' and 'Year' columns to pfe_mrna_jnj_final itself.
allocation_weeks = pd.DatetimeIndex(pfe_mrna_jnj_final['Week of Allocations'])
pfe_mrna_jnj_final['Month'] = allocation_weeks.month
pfe_mrna_jnj_final['Year'] = allocation_weeks.year

# numeric_only=True keeps the text column 'Week of Allocations' out of the sum;
# pandas >= 2.0 would otherwise concatenate its strings into the result.
pfe_mrna_jnj_groupsum = (
    pfe_mrna_jnj_final
    .groupby(['Year', 'Month', 'Jurisdiction'])
    .sum(numeric_only=True)
    .reset_index()
)
pfe_mrna_jnj_groupsum


Unnamed: 0,Year,Month,Jurisdiction,PFE Dose-1,PFE Dose-2,MRNA Dose-1,MRNA Dose-2,JNJ Dose-1
0,2020,12,Alabama,108225,108225,113400.0,113400.0,0.0
1,2020,12,Alaska,35100,35100,26800.0,26800.0,0.0
2,2020,12,American Samoa,5850,0,2400.0,0.0,0.0
3,2020,12,Arizona,154050,154050,160700.0,160700.0,0.0
4,2020,12,Arkansas,67275,67275,68800.0,68800.0,0.0
...,...,...,...,...,...,...,...,...
310,2021,4,Virginia,358020,358020,266300.0,266300.0,139500.0
311,2021,4,Washington,313560,313560,233100.0,233100.0,121900.0
312,2021,4,West Virginia,80730,80730,59400.0,59400.0,31100.0
313,2021,4,Wisconsin,246870,246870,183800.0,183800.0,96100.0


In [10]:
# Join the monthly dose allocations to the monthly case/death sums so new
# cases can be compared with allocated doses per state and month.
# Inner join: only (Year, Month, state) combinations present in both frames
# survive. The state name lives in 'Jurisdiction' on the left and in
# 'State/District' on the right, and both columns are kept in the result.
merge_CD_pfe_mrna_jnj = pfe_mrna_jnj_groupsum.merge(
    c_d_f_final,
    left_on=['Year', 'Month', 'Jurisdiction'],
    right_on=['Year', 'Month', 'State/District'],
    how='inner',
)

# merge_CD_pfe_mrna_jnj.to_csv('source_data/new_dataframes/merge_CD_pfe_mrna_jnj.csv')

merge_CD_pfe_mrna_jnj


Unnamed: 0,Year,Month,Jurisdiction,PFE Dose-1,PFE Dose-2,MRNA Dose-1,MRNA Dose-2,JNJ Dose-1,index,State/District,tot_cases,conf_cases,prob_cases,new_case,pnew_case,tot_death,conf_death,prob_death,new_death,pnew_death
0,2020,12,Alabama,108225,108225,113400.0,113400.0,0.0,641,Alabama,9513716.0,7781720.0,1731996.0,111702.0,29144.0,131827.0,116236.0,15591.0,1249.0,276.0
1,2020,12,Alaska,35100,35100,26800.0,26800.0,0.0,642,Alaska,1241538.0,0.0,0.0,14138.0,0.0,5331.0,0.0,0.0,85.0,0.0
2,2020,12,American Samoa,5850,0,2400.0,0.0,0.0,643,American Samoa,93.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2020,12,Arizona,154050,154050,160700.0,160700.0,0.0,644,Arizona,13267357.0,12733646.0,533711.0,193390.0,15013.0,237324.0,217466.0,19858.0,2225.0,371.0
4,2020,12,Arkansas,67275,67275,68800.0,68800.0,0.0,645,Arkansas,5960217.0,0.0,0.0,67779.0,19051.0,95239.0,0.0,0.0,1174.0,381.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
270,2021,4,Virginia,358020,358020,266300.0,266300.0,139500.0,925,Virginia,10779674.0,8405929.0,2373745.0,25331.0,7326.0,177613.0,148968.0,28645.0,312.0,45.0
271,2021,4,Washington,313560,313560,233100.0,233100.0,121900.0,926,Washington,6373673.0,0.0,0.0,21063.0,3832.0,90416.0,0.0,0.0,133.0,0.0
272,2021,4,West Virginia,80730,80730,59400.0,59400.0,31100.0,927,West Virginia,2470246.0,1951426.0,518820.0,6779.0,1930.0,46420.0,0.0,0.0,104.0,19.0
273,2021,4,Wisconsin,246870,246870,183800.0,183800.0,96100.0,928,Wisconsin,10942905.0,9922905.0,1020000.0,14908.0,2163.0,125312.0,113347.0,11965.0,107.0,18.0


In [15]:
# Query the ACS 5-year dataset for every state ('for': 'state:*') through the
# client `c` created earlier.
# See: https://github.com/CommerceDataService/census-wrapper for library documentation
# See: https://gist.github.com/afhaque/60558290d6efd892351c4b64e5c01e9b for labels
acs_variables = (
    "NAME",
    "B19013_001E",
    "B01003_001E",
    "B01002_001E",
    "B19301_001E",
    "B17001_002E",
    "B23025_005E",
)
census_data = c.acs5.get(acs_variables, {'for': 'state:*'})

# Build a DataFrame and swap the ACS variable codes for readable column names.
acs_column_labels = {
    "B01003_001E": "Population",
    "B01002_001E": "Median Age",
    "B19013_001E": "Household Income",
    "B19301_001E": "Per Capita Income",
    "B17001_002E": "Poverty Count",
    "B23025_005E": "Unemployment Count",
    "NAME": "Name",
    "state": "State",
}
census_pd = pd.DataFrame(census_data).rename(columns=acs_column_labels)

# Both rates are percentages of the total population.
population_int = census_pd["Population"].astype(int)

# Poverty Rate = 100 * Poverty Count / Population
census_pd["Poverty Rate"] = 100 * census_pd["Poverty Count"].astype(int) / population_int

# Unemployment Rate = 100 * Unemployment Count / Population
# NOTE(review): the denominator is total population, not the labor force, so
# this is not the conventional unemployment rate - confirm this is intended.
census_pd["Unemployment Rate"] = 100 * census_pd["Unemployment Count"].astype(int) / population_int

# Keep only the reporting columns, in display order (the 'State' code column is dropped).
census_report_columns = [
    "Name",
    "Population",
    "Median Age",
    "Household Income",
    "Per Capita Income",
    "Poverty Count",
    "Poverty Rate",
    "Unemployment Count",
    "Unemployment Rate",
]
census_pd = census_pd[census_report_columns]

# Show the number of rows returned and preview the first rows.
print(len(census_pd))
# census_pd.to_csv('source_data/new_dataframes/Population.csv')
census_pd.head()

52


Unnamed: 0,Name,Population,Median Age,Household Income,Per Capita Income,Poverty Count,Poverty Rate,Unemployment Count,Unemployment Rate
0,Alabama,4876250.0,39.0,50536.0,27928.0,795989.0,16.323794,132095.0,2.708946
1,Alaska,737068.0,34.3,77640.0,36787.0,76933.0,10.437707,26808.0,3.637114
2,Arizona,7050299.0,37.7,58945.0,30694.0,1043764.0,14.804535,195905.0,2.778676
3,Arkansas,2999370.0,38.1,47597.0,26577.0,496260.0,16.545475,70481.0,2.34986
4,California,39283497.0,36.5,75235.0,36955.0,5149742.0,13.109174,1199233.0,3.052765
