In [1]:
import pandas as pd
import io, requests, os 
from bs4 import BeautifulSoup
from tqdm import tqdm

# Directory that receives every CSV written by this notebook.
output_folder = 'CovidOct18-2022'
# exist_ok=True already makes this a no-op when the directory exists, so no
# separate os.path.exists() guard (a check-then-create pattern) is needed.
os.makedirs(output_folder, exist_ok=True)

# PVI data

In [2]:
# Read the county name / FIPS lookup from the first PVI data file.
# Only the "name" and "casrn" columns are loaded; they are renamed to
# "Name" and "FIPS".
fips = pd.read_csv(
    "https://github.com/COVID19PVI/data/raw/master/Model11.2.1/data/Model_11.2.1_20200228_data.csv",
    header=12, usecols=["name","casrn"]
).rename(columns={'name':'Name','casrn':'FIPS'})

# Get the PVI data links by scraping the GitHub directory listing.
# The timeout bounds how long we wait for the server to respond.
r = requests.get("https://github.com/COVID19PVI/data/tree/master/Model11.2.1", timeout=60)
# Fail here on an HTTP error instead of parsing an error page into an empty link list.
r.raise_for_status()
soup = BeautifulSoup(r.content, 'html.parser')
# "blob" in the href is swapped for "raw" so the link points at the file content.
# NOTE(review): the [1:-1] slice drops the first and last matching anchors,
# presumably entries that are not result files; confirm against the listing.
# NOTE(review): the CSS class filter depends on GitHub's page markup; verify it
# still matches if this starts returning no links.
links = [
    x.attrs["href"].replace("blob","raw")
    for x in soup.find_all('a', {'class':'js-navigation-open Link--primary'})[1:-1]
]
if not links:
    # Without this, the failure only shows up later as an obscure pd.concat error.
    raise RuntimeError("No PVI result links were found on the GitHub directory page")

In [3]:
# Combine every per-date PVI results file into one long DataFrame.
read_from_local = True # True: read the files from a local clone instead of downloading them
# NOTE: the file names are derived from `links` in both modes, so the
# link-scraping cell must have succeeded even when reading locally.
model_path = '../../data/Model11.2.1/' # PVI git repo cloned here

frames = []
for link in tqdm(links):
    filename = link.split('/')[-1]
    if read_from_local:
        source = os.path.join(model_path, filename)
        stamp = filename.split("_")[2]
    else:
        source = "https://github.com"+link
        stamp = link.split("_")[2]

    new_data = pd.read_csv(source)
    # The third "_"-separated token is reformatted once per file as
    # YYYY-MM-DD, rather than once per row after concatenation.
    new_data["date"] = f"{stamp[:4]}-{stamp[4:6]}-{stamp[6:]}"
    frames.append(new_data)

# The per-file row index is kept as-is (no ignore_index), so index values repeat.
all_data = pd.concat(frames)
all_data.head()

100%|██████████| 963/963 [00:17<00:00, 54.03it/s]


Unnamed: 0,ToxPi Score,HClust Group,KMeans Group,Name,Source,Infection Rate: Transmissible Cases!25!0xcc3333ff,Infection Rate: Disease Spread!5!0xe64d4dff,Pop Concentration: Pop Mobility!10!0x57b757ff,Pop Concentration: Residential Density!10!0x5ced5cff,Intervention: Social Distancing!10!0x4258c9ff,Intervention: Testing!10!0x6079f7ff,Health & Environment: Pop Demographics!10!0x6b0b9eff,Health & Environment: Air Pollution!10!0x8e26c4ff,Health & Environment: Age Distribution!10!0x9a42c8ff,Health & Environment: Co-morbidities!10!0xb460e0ff,Health & Environment: Health Disparities!10!0xc885ecff,Health & Environment: Hospital Beds!5!0xdeb9f1ff,date
0,0.659739,1,1,"California, Solano","-121.9357,38.2683",1.0,1.0,0.717074,0.6905,1.0,0.504366,0.622993,0.479042,0.504195,0.236237,0.235189,0.514283,2020-02-28
1,0.624339,2,1,"California, Madera","-119.7666,37.2151",0.638652,1.0,0.601609,0.8239,0.825,0.504366,0.605671,0.658683,0.469686,0.321223,0.590624,0.61368,2020-02-28
2,0.598916,1,1,"California, Santa Cruz","-122.0067,37.0558",0.817887,1.0,0.704589,0.9035,1.0,0.504366,0.409785,0.269461,0.511724,0.100021,0.272557,0.531471,2020-02-28
3,0.590967,2,1,"California, Imperial","-115.3669,33.0393",0.591782,0.666667,0.585525,0.9869,0.75,0.504366,0.525758,0.628743,0.443774,0.270404,0.620412,0.51683,2020-02-28
4,0.578017,1,1,"California, Santa Clara","-121.697,37.231",0.830152,0.88,0.793513,0.731,1.0,0.504366,0.452019,0.353293,0.452613,0.060895,0.131427,0.461418,2020-02-28


In [4]:
# Write one wide CSV per PVI feature: rows are counties, columns are dates.
drop_columns = ['Name', 'Source','date', 'ToxPi Score', 'HClust Group', 'KMeans Group', 'Hospital Beds']
for raw_label in all_data.columns:
    # Short name = text before the first "!" and, when that text contains ":",
    # the part that follows ": ".
    feature = raw_label.split('!')[0]
    if ":" in feature:
        feature = feature.split(": ")[1]

    if feature not in drop_columns:
        print(f"Saving {feature}", end = "... ")
        # Pivot to Name x date, then attach the FIPS code by county name.
        wide = (
            all_data.pivot(index="Name",columns="date", values=raw_label)
            .reset_index()
            .merge(fips, on="Name", how="left")
        )
        # Move FIPS from the last position to the first column.
        wide.insert(0, "FIPS", wide.pop("FIPS"))
        wide.to_csv(f"{output_folder}/{feature}.csv", index=False, float_format='%.4f')
        print(u'\u2705')

Saving Transmissible Cases... ✅
Saving Disease Spread... ✅
Saving Pop Mobility... ✅
Saving Residential Density... ✅
Saving Social Distancing... ✅
Saving Testing... ✅
Saving Pop Demographics... ✅
Saving Air Pollution... ✅
Saving Age Distribution... ✅
Saving Co-morbidities... ✅
Saving Health Disparities... ✅


In [5]:
# Display the column labels of the combined PVI frame for reference.
all_data.columns

Index(['ToxPi Score', 'HClust Group', 'KMeans Group', 'Name', 'Source',
       'Infection Rate: Transmissible Cases!25!0xcc3333ff',
       'Infection Rate: Disease Spread!5!0xe64d4dff',
       'Pop Concentration: Pop Mobility!10!0x57b757ff',
       'Pop Concentration: Residential Density!10!0x5ced5cff',
       'Intervention: Social Distancing!10!0x4258c9ff',
       'Intervention: Testing!10!0x6079f7ff',
       'Health & Environment: Pop Demographics!10!0x6b0b9eff',
       'Health & Environment: Air Pollution!10!0x8e26c4ff',
       'Health & Environment: Age Distribution!10!0x9a42c8ff',
       'Health & Environment: Co-morbidities!10!0xb460e0ff',
       'Health & Environment: Health Disparities!10!0xc885ecff',
       'Health & Environment: Hospital Beds!5!0xdeb9f1ff', 'date'],
      dtype='object')

# Cases
Source: [USAFacts](https://usafacts.org/visualizations/coronavirus-covid-19-spread-map/). Data collection starts on 2020-01-22.

In [6]:
# Download the USAFacts confirmed-cases CSV.
# The timeout bounds how long we wait for the server to respond.
response = requests.get(
    "https://static.usafacts.org/public/data/covid-19/covid_confirmed_usafacts.csv",
    timeout=60,
)
# Stop on an HTTP error rather than handing an error page to the CSV parser.
response.raise_for_status()

file_object = io.StringIO(response.content.decode('utf-8'))
df = pd.read_csv(file_object)
df.head()

Unnamed: 0,countyFIPS,County Name,State,StateFIPS,2020-01-22,2020-01-23,2020-01-24,2020-01-25,2020-01-26,2020-01-27,...,2022-10-04,2022-10-05,2022-10-06,2022-10-07,2022-10-08,2022-10-09,2022-10-10,2022-10-11,2022-10-12,2022-10-13
0,0,Statewide Unallocated,AL,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1001,Autauga County,AL,1,0,0,0,0,0,0,...,18396,18396,18422,18422,18422,18422,18422,18422,18422,18452
2,1003,Baldwin County,AL,1,0,0,0,0,0,0,...,65653,65653,65742,65742,65742,65742,65742,65742,65742,65819
3,1005,Barbour County,AL,1,0,0,0,0,0,0,...,6896,6896,6910,6910,6910,6910,6910,6910,6910,6910
4,1007,Bibb County,AL,1,0,0,0,0,0,0,...,7526,7526,7537,7537,7537,7537,7537,7537,7537,7547


In [7]:
# Reduce the cases table to a FIPS column plus the per-date columns and save it.
# Written without in-place mutation: errors='ignore' lets this cell be re-run
# after the metadata columns are already gone, and nothing is modified in place
# on a filtered slice.
df = (
    df.drop(columns=['County Name', 'State', 'StateFIPS'], errors='ignore')
    .rename(columns={'countyFIPS':'FIPS'})
)

# Keep rows with a positive FIPS code. NaN > 0 is False, so rows with a missing
# code are dropped by the same test (FIPS 0 marks "Statewide Unallocated" rows).
df = df[df['FIPS']>0].fillna(0)
df.to_csv(f'{output_folder}/Cases.csv', index=False)

# Deaths
Source: [USAFacts](https://usafacts.org/visualizations/coronavirus-covid-19-spread-map/). Data collection starts on 2020-01-22.

In [8]:
# Download the USAFacts deaths CSV.
# The timeout bounds how long we wait for the server to respond.
response = requests.get(
    "https://static.usafacts.org/public/data/covid-19/covid_deaths_usafacts.csv",
    timeout=60,
)
# Stop on an HTTP error rather than handing an error page to the CSV parser.
response.raise_for_status()
file_object = io.StringIO(response.content.decode('utf-8'))
df = pd.read_csv(file_object)
df.head()

Unnamed: 0,countyFIPS,County Name,State,StateFIPS,2020-01-22,2020-01-23,2020-01-24,2020-01-25,2020-01-26,2020-01-27,...,2022-10-04,2022-10-05,2022-10-06,2022-10-07,2022-10-08,2022-10-09,2022-10-10,2022-10-11,2022-10-12,2022-10-13
0,0,Statewide Unallocated,AL,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1001,Autauga County,AL,1,0,0,0,0,0,0,...,227,227,228,228,228,228,228,228,228,228
2,1003,Baldwin County,AL,1,0,0,0,0,0,0,...,712,712,714,714,714,714,714,714,714,715
3,1005,Barbour County,AL,1,0,0,0,0,0,0,...,103,103,103,103,103,103,103,103,103,103
4,1007,Bibb County,AL,1,0,0,0,0,0,0,...,107,107,108,108,108,108,108,108,108,108


In [9]:
# Reduce the deaths table to a FIPS column plus the per-date columns and save it.
# Written without in-place mutation: errors='ignore' lets this cell be re-run
# after the metadata columns are already gone, and nothing is modified in place
# on a filtered slice.
df = (
    df.drop(columns=['County Name', 'State', 'StateFIPS'], errors='ignore')
    .rename(columns={'countyFIPS':'FIPS'})
)

# Keep rows with a positive FIPS code. NaN > 0 is False, so rows with a missing
# code are dropped by the same test (FIPS 0 marks "Statewide Unallocated" rows).
df = df[df['FIPS']>0].fillna(0)
df.to_csv(f'{output_folder}/Deaths.csv', index=False)

# Vaccination
Source: [CDC COVID-19 Vaccinations in the United States, County](https://data.cdc.gov/Vaccinations/COVID-19-Vaccinations-in-the-United-States-County/8xkx-amqh). Data collection starts on 2020-12-13.

In [10]:
# Download the CDC county-level vaccination CSV.
# The timeout limits how long we wait for the server to send data; it is not a
# cap on the total download time.
response = requests.get(
    "https://data.cdc.gov/api/views/8xkx-amqh/rows.csv?accessType=DOWNLOAD",
    timeout=60,
)
# Stop on an HTTP error rather than handing an error page to the CSV parser.
response.raise_for_status()
file_object = io.StringIO(response.content.decode('utf-8'))
# low_memory=False makes pandas read the file in one pass instead of in chunks.
df = pd.read_csv(file_object, low_memory=False)
df.head()

Unnamed: 0,Date,FIPS,MMWR_week,Recip_County,Recip_State,Completeness_pct,Administered_Dose1_Recip,Administered_Dose1_Pop_Pct,Administered_Dose1_Recip_5Plus,Administered_Dose1_Recip_5PlusPop_Pct,...,Booster_Doses_Vax_Pct_UR_Equity,Booster_Doses_12PlusVax_Pct_UR_Equity,Booster_Doses_18PlusVax_Pct_UR_Equity,Booster_Doses_65PlusVax_Pct_UR_Equity,Census2019,Census2019_5PlusPop,Census2019_5to17Pop,Census2019_12PlusPop,Census2019_18PlusPop,Census2019_65PlusPop
0,10/12/2022,27047,41,Freeborn County,MN,94.7,18880.0,62.3,18788.0,65.7,...,8.0,8.0,8.0,8.0,30281.0,28615.0,4948.0,25977.0,23667.0,6827.0
1,10/12/2022,31051,41,Dixon County,NE,90.8,,,,,...,4.0,4.0,4.0,4.0,5636.0,5224.0,1031.0,4681.0,4193.0,1167.0
2,10/12/2022,21203,41,Rockcastle County,KY,94.1,7684.0,46.0,7677.0,48.6,...,7.0,7.0,8.0,7.0,16695.0,15801.0,2646.0,14468.0,13155.0,3133.0
3,10/12/2022,6007,41,Butte County,CA,97.6,132266.0,60.3,131728.0,63.6,...,4.0,4.0,4.0,3.0,219186.0,207124.0,32331.0,189828.0,174793.0,40228.0
4,10/12/2022,51660,41,Harrisonburg city,VA,78.5,22739.0,42.9,22724.0,45.2,...,1.0,1.0,1.0,1.0,53016.0,50226.0,6009.0,46866.0,44217.0,5176.0


In [11]:
vaccination_columns = ['Administered_Dose1_Pop_Pct','Series_Complete_Pop_Pct']
# Drop rows whose FIPS is the placeholder 'UNK' and keep only the needed
# columns in a single .loc selection. The integer FIPS is then added with
# .assign on that new frame, so no column is assigned on a filtered slice
# (the pattern behind SettingWithCopyWarning).
df = (
    df.loc[df['FIPS']!='UNK', ['Date', 'FIPS'] + vaccination_columns]
    .assign(FIPS=lambda frame: frame['FIPS'].astype(int))
)

In [12]:
# Parse the Date strings (month/day/year, e.g. "10/12/2022") with an explicit
# format so the parse is unambiguous, then replace missing values with 0.
# Done as one non-mutating chain so nothing is modified in place on a frame
# that was derived from a filtered selection.
df = df.assign(Date=pd.to_datetime(df['Date'], format='%m/%d/%Y')).fillna(0)

In [13]:
# Wide table of Administered_Dose1_Pop_Pct: one row per FIPS, one column per Date.
# pivot_table's default aggregation (mean) is spelled out; it only matters when
# a (FIPS, Date) pair occurs more than once.
vaccinationOneDose = pd.pivot_table(
    df,
    index=['FIPS'],
    columns='Date',
    values='Administered_Dose1_Pop_Pct',
    aggfunc='mean',
).reset_index()
# Remove the "Date" label that pivoting leaves on the column axis.
vaccinationOneDose.columns.name = None
vaccinationOneDose.head()

Unnamed: 0,FIPS,2020-12-13 00:00:00,2020-12-14 00:00:00,2020-12-15 00:00:00,2020-12-16 00:00:00,2020-12-17 00:00:00,2020-12-18 00:00:00,2020-12-19 00:00:00,2020-12-20 00:00:00,2020-12-21 00:00:00,...,2022-08-10 00:00:00,2022-08-17 00:00:00,2022-08-24 00:00:00,2022-08-31 00:00:00,2022-09-07 00:00:00,2022-09-14 00:00:00,2022-09-21 00:00:00,2022-09-28 00:00:00,2022-10-05 00:00:00,2022-10-12 00:00:00
0,1001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,56.8,56.9,56.9,56.9,57.0,57.0,57.0,57.1,57.1,57.2
1,1003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,65.5,65.6,65.6,65.7,65.7,65.8,65.8,65.9,65.9,66.0
2,1005,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,56.7,56.8,56.8,56.9,56.9,56.9,57.0,57.0,57.0,57.1
3,1007,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,43.1,43.1,43.2,43.2,43.2,43.2,43.2,43.2,43.3,43.3
4,1009,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,39.1,39.1,39.1,39.1,39.2,39.2,39.2,39.2,39.2,39.3


In [14]:
# Wide table of Series_Complete_Pop_Pct: one row per FIPS, one column per Date.
# pivot_table's default aggregation (mean) is spelled out; it only matters when
# a (FIPS, Date) pair occurs more than once.
vaccinationCompleteDose = pd.pivot_table(
    df,
    index=['FIPS'],
    columns='Date',
    values='Series_Complete_Pop_Pct',
    aggfunc='mean',
).reset_index()
# Remove the "Date" label that pivoting leaves on the column axis.
vaccinationCompleteDose.columns.name = None
vaccinationCompleteDose.head()

Unnamed: 0,FIPS,2020-12-13 00:00:00,2020-12-14 00:00:00,2020-12-15 00:00:00,2020-12-16 00:00:00,2020-12-17 00:00:00,2020-12-18 00:00:00,2020-12-19 00:00:00,2020-12-20 00:00:00,2020-12-21 00:00:00,...,2022-08-10 00:00:00,2022-08-17 00:00:00,2022-08-24 00:00:00,2022-08-31 00:00:00,2022-09-07 00:00:00,2022-09-14 00:00:00,2022-09-21 00:00:00,2022-09-28 00:00:00,2022-10-05 00:00:00,2022-10-12 00:00:00
0,1001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,44.9,44.9,45.0,45.0,45.1,45.1,45.1,45.2,45.3,45.3
1,1003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,52.0,52.1,52.1,52.2,52.2,52.2,52.3,52.3,52.4,52.5
2,1005,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,46.5,46.6,46.6,46.7,46.7,46.7,46.7,46.7,46.8,46.8
3,1007,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,35.7,35.7,35.8,35.8,35.8,35.8,35.8,35.9,35.9,36.0
4,1009,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,32.3,32.4,32.4,32.4,32.4,32.5,32.5,32.5,32.6,32.6


In [15]:
# Write both wide vaccination tables into the output folder.
# NOTE(review): the date column labels are Timestamps, so the CSV headers
# presumably carry a time component ("2020-12-13 00:00:00"), unlike the
# YYYY-MM-DD headers of Cases.csv / Deaths.csv; confirm downstream readers
# accept that.
vaccination_outputs = {
    f'{output_folder}/Vaccination.csv': vaccinationCompleteDose,
    f'{output_folder}/VaccinationOneDose.csv': vaccinationOneDose,
}
for csv_path, table in vaccination_outputs.items():
    table.to_csv(csv_path, index=False)