In [1]:
import pandas as pd
import io, requests, os 
from bs4 import BeautifulSoup
from tqdm import tqdm

# Folder that receives every CSV written by this notebook.
output_folder = 'CovidMay17-2022'
# exist_ok=True already makes this a no-op when the folder is present, so a
# separate os.path.exists() check is unnecessary (and would be racy).
os.makedirs(output_folder, exist_ok=True)

# PVI data

In [None]:
# Read the location-name -> FIPS lookup. header=12 makes row 12 (0-based) of the
# file the header row; the lines before it are skipped.
fips = pd.read_csv(
    "https://github.com/COVID19PVI/data/raw/master/Model11.2.1/data/Model_11.2.1_20200228_data.csv",
    header=12, usecols=["name","casrn"]
).rename(columns={'name':'Name','casrn':'FIPS'})

# Get the PVI data links by scraping the repository folder listing.
# A timeout keeps the cell from hanging forever on a stalled connection.
r = requests.get("https://github.com/COVID19PVI/data/tree/master/Model11.2.1", timeout=60)
# Fail loudly on an HTTP error instead of parsing an error page.
r.raise_for_status()
soup = BeautifulSoup(r.content, 'html.parser')
# "blob" links point at the HTML viewer; "raw" links serve the file itself.
# The [1:-1] slice drops the first and last matched anchors
# (presumably non-data entries in the listing -- TODO confirm).
links = [
    anchor["href"].replace("blob","raw")
    for anchor in soup.find_all('a', {'class':'js-navigation-open Link--primary'})[1:-1]
]
# An empty list means the selector matched nothing useful; stop here with a
# clear message rather than failing later when the files are combined.
if not links:
    raise RuntimeError(
        "No PVI file links found on the GitHub listing page; "
        "the page markup may have changed."
    )

In [3]:
# Download every linked PVI file and stack them into one long table.
frames = []
for link in tqdm(links):
    daily = pd.read_csv("https://github.com"+link)
    # The third "_"-separated token of the link is a YYYYMMDD stamp;
    # store it as YYYY-MM-DD.
    stamp = link.split("_")[2]
    daily["date"] = f"{stamp[:4]}-{stamp[4:6]}-{stamp[6:]}"
    frames.append(daily)
all_data = pd.concat(frames)
all_data.head()

100%|██████████| 810/810 [06:57<00:00,  1.94it/s]  


Unnamed: 0,ToxPi Score,HClust Group,KMeans Group,Name,Source,Infection Rate: Transmissible Cases!25!0xcc3333ff,Infection Rate: Disease Spread!5!0xe64d4dff,Pop Concentration: Pop Mobility!10!0x57b757ff,Pop Concentration: Residential Density!10!0x5ced5cff,Intervention: Social Distancing!10!0x4258c9ff,Intervention: Testing!10!0x6079f7ff,Health & Environment: Pop Demographics!10!0x6b0b9eff,Health & Environment: Air Pollution!10!0x8e26c4ff,Health & Environment: Age Distribution!10!0x9a42c8ff,Health & Environment: Co-morbidities!10!0xb460e0ff,Health & Environment: Health Disparities!10!0xc885ecff,Health & Environment: Hospital Beds!5!0xdeb9f1ff,date
0,0.659739,1,10,"California, Solano","-121.9357,38.2683",1.0,1.0,0.717074,0.6905,1.0,0.504366,0.622993,0.479042,0.504195,0.236237,0.235189,0.514283,2020-02-28
1,0.624339,2,10,"California, Madera","-119.7666,37.2151",0.638652,1.0,0.601609,0.8239,0.825,0.504366,0.605671,0.658683,0.469686,0.321223,0.590624,0.61368,2020-02-28
2,0.598916,1,10,"California, Santa Cruz","-122.0067,37.0558",0.817887,1.0,0.704589,0.9035,1.0,0.504366,0.409785,0.269461,0.511724,0.100021,0.272557,0.531471,2020-02-28
3,0.590967,2,10,"California, Imperial","-115.3669,33.0393",0.591782,0.666667,0.585525,0.9869,0.75,0.504366,0.525758,0.628743,0.443774,0.270404,0.620412,0.51683,2020-02-28
4,0.578017,1,10,"California, Santa Clara","-121.697,37.231",0.830152,0.88,0.793513,0.731,1.0,0.504366,0.452019,0.353293,0.452613,0.060895,0.131427,0.461418,2020-02-28


In [4]:
# Column labels for which no CSV is written.
drop_columns = ['Name', 'Source','date', 'ToxPi Score', 'HClust Group', 'KMeans Group', 'Hospital Beds']

for raw_header in all_data.columns:
    # Headers may look like "<group>: <slice>!<n>!<hex>"; keep only the part
    # before the first "!" and, when a group prefix is present, the slice name.
    slice_name = raw_header.split('!')[0]
    if ":" in slice_name:
        slice_name = slice_name.split(": ")[1]

    if slice_name not in drop_columns:
        print(f"Saving {slice_name}", end = "... ")
        # Wide layout: one row per location name, one column per date.
        wide = all_data.pivot(index="Name",columns="date", values=raw_header).reset_index()
        # Attach the FIPS code for each name (left join keeps unmatched names).
        wide = wide.merge(fips, on="Name", how="left")
        # Move the FIPS column to the front.
        fips_codes = wide.pop("FIPS")
        wide.insert(0, "FIPS", fips_codes)
        wide.to_csv(f"{output_folder}/{slice_name}.csv", index=False, float_format='%.4f')
        print(u'\u2705')

Saving ToxPi Score... ✅
Saving HClust Group... ✅
Saving KMeans Group... ✅
Saving Transmissible Cases... ✅
Saving Disease Spread... ✅
Saving Pop Mobility... ✅
Saving Residential Density... ✅
Saving Social Distancing... ✅
Saving Testing... ✅
Saving Pop Demographics... ✅
Saving Air Pollution... ✅
Saving Age Distribution... ✅
Saving Co-morbidities... ✅
Saving Health Disparities... ✅
Saving Hospital Beds... ✅


# Cases
Source: [USAFacts](https://usafacts.org/visualizations/coronavirus-covid-19-spread-map/). Data collection starts on 2020-01-22.

In [2]:
# Download the USAFacts county-level confirmed-cases table.
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(
    "https://static.usafacts.org/public/data/covid-19/covid_confirmed_usafacts.csv",
    timeout=60,
)
# Stop on an HTTP error rather than parsing an error page as CSV.
response.raise_for_status()

file_object = io.StringIO(response.content.decode('utf-8'))
df = pd.read_csv(file_object)
df.head()

Unnamed: 0,countyFIPS,County Name,State,StateFIPS,2020-01-22,2020-01-23,2020-01-24,2020-01-25,2020-01-26,2020-01-27,...,2022-05-16,2022-05-17,2022-05-18,2022-05-19,2022-05-20,2022-05-21,2022-05-22,2022-05-23,2022-05-24,2022-05-25
0,0,Statewide Unallocated,AL,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1001,Autauga County,AL,1,0,0,0,0,0,0,...,15870,15871,15873,15885,15891,15891,15891,15904,15904,15916
2,1003,Baldwin County,AL,1,0,0,0,0,0,0,...,55916,55941,55960,55996,56031,56031,56031,56134,56134,56222
3,1005,Barbour County,AL,1,0,0,0,0,0,0,...,5688,5688,5691,5691,5692,5692,5692,5694,5694,5694
4,1007,Bibb County,AL,1,0,0,0,0,0,0,...,6463,6465,6466,6468,6469,6469,6469,6473,6473,6479


In [3]:
# Keep the county FIPS key and the per-date columns; drop descriptive columns.
df = (
    df.drop(columns=['County Name', 'State', 'StateFIPS'])
      .rename(columns={'countyFIPS':'FIPS'})
)

# Discard rows without a usable FIPS code (missing, or 0 as on the
# "Statewide Unallocated" rows), then fill remaining gaps with 0.
# The non-inplace fillna is used on purpose: an in-place fill on a filtered
# frame triggers pandas' SettingWithCopyWarning.
df = df[df['FIPS'].notna() & (df['FIPS']>0)].fillna(0)
df.to_csv(f'{output_folder}/Cases.csv', index=False)

# Deaths
Source: [USAFacts](https://usafacts.org/visualizations/coronavirus-covid-19-spread-map/). Data collection starts on 2020-01-22.

In [7]:
# Download the USAFacts county-level deaths table.
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(
    "https://static.usafacts.org/public/data/covid-19/covid_deaths_usafacts.csv",
    timeout=60,
)
# Stop on an HTTP error rather than parsing an error page as CSV.
response.raise_for_status()
file_object = io.StringIO(response.content.decode('utf-8'))
df = pd.read_csv(file_object)
df.head()

Unnamed: 0,countyFIPS,County Name,State,StateFIPS,2020-01-22,2020-01-23,2020-01-24,2020-01-25,2020-01-26,2020-01-27,...,2022-05-06,2022-05-07,2022-05-08,2022-05-09,2022-05-10,2022-05-11,2022-05-12,2022-05-13,2022-05-14,2022-05-15
0,0,Statewide Unallocated,AL,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1001,Autauga County,AL,1,0,0,0,0,0,0,...,216,216,216,216,216,216,216,216,216,216
2,1003,Baldwin County,AL,1,0,0,0,0,0,0,...,680,680,680,680,680,680,680,681,681,681
3,1005,Barbour County,AL,1,0,0,0,0,0,0,...,98,98,98,98,98,98,98,98,98,98
4,1007,Bibb County,AL,1,0,0,0,0,0,0,...,104,104,104,104,104,104,105,105,105,105


In [8]:
# Keep the county FIPS key and the per-date columns; drop descriptive columns.
df = (
    df.drop(columns=['County Name', 'State', 'StateFIPS'])
      .rename(columns={'countyFIPS':'FIPS'})
)

# Discard rows without a usable FIPS code (missing, or 0 as on the
# "Statewide Unallocated" rows), then fill remaining gaps with 0.
# The non-inplace fillna is used on purpose: an in-place fill on a filtered
# frame triggers pandas' SettingWithCopyWarning.
df = df[df['FIPS'].notna() & (df['FIPS']>0)].fillna(0)
df.to_csv(f'{output_folder}/Deaths.csv', index=False)

# Vaccination
From [CDC vaccinations in the US counties](https://data.cdc.gov/Vaccinations/COVID-19-Vaccinations-in-the-United-States-County/8xkx-amqh). Data collection starts from 2020-12-13.

In [9]:
# Download the CDC county-level vaccination table.
# The timeout is generous because it bounds each wait for data from the
# server, not the total download time of this large file.
response = requests.get(
    "https://data.cdc.gov/api/views/8xkx-amqh/rows.csv?accessType=DOWNLOAD",
    timeout=300,
)
# Stop on an HTTP error rather than parsing an error page as CSV.
response.raise_for_status()
file_object = io.StringIO(response.content.decode('utf-8'))
# low_memory=False parses each column in one pass instead of in chunks.
df = pd.read_csv(file_object, low_memory=False)
df.head()

Unnamed: 0,Date,FIPS,MMWR_week,Recip_County,Recip_State,Completeness_pct,Administered_Dose1_Recip,Administered_Dose1_Pop_Pct,Administered_Dose1_Recip_5Plus,Administered_Dose1_Recip_5PlusPop_Pct,...,Booster_Doses_Vax_Pct_UR_Equity,Booster_Doses_12PlusVax_Pct_UR_Equity,Booster_Doses_18PlusVax_Pct_UR_Equity,Booster_Doses_65PlusVax_Pct_UR_Equity,Census2019,Census2019_5PlusPop,Census2019_5to17Pop,Census2019_12PlusPop,Census2019_18PlusPop,Census2019_65PlusPop
0,05/17/2022,47037,20,Davidson County,TN,97.8,514191.0,74.1,514095.0,79.3,...,3.0,3.0,4.0,3.0,694144.0,648128.0,97057.0,593793.0,551071.0,87023.0
1,05/17/2022,47095,20,Lake County,TN,97.8,2782.0,39.7,2780.0,41.3,...,6.0,6.0,6.0,6.0,7016.0,6727.0,717.0,6358.0,6010.0,1175.0
2,05/17/2022,48051,20,Burleson County,TX,98.9,10195.0,55.3,10191.0,58.8,...,3.0,3.0,3.0,2.0,18443.0,17346.0,2919.0,15779.0,14427.0,3867.0
3,05/17/2022,48117,20,Deaf Smith County,TX,98.9,9307.0,50.2,9302.0,54.9,...,6.0,6.0,6.0,6.0,18546.0,16931.0,4105.0,14710.0,12826.0,2423.0
4,05/17/2022,48219,20,Hockley County,TX,98.9,11435.0,49.7,11433.0,53.3,...,6.0,6.0,6.0,6.0,23021.0,21465.0,4356.0,19132.0,17109.0,3453.0


In [10]:
# Percentage columns exported further down.
vaccination_columns = ['Administered_Dose1_Pop_Pct','Series_Complete_Pop_Pct']

# Drop rows whose FIPS is the placeholder 'UNK' and keep only the needed
# columns in a single .loc selection, then cast FIPS to int via assign().
# Building a new frame avoids assigning a column on a filtered slice, which
# triggers pandas' SettingWithCopyWarning.
df = (
    df.loc[df['FIPS']!='UNK', ['Date', 'FIPS'] + vaccination_columns]
      .assign(FIPS=lambda kept: kept['FIPS'].astype(int))
)

In [11]:
# Dates arrive as MM/DD/YYYY strings; the explicit format removes any
# day/month ambiguity and skips per-value format inference.
# No sort is needed here: the pivot_table calls below order rows and columns.
# assign()/fillna() return new frames, so nothing is modified in place on a
# frame derived from an earlier selection. Missing values become 0.
df = df.assign(Date=pd.to_datetime(df['Date'], format='%m/%d/%Y')).fillna(0)

In [12]:
# Reshape first-dose coverage to wide form: one row per FIPS, one column per date.
# pivot_table's default aggregation (mean) applies if a FIPS/Date pair repeats.
one_dose_long = df[['Date', 'FIPS', 'Administered_Dose1_Pop_Pct']]
vaccinationOneDose = one_dose_long.pivot_table(
    index=['FIPS'],
    columns='Date',
    values='Administered_Dose1_Pop_Pct'
).reset_index()
# Clear the leftover "Date" label on the column axis.
vaccinationOneDose.columns.name = None
vaccinationOneDose.head()

Unnamed: 0,FIPS,2020-12-13 00:00:00,2020-12-14 00:00:00,2020-12-15 00:00:00,2020-12-16 00:00:00,2020-12-17 00:00:00,2020-12-18 00:00:00,2020-12-19 00:00:00,2020-12-20 00:00:00,2020-12-21 00:00:00,...,2022-05-08 00:00:00,2022-05-09 00:00:00,2022-05-10 00:00:00,2022-05-11 00:00:00,2022-05-12 00:00:00,2022-05-13 00:00:00,2022-05-14 00:00:00,2022-05-15 00:00:00,2022-05-16 00:00:00,2022-05-17 00:00:00
0,1001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,55.8,55.8,55.8,55.8,55.8,55.8,55.8,55.8,55.8,55.8
1,1003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,64.5,64.5,64.5,64.5,64.5,64.5,64.6,64.6,64.6,64.6
2,1005,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,55.7,55.7,55.7,55.7,55.7,55.8,55.8,55.8,55.8,55.8
3,1007,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,42.6,42.6,42.6,42.6,42.6,42.6,42.6,42.6,42.6,42.6
4,1009,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,38.4,38.4,38.4,38.4,38.4,38.5,38.5,38.5,38.5,38.5


In [13]:
# Reshape completed-series coverage to wide form: one row per FIPS, one column per date.
# pivot_table's default aggregation (mean) applies if a FIPS/Date pair repeats.
complete_long = df[['Date', 'FIPS', 'Series_Complete_Pop_Pct']]
vaccinationCompleteDose = complete_long.pivot_table(
    index=['FIPS'],
    columns='Date',
    values='Series_Complete_Pop_Pct'
).reset_index()
# Clear the leftover "Date" label on the column axis.
vaccinationCompleteDose.columns.name = None
vaccinationCompleteDose.head()

Unnamed: 0,FIPS,2020-12-13 00:00:00,2020-12-14 00:00:00,2020-12-15 00:00:00,2020-12-16 00:00:00,2020-12-17 00:00:00,2020-12-18 00:00:00,2020-12-19 00:00:00,2020-12-20 00:00:00,2020-12-21 00:00:00,...,2022-05-08 00:00:00,2022-05-09 00:00:00,2022-05-10 00:00:00,2022-05-11 00:00:00,2022-05-12 00:00:00,2022-05-13 00:00:00,2022-05-14 00:00:00,2022-05-15 00:00:00,2022-05-16 00:00:00,2022-05-17 00:00:00
0,1001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,44.3,44.3,44.4,44.4,44.4,44.4,44.4,44.4,44.4,44.4
1,1003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,51.0,51.0,51.1,51.1,51.1,51.1,51.1,51.1,51.2,51.2
2,1005,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,45.5,45.5,45.5,45.5,45.5,45.5,45.6,45.6,45.6,45.6
3,1007,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,34.8,34.8,34.8,34.8,34.8,34.8,34.8,34.8,34.8,34.8
4,1009,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,31.8,31.8,31.8,31.8,31.8,31.8,31.8,31.8,31.8,31.8


In [14]:
# Write both wide vaccination tables to the output folder, without the row index.
vaccination_exports = {
    f'{output_folder}/Vaccination.csv': vaccinationCompleteDose,
    f'{output_folder}/VaccinationOneDose.csv': vaccinationOneDose,
}
for csv_path, table in vaccination_exports.items():
    table.to_csv(csv_path, index=False)