In [None]:
import covidcast as covid
import pandas as pd
from datetime import date

### Global variables: re-fetch data or read from local csv files

In [None]:
# Each flag chooses between rebuilding a dataset from its raw source (True)
# and loading the previously saved CSV from disk (False).
fetch_from_covidcast = False  # covidcast signals: call covid.signal(...) vs. read data/covidcast/raw/*.csv
regenerate_state_fips = False  # state FIPS table: parse the raw .xlsx vs. read the clean CSV
regenerate_county_fips = False  # county FIPS table: parse the raw .xlsx vs. read the clean CSV
remerge_covid_and_fips_data = False  # presumably gates the covid/FIPS merge step; TODO confirm the merge cell reads it
regenerate_usda_poverty_data = False  # USDA poverty estimates: parse the raw .xls vs. read the clean CSV
regenerate_usda_unemployment_median_hhi_data = False  # USDA unemployment / median income: raw .xls vs. clean CSV
regenerate_usda_population_data = False  # USDA population estimates: raw .xls vs. clean CSV
regenerate_usda_education_data = False  # USDA education estimates: raw .xls vs. clean CSV

### Fetch raw covid data and save to csv

In [None]:
import warnings
# Install a blanket "ignore" filter: every warning category is silenced for
# the rest of the kernel session, including any raised by the cells below.
warnings.filterwarnings('ignore')

In [None]:
def _load_covidcast_signal(data_source, signal, raw_csv_path):
    """Return one county-level covidcast signal as a DataFrame.

    When ``fetch_from_covidcast`` is true the signal is requested through
    ``covid.signal`` and the result is written to ``raw_csv_path``.
    Otherwise the frame is read from ``raw_csv_path`` with the first CSV
    column used as the index.
    """
    if not fetch_from_covidcast:
        return pd.read_csv(raw_csv_path, index_col=0)
    fetched = covid.signal(data_source=data_source, signal=signal, geo_type="county")
    fetched.to_csv(raw_csv_path)
    return fetched


# Total confirmed covid cases per 100k population
confirmed_cumulative_cases_prop = _load_covidcast_signal(
    "usa-facts", "confirmed_cumulative_prop",
    "data/covidcast/raw/confirmed_cumulative_cases_prop.csv")

# Daily new confirmed covid cases per 100k population
confirmed_daily_incidence_cases_prop = _load_covidcast_signal(
    "usa-facts", "confirmed_incidence_prop",
    "data/covidcast/raw/confirmed_daily_incidence_cases_prop.csv")

# Cumulative covid-related deaths per 100k population
cumulative_deaths_prop = _load_covidcast_signal(
    "usa-facts", "deaths_cumulative_prop",
    "data/covidcast/raw/cumulative_deaths_prop.csv")

# Daily new covid-related deaths per 100k population
daily_incidence_deaths_prop = _load_covidcast_signal(
    "usa-facts", "deaths_incidence_prop",
    "data/covidcast/raw/daily_incidence_deaths_prop.csv")

# Percentage of covid-related outpatient doctor visits
perc_covid_doctor_visits = _load_covidcast_signal(
    "doctor-visits", "smoothed_adj_cli",
    "data/covidcast/raw/perc_covid_doctor_visits.csv")

# Fraction of mobile devices that did not leave immediate area of home
completely_home_prop = _load_covidcast_signal(
    "safegraph", "completely_home_prop",
    "data/covidcast/raw/completely_home_prop.csv")

# Percentage of people wearing a mask most/all the time while in public in the past 5 days
perc_people_wearing_masks = _load_covidcast_signal(
    "fb-survey", "smoothed_wearing_mask",
    "data/covidcast/raw/perc_people_wearing_masks.csv")

# Percentage of people who were tested for covid in past 14 days
perc_people_tested = _load_covidcast_signal(
    "fb-survey", "smoothed_tested_14d",
    "data/covidcast/raw/perc_people_tested.csv")

# Covid test positivity rate in past 14 days
perc_positive_tests = _load_covidcast_signal(
    "fb-survey", "smoothed_tested_positive_14d",
    "data/covidcast/raw/perc_positive_tests.csv")

# Percentage of people who wanted covid tests out of those not tested
perc_wanted_test = _load_covidcast_signal(
    "fb-survey", "smoothed_wanted_test_14d",
    "data/covidcast/raw/perc_wanted_test.csv")

### Retrieve FIPS codes for each US county

In [None]:
# Build the state FIPS lookup (columns "State FIPS" and "Name"), either from
# the saved clean CSV or by re-parsing the raw spreadsheet.
if not regenerate_state_fips:
    state_fips = pd.read_csv("data/fips/clean/state_fips_2019.csv", index_col=0)
else:
    state_fips = (
        # The first five spreadsheet rows are skipped before the header row.
        pd.read_excel("data/fips/raw/state_fips_2019.xlsx", skiprows=range(5))
        .loc[:, ["State (FIPS)", "Name"]]
        .rename(columns={"State (FIPS)": "State FIPS"})
    )
    state_fips.to_csv("data/fips/clean/state_fips_2019.csv")

In [None]:
# Build the county FIPS lookup, either from the saved clean CSV or by
# re-parsing the raw area spreadsheet and joining state names onto it.
if not regenerate_county_fips:
    county_fips = pd.read_csv("data/fips/clean/county_fips_2019.csv", index_col=0)
else:
    area_table = pd.read_excel('data/fips/raw/area_fips_2019.xlsx', skiprows=range(4))

    # Keep only the rows with Summary Level 50 (the rows this notebook
    # treats as counties) and shorten the verbose column headers.
    counties = area_table[area_table["Summary Level"] == 50].rename(
        columns={"Area Name (including legal/statistical area description)": "Area Name",
                 "State Code (FIPS)": "State FIPS",
                 "County Code (FIPS)": "County FIPS"}
    )

    # Combined code = 2-digit zero-padded state code followed by the
    # 3-digit zero-padded county code, stored as an integer.
    state_part = counties["State FIPS"].apply(str).str.zfill(2)
    county_part = counties["County FIPS"].apply(str).str.zfill(3)
    counties = counties.assign(**{
        "Area Name": counties["Area Name"].str.title(),
        "FIPS": (state_part + county_part).astype('int64'),
    })

    county_fips = (
        counties[["State FIPS", "County FIPS", "FIPS", "Area Name"]]
        # Attach the state name from the state lookup built earlier.
        .merge(state_fips, how='left', left_on="State FIPS", right_on="State FIPS")
        .rename(columns={"Name": "State Name"})
        .loc[:, ["State FIPS", "County FIPS", "FIPS", "State Name", "Area Name"]]
    )
    county_fips.to_csv("data/fips/clean/county_fips_2019.csv")

### Merge covid datasets with fips data

In [None]:
def drop_non_county_rows(covid_df):
    """Return the rows of ``covid_df`` whose ``geo_value`` is not a multiple of 1000.

    A code whose last three digits are ``000`` has no county part, so those
    rows are filtered out (presumably state-level / aggregate entries;
    confirm against the data source).

    ``geo_value`` may be numeric (as it is after a CSV round-trip) or a
    zero-padded string such as ``"01001"``. The column is converted with
    ``pd.to_numeric`` only for the comparison; the returned rows keep their
    original ``geo_value`` values and index.

    Raises:
        ValueError: if a ``geo_value`` cannot be parsed as a number.
    """
    # A plain `%` on a string column would be treated as string formatting
    # and fail, so compare on a numeric view of the codes instead.
    fips_codes = pd.to_numeric(covid_df['geo_value'])
    return covid_df[fips_codes % 1000 != 0]

In [None]:
def merge_covid_and_county_fips_dfs(covid_df, county_fips_df):
    """Attach state and area names to a covidcast frame, keyed by FIPS code.

    Args:
        covid_df: frame with at least ``geo_value``, ``time_value``,
            ``issue`` and ``value`` columns. ``geo_value`` may be numeric
            or a numeric string such as ``"01001"``.
        county_fips_df: lookup frame with ``FIPS``, ``State Name`` and
            ``Area Name`` columns (``FIPS`` is expected to be numeric;
            verify against the county FIPS cell).

    Returns:
        A new frame with columns ``FIPS``, ``time_value``, ``issue``,
        ``value``, ``State Name`` and ``Area Name``. Rows removed by
        ``drop_non_county_rows`` are absent; rows with no match in the
        lookup are kept with missing names (left join).
    """
    # Normalise the key to a numeric dtype up front so that both the
    # modulo filter and the join against the numeric FIPS column work even
    # when geo_value arrives as zero-padded strings. Already-numeric input
    # is left unchanged by this step.
    covid_df = covid_df.assign(geo_value=pd.to_numeric(covid_df["geo_value"]))
    covid_df = drop_non_county_rows(covid_df)
    df = covid_df.merge(county_fips_df, how='left', left_on='geo_value', right_on='FIPS')
    df = df[["geo_value", "time_value", "issue", "value", "State Name", "Area Name"]]
    df = df.rename(columns={"geo_value": "FIPS"})
    return df

In [None]:
# Either rebuild the merged covid + county-name frames and save them, or
# load the previously saved merged CSVs. The branch is selected by the
# `remerge_covid_and_fips_data` flag rather than a hard-coded constant, so
# the merge can be re-run from the configuration cell.
if remerge_covid_and_fips_data:
    # Join every raw covid frame with the county FIPS lookup.
    confirmed_cumulative_cases_prop_fips = merge_covid_and_county_fips_dfs(confirmed_cumulative_cases_prop, county_fips)
    confirmed_daily_incidence_cases_prop_fips = merge_covid_and_county_fips_dfs(confirmed_daily_incidence_cases_prop, county_fips)
    cumulative_deaths_prop_fips = merge_covid_and_county_fips_dfs(cumulative_deaths_prop, county_fips)
    daily_incidence_deaths_prop_fips = merge_covid_and_county_fips_dfs(daily_incidence_deaths_prop, county_fips)
    perc_covid_doctor_visits_fips = merge_covid_and_county_fips_dfs(perc_covid_doctor_visits, county_fips)
    completely_home_prop_fips = merge_covid_and_county_fips_dfs(completely_home_prop, county_fips)
    perc_people_wearing_masks_fips = merge_covid_and_county_fips_dfs(perc_people_wearing_masks, county_fips)
    perc_people_tested_fips = merge_covid_and_county_fips_dfs(perc_people_tested, county_fips)
    perc_positive_tests_fips = merge_covid_and_county_fips_dfs(perc_positive_tests, county_fips)
    perc_wanted_test_fips = merge_covid_and_county_fips_dfs(perc_wanted_test, county_fips)

    # Save the merged frames so later runs can take the read-only branch.
    confirmed_cumulative_cases_prop_fips.to_csv("data/covidcast/clean/confirmed_cumulative_cases_prop_fips.csv")
    confirmed_daily_incidence_cases_prop_fips.to_csv("data/covidcast/clean/confirmed_daily_incidence_cases_prop_fips.csv")
    cumulative_deaths_prop_fips.to_csv("data/covidcast/clean/cumulative_deaths_prop_fips.csv")
    daily_incidence_deaths_prop_fips.to_csv("data/covidcast/clean/daily_incidence_deaths_prop_fips.csv")
    perc_covid_doctor_visits_fips.to_csv("data/covidcast/clean/perc_covid_doctor_visits_fips.csv")
    completely_home_prop_fips.to_csv("data/covidcast/clean/completely_home_prop_fips.csv")
    perc_people_wearing_masks_fips.to_csv("data/covidcast/clean/perc_people_wearing_masks_fips.csv")
    perc_people_tested_fips.to_csv("data/covidcast/clean/perc_people_tested_fips.csv")
    perc_positive_tests_fips.to_csv("data/covidcast/clean/perc_positive_tests_fips.csv")
    perc_wanted_test_fips.to_csv("data/covidcast/clean/perc_wanted_test_fips.csv")

else:
    # Load the merged frames written by an earlier run; the first CSV
    # column holds the saved index.
    confirmed_cumulative_cases_prop_fips = pd.read_csv("data/covidcast/clean/confirmed_cumulative_cases_prop_fips.csv", index_col=0)
    confirmed_daily_incidence_cases_prop_fips = pd.read_csv("data/covidcast/clean/confirmed_daily_incidence_cases_prop_fips.csv", index_col=0)
    cumulative_deaths_prop_fips = pd.read_csv("data/covidcast/clean/cumulative_deaths_prop_fips.csv", index_col=0)
    daily_incidence_deaths_prop_fips = pd.read_csv("data/covidcast/clean/daily_incidence_deaths_prop_fips.csv", index_col=0)
    perc_covid_doctor_visits_fips = pd.read_csv("data/covidcast/clean/perc_covid_doctor_visits_fips.csv", index_col=0)
    completely_home_prop_fips = pd.read_csv("data/covidcast/clean/completely_home_prop_fips.csv", index_col=0)
    perc_people_wearing_masks_fips = pd.read_csv("data/covidcast/clean/perc_people_wearing_masks_fips.csv", index_col=0)
    perc_people_tested_fips = pd.read_csv("data/covidcast/clean/perc_people_tested_fips.csv", index_col=0)
    perc_positive_tests_fips = pd.read_csv("data/covidcast/clean/perc_positive_tests_fips.csv", index_col=0)
    perc_wanted_test_fips = pd.read_csv("data/covidcast/clean/perc_wanted_test_fips.csv", index_col=0)

### Fetch and clean USDA County-level data

#### Poverty Estimates (2018)

In [None]:
# Poverty table: load the saved clean CSV, or rebuild it from the raw USDA
# spreadsheet by keeping and renaming the columns listed below.
if not regenerate_usda_poverty_data:
    poverty = pd.read_csv("data/usda_county_datasets/clean/poverty_2018.csv", index_col=0)
else:
    # raw column name -> clean column name (also fixes the column order)
    poverty_column_names = {
        "FIPStxt": "FIPS",
        "Stabr": "State Abrv",
        "Area_name": "Area Name",
        "PCTPOVALL_2018": "% Total Population in Poverty (2018)",
        "PCTPOV017_2018": "% Children Ages 0-17 in Poverty (2018)",
    }
    poverty = (
        pd.read_excel("data/usda_county_datasets/raw/PovertyEstimates.xls", skiprows=range(4))
        .loc[:, list(poverty_column_names)]
        .rename(columns=poverty_column_names)
    )
    poverty.to_csv("data/usda_county_datasets/clean/poverty_2018.csv")

#### Unemployment and Median Household Income Estimates (2018)

In [None]:
# Unemployment / median household income table: load the saved clean CSV,
# or rebuild it from the raw USDA spreadsheet.
if not regenerate_usda_unemployment_median_hhi_data:
    unemployment_median_hhi = pd.read_csv("data/usda_county_datasets/clean/unemployment_median_hhi_2018.csv", index_col=0)
else:
    # raw column name -> clean column name (also fixes the column order)
    unemployment_column_names = {
        "FIPStxt": "FIPS",
        "Stabr": "State Abrv",
        "area_name": "Area Name",
        "Unemployment_rate_2018": "% Unemployment (2018)",
        "Median_Household_Income_2018": "Median Household Income (2018)",
        "Med_HH_Income_Percent_of_State_Total_2018": "Median Household Income % of State Total (2018)",
    }
    unemployment_median_hhi = (
        pd.read_excel("data/usda_county_datasets/raw/Unemployment.xls", skiprows=range(7))
        .loc[:, list(unemployment_column_names)]
        .rename(columns=unemployment_column_names)
    )
    unemployment_median_hhi.to_csv("data/usda_county_datasets/clean/unemployment_median_hhi_2018.csv")
    

#### Population Estimates (2018)

In [None]:
# Population table: load the saved clean CSV, or rebuild it from the raw
# USDA spreadsheet.
if not regenerate_usda_population_data:
    population = pd.read_csv("data/usda_county_datasets/clean/population_2018.csv", index_col=0)
else:
    # raw column name -> clean column name (also fixes the column order)
    population_column_names = {
        "FIPStxt": "FIPS",
        "State": "State Abrv",
        "Area_Name": "Area Name",
        "POP_ESTIMATE_2018": "Population Estimate (2018)",
    }
    population = (
        pd.read_excel("data/usda_county_datasets/raw/PopulationEstimates.xls", skiprows=range(2))
        .loc[:, list(population_column_names)]
        .rename(columns=population_column_names)
    )
    population.to_csv("data/usda_county_datasets/clean/population_2018.csv")

#### Education Estimates (2014-2018)

In [None]:
# Education table: load the saved clean CSV, or rebuild it from the raw
# USDA spreadsheet.
if not regenerate_usda_education_data:
    education = pd.read_csv("data/usda_county_datasets/clean/education_2018.csv", index_col=0)
else:
    # raw column name -> clean column name (also fixes the column order).
    # The source columns cover 2014-18; the clean names are labelled 2018.
    education_column_names = {
        "FIPS Code": "FIPS",
        "State": "State Abrv",
        "Area name": "Area Name",
        "Percent of adults with less than a high school diploma, 2014-18": "% Adults Incomplete High School (2018)",
        "Percent of adults with a high school diploma only, 2014-18": "% Adults Complete High School (2018)",
        "Percent of adults completing some college or associate's degree, 2014-18": "% Adults Some College (2018)",
        "Percent of adults with a bachelor's degree or higher, 2014-18": "% Adults Complete College (2018)",
    }
    education = (
        pd.read_excel("data/usda_county_datasets/raw/Education.xls", skiprows=range(4))
        .loc[:, list(education_column_names)]
        .rename(columns=education_column_names)
    )
    education.to_csv("data/usda_county_datasets/clean/education_2018.csv")
