In [1]:
import os

import numpy as np
import pandas as pd
from sodapy import Socrata

In [2]:
# Client for the CDC open-data portal; None is passed as the second argument
# (presumably the app token, i.e. unauthenticated access -- confirm against sodapy).
# NOTE(review): the client is never closed in the visible cells.
client = Socrata("data.cdc.gov", None)
# Server-side row filter. Values excluded per column:
#   hosp_yn, icu_yn       -> "Missing", "Unknown"
#   death_yn              -> "Missing", "Unknown", "NA"
#   sex, race, ethnicity  -> "NA"
#   age_group             -> "Missing"
where = (
    'hosp_yn != "Missing" and hosp_yn != "Unknown" and '
    'icu_yn != "Missing" and icu_yn != "Unknown" and '
    'death_yn != "Missing" and death_yn != "Unknown" and death_yn != "NA" and '
    'sex != "NA" and race != "NA" and ethnicity != "NA" and age_group != "Missing"'
)
# The limit is intentionally huge so that a single request returns every matching row.
results = client.get("n8mc-b4w4", limit=1_000_000_000_000, where=where)
cdc_df = pd.DataFrame.from_records(results)



In [3]:
# Report how many rows were retrieved, then preview the first few records.
print("Number of rows: {:,}".format(len(cdc_df)))
cdc_df.head()

Number of rows: 1,541,620


Unnamed: 0,case_month,res_state,state_fips_code,res_county,county_fips_code,age_group,sex,race,ethnicity,case_positive_specimen,case_onset_interval,process,exposure_yn,current_status,symptom_status,hosp_yn,icu_yn,death_yn,underlying_conditions_yn
0,2020-10,NV,32,CLARK,32003,0 - 17 years,Male,White,Non-Hispanic/Latino,0,0.0,Routine surveillance,Missing,Laboratory-confirmed case,Symptomatic,No,No,No,
1,2022-03,OH,39,CUYAHOGA,39035,18 to 49 years,Female,White,Non-Hispanic/Latino,0,0.0,Clinical evaluation,Yes,Laboratory-confirmed case,Symptomatic,No,No,No,Yes
2,2020-10,NV,32,CLARK,32003,0 - 17 years,Male,White,Non-Hispanic/Latino,0,0.0,Multiple,Yes,Laboratory-confirmed case,Symptomatic,No,No,No,
3,2020-10,NV,32,CLARK,32003,0 - 17 years,Male,White,Non-Hispanic/Latino,0,0.0,Routine surveillance,Missing,Laboratory-confirmed case,Symptomatic,No,No,No,
4,2022-03,OH,39,CUYAHOGA,39035,18 to 49 years,Female,White,Non-Hispanic/Latino,0,,Clinical evaluation,Yes,Laboratory-confirmed case,Symptomatic,No,No,No,No


In [4]:
# Sanity check on the query filter: show the distinct values (and how many
# there are) remaining in each of the three outcome columns.
print("=== Unique values of hosp_yn, icu_yn, and death_yn ===")
hosp = cdc_df['hosp_yn'].unique()
icu = cdc_df['icu_yn'].unique()
death = cdc_df['death_yn'].unique()
print(f"{hosp} ({len(hosp)})")
print(f"{icu} ({len(icu)})")
print(f"{death} ({len(death)})")

=== Unique values of hosp_yn, icu_yn, and death_yn ===
['No' 'Yes'] (2)
['No' 'Yes'] (2)
['No' 'Yes'] (2)


In [5]:
# List the sorted distinct values (and their count) of the month, state and
# age-group columns to see what the downloaded data covers.
print("=== Unique values of case_month, res_state, and age_group ===")
month = np.sort(cdc_df['case_month'].unique())
state = np.sort(cdc_df['res_state'].unique())
age = np.sort(cdc_df['age_group'].unique())
print(f"{month} ({len(month)})")
print(f"{state} ({len(state)})")
print(f"{age} ({len(age)})")

=== Unique values of case_month, res_state, and age_group ===
['2020-02' '2020-03' '2020-04' '2020-05' '2020-06' '2020-07' '2020-08'
 '2020-09' '2020-10' '2020-11' '2020-12' '2021-01' '2021-02' '2021-03'
 '2021-04' '2021-05' '2021-06' '2021-07' '2021-08' '2021-09' '2021-10'
 '2021-11' '2021-12' '2022-01' '2022-02' '2022-03' '2022-04' '2022-05'
 '2022-06' '2022-07' '2022-08' '2022-09' '2022-10'] (33)
['AK' 'AL' 'AR' 'AZ' 'CA' 'CO' 'DC' 'FL' 'GA' 'GU' 'IA' 'ID' 'IL' 'IN'
 'KS' 'KY' 'LA' 'MA' 'MD' 'MI' 'MN' 'MO' 'MS' 'MT' 'NC' 'ND' 'NH' 'NJ'
 'NV' 'NY' 'OH' 'OK' 'OR' 'PA' 'PR' 'SC' 'TN' 'TX' 'UT' 'VA' 'VI' 'VT'
 'WA' 'WI' 'WY'] (45)
['0 - 17 years' '18 to 49 years' '50 to 64 years' '65+ years'] (4)


In [7]:
# Persist the filtered dataset to disk. to_csv is called with its defaults,
# so the DataFrame index is written as the first (unnamed) CSV column.
# to_csv does not create missing parent directories, so make sure "data/"
# exists first; otherwise this cell raises OSError on a fresh checkout.
os.makedirs("data", exist_ok=True)
cdc_df.to_csv("data/cdc_covid_data.csv")