In [1]:
import os
from datetime import datetime
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [None]:
# --- CONFIGURATION ---

# Site roots. The NHANES URLs are spelled relative to the CDC host so the
# host name only appears once.
CDC_BASE_URL = "https://wwwn.cdc.gov"
NHANES_BASE_URL = CDC_BASE_URL + "/nchs/nhanes/"
# Landing page of the only cycle scraped for now (2017 - 2020).
CYCLE_2017_2020_ROOT_URL = NHANES_BASE_URL + "continuousnhanes/default.aspx?Cycle=2017-2020"

# Defaults for logging and network calls.
DEFAULT_LOGFILE = "log.txt"
DEFAULT_TIMEOUT = 30  # intended request timeout (requests interprets timeouts in seconds)

# One shared HTTP session so every request is sent with the same User-Agent.
USER_AGENT = "Mozilla/5.0 (compatible; NHANES-scraper/1.0)"
SESSION = requests.Session()
SESSION.headers["User-Agent"] = USER_AGENT

# --- END CONFIGURATION ---

# --- HELPER FUNCTIONS ---
def log(message, logfile="log.txt"):
    """Append one timestamped line to a log file.

    Args:
        message: Text to record; it is interpolated into the line as-is.
        logfile: Path of the file to append to. It is created if missing.

    The line has the form ``[YYYY-MM-DD HH:MM:SS] message`` and is written
    as UTF-8.
    """
    stamp = f"{datetime.now():%Y-%m-%d %H:%M:%S}"
    with open(logfile, mode="a", encoding="utf-8") as handle:
        # print() supplies the trailing newline
        print(f"[{stamp}] {message}", file=handle)

def clean_text(text: str):
    """Normalise a label into a lowercase, underscore-separated key.

    Leading and trailing whitespace is removed, every remaining space
    character becomes ``_`` (other whitespace such as tabs is left alone),
    and the result is lowercased.
    """
    return text.strip().replace(" ", "_").lower()

def clean_href(base_url, a):
    """Build an absolute URL from an anchor element's ``href``.

    Args:
        base_url: Prefix to prepend to relative hrefs.
        a: Anchor element (or None). It must support ``has_attr("href")``
            and ``a["href"]``.

    Returns:
        The resolved URL, or None when the anchor is missing, has no
        ``href``, or anything else goes wrong. Every failure is logged.

    Resolution is plain string concatenation, not RFC 3986 joining:
    a single leading ``../`` is dropped and the rest is appended to
    ``base_url``; any other relative href is appended unchanged. Hrefs that
    already carry an ``http://`` or ``https://`` scheme are returned as-is.
    """
    try:
        if not a:
            raise Exception("Anchor doesn't exist!")
        if not a.has_attr("href"):
            raise Exception("Anchor doesn't have href!")

        href = a["href"].strip()
        # Already absolute: prefixing base_url would yield a broken URL
        # such as "https://hosthttps://other/file".
        if href.lower().startswith(("http://", "https://")):
            return href
        if href.startswith("../"):
            return base_url + href[3:]
        return base_url + href
    except Exception as e:
        # Best-effort by design: callers treat None as "no link available".
        log(f"Expt in clean_href: {str(e)}")
        return None

def get_soup(url, timeout=DEFAULT_TIMEOUT) -> BeautifulSoup:
    """Fetch a page through the shared session and parse it as HTML.

    Args:
        url: Address of the page to download.
        timeout: Timeout handed to ``SESSION.get``. Defaults to the
            notebook-wide ``DEFAULT_TIMEOUT`` instead of a repeated literal.

    Returns:
        A BeautifulSoup tree built with the stdlib ``html.parser``.

    Raises:
        requests.HTTPError: if the server answers with an error status
            (via ``raise_for_status``).
    """
    r = SESSION.get(url, timeout=timeout)
    r.raise_for_status()
    return BeautifulSoup(r.text, 'html.parser')

# --- END HELPER FUNCTIONS ---

# --- SCRAPING FUNCTIONS ---

# DDC = Data, Documentation, Codebook
def get_ddc_links(nhanes_root_soup: BeautifulSoup) -> dict:
    """Collect the component links listed on a cycle's landing page.

    Args:
        nhanes_root_soup: Parsed landing page of an NHANES cycle.

    Returns:
        Dict mapping the cleaned link label (see ``clean_text``) to the
        URL resolved against ``NHANES_BASE_URL``. Anchors with no label or
        no resolvable href are left out, so every value is a usable URL.

    Raises:
        ValueError: if the container div holding the links is not found,
            which most likely means the page layout changed.
    """
    table = nhanes_root_soup.find(
        "div",
        attrs = {'class': 'card-body bg-white no-padding'}
    )
    if table is None:
        raise ValueError(
            "Could not find the links container "
            "(div.card-body.bg-white.no-padding) on the cycle page"
        )

    links = {}
    for a_element in table.find_all('a'):
        if not a_element.contents:
            continue  # nothing to derive a name from

        link = clean_href(NHANES_BASE_URL, a_element)
        if link is None:
            continue  # clean_href already logged the reason

        # The label is the anchor's last child; if that child is itself a
        # tag, fall back to its text so clean_text receives a string.
        last = a_element.contents[-1]
        label = last if isinstance(last, str) else last.get_text()
        links[clean_text(label)] = link
    return links


def get_category_subset_df(url) -> pd.DataFrame:
    """Scrape one component listing page into a DataFrame.

    Args:
        url: Address of a page containing the ``GridView1`` table.

    Returns:
        One row per table body row. Columns are the cleaned header texts
        (paired positionally with the cell texts) plus ``doc_url`` and
        ``data_url``, taken from the anchors in the third and fourth cells.
        A URL is None when its cell or anchor is absent. An empty DataFrame
        is returned, and the problem logged, when the table is missing.
    """
    soup = get_soup(url)
    table = soup.find('table', {'id': 'GridView1'}) # Tables are easily identified by id
    if table is None:
        log(f"Can't find table GridView1 for {url}!")
        return pd.DataFrame()

    # Column names come from the first child of each header cell
    headers = [
        clean_text(th.contents[0])
        for th in table.find('thead').find_all('th')
    ]

    rows = []
    for tr in table.find("tbody").find_all("tr"):
        tds = tr.find_all("td")
        if not tds:
            continue  # spacer row without data cells

        text_data = [td.get_text(strip=True) for td in tds]

        # Short rows have no doc/data cell; clean_href turns None into a
        # logged None URL instead of this lookup raising IndexError.
        doc_a  = tds[2].find("a") if len(tds) > 2 else None
        data_a = tds[3].find("a") if len(tds) > 3 else None

        row = dict(zip(headers, text_data))
        row["doc_url"] = clean_href(CDC_BASE_URL, doc_a)
        row["data_url"] = clean_href(CDC_BASE_URL, data_a)
        rows.append(row)

    return pd.DataFrame(rows)

def download_file(url, file_dir, write_log = True):
    """Stream a remote file into ``file_dir``.

    Args:
        url: Address of the file. The saved file name is
            ``os.path.basename(url)``.
        file_dir: Destination directory; created if it does not exist.
        write_log: When true, record the successful download via ``log``.

    Raises:
        requests.HTTPError: if the server answers with an error status.
            The destination directory is not created in that case.
    """
    file_name = os.path.basename(url)
    save_path = os.path.join(file_dir, file_name)

    # A timeout keeps a stalled server from hanging the notebook, and the
    # context manager releases the streamed connection even if writing fails.
    with SESSION.get(url, stream=True, timeout=DEFAULT_TIMEOUT) as r:
        r.raise_for_status()

        # exist_ok avoids the check-then-create race of exists() + makedirs()
        os.makedirs(file_dir, exist_ok=True)
        with open(save_path, 'wb') as f:
            for chunk in r.iter_content(chunk_size=16*1024):
                f.write(chunk)

    if write_log:
        log(f"{file_name} downloaded successfully to {save_path}")

def download_category_subset(cycle, category_subset):
    """Download every data file listed for one component.

    Args:
        cycle: Mapping of component name to its listing DataFrame; each
            frame is expected to have a ``data_url`` column.
        category_subset: Key into ``cycle``; also the name of the
            sub-folder under ``./RAW/DATA/`` that receives the files.

    Rows without a usable URL are logged and skipped.
    """
    for row in cycle[category_subset].to_dict("records"):
        url = row['data_url']
        # A missing URL may come back from the DataFrame as NaN rather than
        # None, and NaN is truthy, so `not url` alone would let it through.
        if pd.isna(url) or not url:
            log(f"{category_subset}: No data url found! Cannot download")
            continue
        download_file(url, "./RAW/DATA/" + category_subset)
        print(f"Successfully downloaded {url}")

def get_codebook_df(doc_url):
    """Scrape the codebook section of a documentation page.

    Args:
        doc_url: Address of the documentation page.

    Returns:
        DataFrame with one row per ``div.pagebreak`` entry inside the
        ``#Codebook`` div. Each ``<dt>``/``<dd>`` pair of the entry's
        definition list becomes a column (the cleaned ``<dt>`` text without
        trailing colons) holding the ``<dd>`` text. An empty DataFrame is
        returned when the page has no codebook section.
    """
    soup = get_soup(doc_url)
    codebook = soup.find('div', {'id': 'Codebook'})
    if not codebook:
        print(f"Can't find codebook for {doc_url}!")
        log(f"Can't find codebook for {doc_url}!")
        return pd.DataFrame()

    results = []
    for div in codebook.find_all('div', {'class': 'pagebreak'}):
        dl = div.find('dl')
        if dl is None:
            continue  # entry without a definition list: nothing to read

        code = {}
        for dt in dl.find_all('dt'):
            dd = dt.find_next_sibling('dd')
            if dd is None:
                continue  # term without a value

            code_feature_name = clean_text(dt.get_text()).rstrip(":")
            code[code_feature_name] = dd.get_text()

        results.append(code)
    return pd.DataFrame(results)

def get_category_subset_codebook(cycle, category_subset):
    """Save a codebook CSV for every file listed for one component.

    Args:
        cycle: Mapping of component name to its listing DataFrame; each
            frame is expected to have ``doc_url`` and ``data_url`` columns.
        category_subset: Key into ``cycle``; also the name of the
            sub-folder under ``./RAW/CODEBOOKS/`` that receives the CSVs.

    For each row the codebook is scraped from ``doc_url`` and written to
    ``<data file stem>_codebook.csv`` with an extra ``data_file`` column.
    Rows lacking either URL are logged and skipped, and nothing is written
    when the documentation page yields no codebook.
    """
    codebooks_path = f"./RAW/CODEBOOKS/{category_subset}"

    for row in cycle[category_subset].to_dict("records"):
        # Missing URLs may come back from the DataFrame as NaN rather than
        # None, and NaN is truthy, so both cases are checked.
        doc_url = row['doc_url']
        if pd.isna(doc_url) or not doc_url:
            log(f"{category_subset}: No doc url found! Cannot get codebook")
            continue

        # The data URL is only needed to name the output file
        url = row['data_url']
        if pd.isna(url) or not url:
            log(f"{category_subset}: No data url found! Cannot name codebook file")
            continue
        file_name = os.path.basename(url).split('.')[0]

        doc_df = get_codebook_df(doc_url)
        if doc_df.empty:
            # get_codebook_df already reported the problem; skip writing a
            # CSV that would contain nothing but a header line.
            continue

        doc_df['data_file'] = file_name
        os.makedirs(codebooks_path, exist_ok=True)
        doc_df.to_csv(codebooks_path + f"/{file_name}_codebook.csv", index = False)
# --- END SCRAPING FUNCTIONS ---

In [3]:
# Discover the component listing pages from the cycle's landing page
links = get_ddc_links(get_soup(CYCLE_2017_2020_ROOT_URL))

# Scrape each listing page into a DataFrame, keyed by component name
cycle_2017_2020_dfs = {}
for name, link in links.items():
    print(name, link)
    cycle_2017_2020_dfs[name] = get_category_subset_df(link)

# First pass: download the data files of every component
for category_subset in cycle_2017_2020_dfs:
    download_category_subset(cycle_2017_2020_dfs, category_subset)

# Second pass: scrape and save the codebooks of every component
for category_subset in cycle_2017_2020_dfs:
    get_category_subset_codebook(cycle_2017_2020_dfs, category_subset)

demographics_data https://wwwn.cdc.gov/nchs/nhanes/search/datapage.aspx?Component=Demographics&Cycle=2017-2020
dietary_data https://wwwn.cdc.gov/nchs/nhanes/search/datapage.aspx?Component=Dietary&Cycle=2017-2020
examination_data https://wwwn.cdc.gov/nchs/nhanes/search/datapage.aspx?Component=Examination&Cycle=2017-2020
laboratory_data https://wwwn.cdc.gov/nchs/nhanes/search/datapage.aspx?Component=Laboratory&Cycle=2017-2020
questionnaire_data https://wwwn.cdc.gov/nchs/nhanes/search/datapage.aspx?Component=Questionnaire&Cycle=2017-2020
limited_access_data https://wwwn.cdc.gov/nchs/nhanes/search/datapage.aspx?Component=LimitedAccess&Cycle=2017-2020
Successfully downloaded https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2017/DataFiles/P_DEMO.xpt
Successfully downloaded https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2017/DataFiles/P_DR1IFF.xpt
Successfully downloaded https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2017/DataFiles/P_DR2IFF.xpt
Successfully downloaded https://wwwn.cdc.gov/Nchs/Data/N

# Stitching together harvested data (IN PROGRESS 🚧)

In [None]:
# List every downloaded .xpt file, grouped by component folder, and count them.
root_folder = Path("RAW/DATA")

data_file_count = 0
for folder in root_folder.iterdir():
    # Skip stray non-directory entries instead of aborting the whole scan:
    # a `break` here would silently drop every folder listed after one.
    if not folder.is_dir():
        continue
    for file in folder.glob("*.xpt"):
        print(file)
        data_file_count += 1

print(f"TOTAL FILES: {data_file_count}")


RAW\DATA\demographics_data\P_DEMO.xpt
RAW\DATA\dietary_data\DSBI.xpt
RAW\DATA\dietary_data\DSII.xpt
RAW\DATA\dietary_data\DSPI.xpt
RAW\DATA\dietary_data\P_DR1IFF.xpt
RAW\DATA\dietary_data\P_DR1TOT.xpt
RAW\DATA\dietary_data\P_DR2IFF.xpt
RAW\DATA\dietary_data\P_DR2TOT.xpt
RAW\DATA\dietary_data\P_DRXFCD.xpt
RAW\DATA\dietary_data\P_DS1IDS.xpt
RAW\DATA\dietary_data\P_DS1TOT.xpt
RAW\DATA\dietary_data\P_DS2IDS.xpt
RAW\DATA\dietary_data\P_DS2TOT.xpt
RAW\DATA\dietary_data\P_DSQIDS.xpt
RAW\DATA\dietary_data\P_DSQTOT.xpt
RAW\DATA\examination_data\P_AUX.xpt
RAW\DATA\examination_data\P_AUXAR.xpt
RAW\DATA\examination_data\P_AUXTYM.xpt
RAW\DATA\examination_data\P_AUXWBR.xpt
RAW\DATA\examination_data\P_BMX.xpt
RAW\DATA\examination_data\P_BPXO.xpt
RAW\DATA\examination_data\P_DXXFEM.xpt
RAW\DATA\examination_data\P_DXXSPN.xpt
RAW\DATA\examination_data\P_LUX.xpt
RAW\DATA\examination_data\P_OHXDEN.xpt
RAW\DATA\examination_data\P_OHXREF.xpt
RAW\DATA\laboratory_data\P_ALB_CR.xpt
RAW\DATA\laboratory_data\P_BI