In [12]:
import re
from pathlib import Path
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [13]:
# url = "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2017/DataFiles/P_DEMO.htm"
# soup = BeautifulSoup(requests.get(url).text, "html.parser")

# records = []
# for h3 in soup.find_all("h3"):
#     var_name = h3.get_text(strip=True).replace("Variable:", "").strip()
#     bold = h3.find_next("b", string="English Text:")
#     desc = bold.next_sibling.strip() if bold and bold.next_sibling else ""
#     if var_name:
#         records.append((var_name, desc))

# df = pd.DataFrame(records, columns=["Variable", "Description"])
# df = df[df["Variable"].str.contains(" - ", na=False)]
# df[["Code", "Variable_Description"]] = df["Variable"].str.split(" - ", n=1, expand=True)
# df = df[["Code", "Variable_Description", "Description"]].reset_index(drop=True)

# print(df.head())

In [14]:
# url = 'https://wwwn.cdc.gov/nchs/nhanes/search/datapage.aspx?Component=Dietary&Cycle=2017-2020'
# soup = BeautifulSoup(requests.get(url).content, 'html.parser')

# data = []
# for table in soup.find_all('table'):
#     for row in table.find_all('tr')[1:]:  # Skip header
#         cols = row.find_all('td')
#         if len(cols) >= 3:
#             def extract_link(cell):
#                 tag = cell.find('a')
#                 text = tag.get_text(strip=True) if tag else cell.get_text(strip=True)
#                 url = tag.get('href', '') if tag else ''
#                 url = 'https://wwwn.cdc.gov' + url if url and not url.startswith('http') else url
#                 return text, url
            
#             doc_text, doc_url = extract_link(cols[2])
#             data_file, data_url = extract_link(cols[3]) if len(cols) >= 4 else ('', '')
            
#             data.append({
#                 'Years': cols[0].get_text(strip=True),
#                 'Data File Name': cols[1].get_text(strip=True),
#                 'Doc File': doc_text,
#                 'Doc URL': doc_url,
#                 'Data File': data_file,
#                 'Data URL': data_url,
#                 'Date Published': cols[4].get_text(strip=True) if len(cols) >= 5 else ''
#             })

# df = pd.DataFrame(data)
# print(f"Total rows: {len(df)}\n\n{df.to_string()}")

In [18]:
def extract_link(cell, base_url='https://wwwn.cdc.gov'):
    """Return the display text and absolute link target of a table cell.

    Parameters
    ----------
    cell
        A parsed table cell exposing ``find`` and ``get_text`` (e.g. a
        BeautifulSoup ``<td>`` tag).
    base_url : str, optional
        URL that non-absolute hrefs are resolved against.

    Returns
    -------
    tuple of (str, str)
        ``(text, url)``. ``text`` is the stripped text of the first ``<a>`` in
        the cell, or of the cell itself when it has no link. ``url`` is the
        absolute link target, or ``''`` when there is no link or no href.
    """
    tag = cell.find('a')
    if tag is None:
        return cell.get_text(strip=True), ''

    text = tag.get_text(strip=True)
    href = (tag.get('href') or '').strip()
    if not href:
        return text, ''
    if href.lower().startswith(('http://', 'https://')):
        # Already absolute: hand it back untouched.
        return text, href
    # urljoin (rather than plain concatenation) also copes with hrefs that lack
    # a leading slash ("P_DEMO.htm") or are protocol-relative ("//host/path").
    return text, urljoin(base_url, href)

# Get data files: scrape the NHANES data-page listing into one row per file.
DATA_FILES_URL = 'https://wwwn.cdc.gov/nchs/nhanes/search/datapage.aspx?Component=Dietary&Cycle=2017-2020'
DATA_FILE_COLUMNS = ['Years', 'Data File Name', 'Doc File', 'Doc URL',
                     'Data File', 'Data URL', 'Date Published']

# A timeout keeps a stalled connection from hanging the kernel, and
# raise_for_status() stops an HTTP error page from being scraped as if it were
# the listing.
listing_response = requests.get(DATA_FILES_URL, timeout=30)
listing_response.raise_for_status()
soup = BeautifulSoup(listing_response.content, 'html.parser')

data = []
for table in soup.find_all('table'):
    for table_row in table.find_all('tr')[1:]:  # skip each table's first (header) row
        cols = table_row.find_all('td')
        if len(cols) < 3:
            # Not a data-file row (needs at least years, file name and doc cell).
            continue
        doc_text, doc_url = extract_link(cols[2])
        data_file, data_url = extract_link(cols[3]) if len(cols) >= 4 else ('', '')
        data.append({
            'Years': cols[0].get_text(strip=True),
            'Data File Name': cols[1].get_text(strip=True),
            'Doc File': doc_text,
            'Doc URL': doc_url,
            'Data File': data_file,
            'Data URL': data_url,
            'Date Published': cols[4].get_text(strip=True) if len(cols) >= 5 else '',
        })

# Explicit columns keep the frame's schema stable even when no rows were found.
df_files = pd.DataFrame(data, columns=DATA_FILE_COLUMNS)
print(f"Found {len(df_files)} data files\n")

# Extract variables from each doc: every <h3> whose text looks like
# "CODE - label" becomes one row of the data dictionary.
DOC_REQUEST_TIMEOUT = 30  # seconds
VARIABLE_COLUMNS = ["Code", "Variable_Description", "Data_File"]
# Underscores are allowed so that codes such as DR1_020 are not silently dropped.
VARIABLE_CODE_RE = re.compile(r'[A-Z0-9_]+')


def parse_variable_heading(heading_text):
    """Split a heading of the form 'CODE - label' into ``(code, label)``.

    A leading "Variable:" marker is removed first. Returns ``None`` when the
    text has no " - " separator or the part before it is not made up solely of
    upper-case letters, digits and underscores.
    """
    name = heading_text.replace("Variable:", "").strip()
    code, separator, label = name.partition(" - ")
    code = code.strip()
    if not separator or not VARIABLE_CODE_RE.fullmatch(code):
        return None
    return code, label.strip()


all_variables = []
failed_docs = []
for file_row in df_files.to_dict('records'):
    doc_url = file_row['Doc URL']
    if not doc_url:
        continue
    file_name = file_row['Data File Name']
    print(f"Processing: {file_name}...")

    try:
        doc_response = requests.get(doc_url, timeout=DOC_REQUEST_TIMEOUT)
        doc_response.raise_for_status()
    except requests.RequestException as exc:
        # Keep going with the remaining docs, but make the gap visible.
        print(f"  Skipped: {exc}")
        failed_docs.append(file_name)
        continue

    doc_soup = BeautifulSoup(doc_response.text, "html.parser")
    parsed_headings = (parse_variable_heading(h3.get_text(strip=True))
                       for h3 in doc_soup.find_all("h3"))
    records = [parsed for parsed in parsed_headings if parsed is not None]

    if records:
        df_vars = pd.DataFrame(records, columns=["Code", "Variable_Description"])
        df_vars['Data_File'] = file_name
        all_variables.append(df_vars[VARIABLE_COLUMNS])

# The per-variable "English Text" description is not collected: it was dropped
# from df_all before use, so the final columns are unchanged. Building the empty
# fallback with explicit columns keeps the schema the same when nothing matched.
df_all = (pd.concat(all_variables, ignore_index=True) if all_variables
          else pd.DataFrame(columns=VARIABLE_COLUMNS))

if failed_docs:
    print(f"\nCould not download {len(failed_docs)} doc file(s): {failed_docs}")

print(f"\nTotal variables extracted: {len(df_all)}\n{df_all.head(20)}")

Found 14 data files

Processing: Dietary Interview - Individual Foods, First Day...
Processing: Dietary Interview - Individual Foods, Second Day...
Processing: Dietary Interview - Total Nutrient Intakes, First Day...
Processing: Dietary Interview - Total Nutrient Intakes, Second Day...
Processing: Dietary Interview Technical Support File - Food Codes...
Processing: Dietary Supplement Database - Blend Information...
Processing: Dietary Supplement Database - Ingredient Information...
Processing: Dietary Supplement Database - Product Information...
Processing: Dietary Supplement Use 24-Hour - Individual Dietary Supplements, First Day...
Processing: Dietary Supplement Use 24-Hour - Individual Dietary Supplements, Second Day...
Processing: Dietary Supplement Use 24-Hour - Total Dietary Supplements, First Day...
Processing: Dietary Supplement Use 24-Hour - Total Dietary Supplements, Second Day...
Processing: Dietary Supplement Use 30-Day - Individual Dietary Supplements...
Processing: Dietar

In [19]:
# export to csv
output_path = Path('TABLES/data_dict_dietary.csv')
# to_csv does not create missing directories, so make sure TABLES/ exists first.
output_path.parent.mkdir(parents=True, exist_ok=True)
df_all.to_csv(output_path, index=False)