<a href="https://colab.research.google.com/github/ConstructoDestructo/Diabetes_AI_Instrument/blob/main/CDC_NHANES_Extractor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:

# =============================================================================
# CHUNK 1: WEB CRAWLER & CATALOG BUILDER
# =============================================================================
"""
Purpose: Scrape CDC NHANES website for all dataset metadata
Output: nhanes_catalog.json, nhanes_catalog.csv (1,716 datasets)
Time: ~5-10 minutes
"""

import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from tqdm import tqdm
import re

# Configuration
BASE_CYCLE_INDEX = "https://wwwn.cdc.gov/nchs/nhanes/continuousnhanes/default.aspx"
COMPONENTS = ["Demographics", "Dietary", "Examination", "Laboratory", "Questionnaire"]
OUTPUT_JSON = "nhanes_catalog.json"
OUTPUT_CSV = "nhanes_catalog.csv"
HEADERS = {"User-Agent": "Mozilla/5.0 (Data Research Bot; +https://example.com)"}
EXCLUDE = ["What We Eat in America", "Notice to Users"]

# Helper functions
def get_available_cycles():
    """Get list of available NHANES cycles from CDC website"""
    res = requests.get(BASE_CYCLE_INDEX, headers=HEADERS)
    soup = BeautifulSoup(res.text, "html.parser")
    cycles = []
    for link in soup.find_all("a", href=True):
        href = link["href"]
        link_text = link.text.strip()
        if ("BeginYear=" in href or "Cycle=" in href) and "NHANES" in link_text:
            if any(exclude in link_text for exclude in EXCLUDE):
                continue
            if link_text not in cycles:
                cycles.append(link_text)
    return cycles

def get_component_datasets(cycle, component):
    """Get all datasets for a specific cycle and component"""
    url = f"https://wwwn.cdc.gov/nchs/nhanes/search/datapage.aspx?Component={component}&Cycle={cycle}"
    res = requests.get(url, headers=HEADERS)
    if res.status_code != 200:
        print(f"⚠️ Failed to fetch {component} data for {cycle} (status {res.status_code})")
        return []

    soup = BeautifulSoup(res.text, "html.parser")
    table = soup.find("table", {"id": "GridView1"})
    if not table:
        return []

    # Extract cycle years for filtering
    cycle_years_match = re.findall(r'\d{4}', cycle)
    if len(cycle_years_match) == 2:
        cycle_start_year = int(cycle_years_match[0])
        cycle_end_year = int(cycle_years_match[1])
    elif len(cycle_years_match) == 1:
        cycle_start_year = cycle_end_year = int(cycle_years_match[0])
    else:
        cycle_start_year = cycle_end_year = None

    records = []
    rows = table.find_all("tr")[1:]  # skip header
    for row in rows:
        cols = row.find_all("td")
        if len(cols) < 4:
            continue
        dataset_name = cols[0].text.strip()
        years = cols[1].text.strip()
        data_link = cols[2].find("a")["href"] if cols[2].find("a") else None
        doc_link = cols[3].find("a")["href"] if cols[3].find("a") else None

        # Swap if years and dataset_name got mixed up
        if re.match(r'\d{4}-\d{4}', dataset_name):
            dataset_name, years = years, dataset_name

        # Fix relative URLs
        if data_link and data_link.startswith("/"):
            data_link = "https://wwwn.cdc.gov" + data_link
        if doc_link and doc_link.startswith("/"):
            doc_link = "https://wwwn.cdc.gov" + doc_link

        # Filter by cycle years
        dataset_years_match = re.search(r'(\d{4})-(\d{4})', years)
        if dataset_years_match:
            dataset_start_year = int(dataset_years_match.group(1))
            dataset_end_year = int(dataset_years_match.group(2))
            if cycle_start_year and cycle_end_year:
                if dataset_start_year < cycle_start_year or dataset_end_year > cycle_end_year:
                    continue

        print(f"   [Found] {dataset_name} ({years})")
        records.append({
            "cycle": cycle,
            "component": component,
            "dataset_name": dataset_name,
            "years": years,
            "data_url": doc_link,  # swapped on purpose (XPT download link)
            "doc_url": data_link
        })
    return records

def crawl_nhanes():
    """Main crawler function - scrapes all NHANES cycles and components"""
    all_records = []
    cycles = get_available_cycles()
    print(f"✅ Found {len(cycles)} valid NHANES cycles: {cycles}")

    for cycle in tqdm(cycles, desc="Crawling NHANES cycles"):
        print(f"\n🔹 Scanning cycle: {cycle}")

        # Always include Demographics first
        try:
            demographics_datasets = get_component_datasets(cycle, "Demographics")
            if demographics_datasets:
                all_records.extend(demographics_datasets)
                print(f"   [Added] Demographics dataset for {cycle}")
            time.sleep(1)
        except Exception as e:
            print(f"⚠️ Error fetching demographics for {cycle}: {e}")

        # Add the rest of the components
        for component in COMPONENTS:
            if component == "Demographics":
                continue
            try:
                datasets = get_component_datasets(cycle, component)
                all_records.extend(datasets)
                time.sleep(1)
            except Exception as e:
                print(f"⚠️ Error on {cycle} - {component}: {e}")

    df = pd.DataFrame(all_records)
    df.to_json(OUTPUT_JSON, orient="records", indent=2)
    df.to_csv(OUTPUT_CSV, index=False)
    print(f"\n✅ Crawling complete! Saved {len(df)} records to {OUTPUT_JSON} and {OUTPUT_CSV}.")
    return df

# Run the crawler
if __name__ == "__main__":
    df_catalog = crawl_nhanes()


✅ Found 13 valid NHANES cycles: ['NHANES 08/2021-08/2023', 'NHANES 2017-March 2020', 'NHANES 2019-2020', 'NHANES 2017-2018', 'NHANES 2015-2016', 'NHANES 2013-2014', 'NHANES 2011-2012', 'NHANES 2009-2010', 'NHANES 2007-2008', 'NHANES 2005-2006', 'NHANES 2003-2004', 'NHANES 2001-2002', 'NHANES 1999-2000']


Crawling NHANES cycles:   0%|          | 0/13 [00:00<?, ?it/s]


🔹 Scanning cycle: NHANES 08/2021-08/2023
   [Found] Demographic Variables and Sample Weights (2021-2023)
   [Added] Demographics dataset for NHANES 08/2021-08/2023
   [Found] Dietary Interview - Individual Foods, First Day (2021-2023)
   [Found] Dietary Interview - Individual Foods, Second Day (2021-2023)
   [Found] Dietary Interview - Total Nutrient Intakes, First Day (2021-2023)
   [Found] Dietary Interview - Total Nutrient Intakes, Second Day (2021-2023)
   [Found] Dietary Interview Technical Support File - Food Codes (2021-2023)
   [Found] Dietary Supplement Use 30-Day - Individual Dietary Supplements (2021-2023)
   [Found] Dietary Supplement Use 30-Day - Total Dietary Supplements (2021-2023)
   [Found] Balance (2021-2023)
   [Found] Blood Pressure - Oscillometric Measurements (2021-2023)
   [Found] Body Measures (2021-2023)
   [Found] Liver Ultrasound Transient Elastography (2021-2023)
   [Found] Albumin & Creatinine - Urine (2021-2023)
   [Found] alpha-1-Acid Glycoprotein (2021-

Crawling NHANES cycles:   8%|▊         | 1/13 [00:08<01:45,  8.83s/it]


🔹 Scanning cycle: NHANES 2017-March 2020
   [Found] Demographic Variables and Sample Weights (2017-2018)
   [Found] Demographic Variables and Sample Weights (2017-2020)
   [Added] Demographics dataset for NHANES 2017-March 2020
   [Found] Dietary Interview - Individual Foods, First Day (2017-2018)
   [Found] Dietary Interview - Individual Foods, First Day (2017-2020)
   [Found] Dietary Interview - Individual Foods, Second Day (2017-2018)
   [Found] Dietary Interview - Individual Foods, Second Day (2017-2020)
   [Found] Dietary Interview - Total Nutrient Intakes, First Day (2017-2018)
   [Found] Dietary Interview - Total Nutrient Intakes, First Day (2017-2020)
   [Found] Dietary Interview - Total Nutrient Intakes, Second Day (2017-2018)
   [Found] Dietary Interview - Total Nutrient Intakes, Second Day (2017-2020)
   [Found] Dietary Interview Technical Support File - Food Codes (2017-2018)
   [Found] Dietary Interview Technical Support File - Food Codes (2017-2020)
   [Found] Dietary Su

Crawling NHANES cycles:  15%|█▌        | 2/13 [00:17<01:35,  8.67s/it]


🔹 Scanning cycle: NHANES 2019-2020


Crawling NHANES cycles:  23%|██▎       | 3/13 [00:25<01:24,  8.46s/it]


🔹 Scanning cycle: NHANES 2017-2018
   [Found] Demographic Variables and Sample Weights (2017-2018)
   [Added] Demographics dataset for NHANES 2017-2018
   [Found] Dietary Interview - Individual Foods, First Day (2017-2018)
   [Found] Dietary Interview - Individual Foods, Second Day (2017-2018)
   [Found] Dietary Interview - Total Nutrient Intakes, First Day (2017-2018)
   [Found] Dietary Interview - Total Nutrient Intakes, Second Day (2017-2018)
   [Found] Dietary Interview Technical Support File - Food Codes (2017-2018)
   [Found] Dietary Supplement Use 24-Hour - Individual Dietary Supplements, First Day (2017-2018)
   [Found] Dietary Supplement Use 24-Hour - Individual Dietary Supplements, Second Day (2017-2018)
   [Found] Dietary Supplement Use 24-Hour - Total Dietary Supplements, First Day (2017-2018)
   [Found] Dietary Supplement Use 24-Hour - Total Dietary Supplements, Second Day (2017-2018)
   [Found] Dietary Supplement Use 30-Day - Individual Dietary Supplements (2017-2018)
  

Crawling NHANES cycles:  31%|███       | 4/13 [00:33<01:14,  8.31s/it]


🔹 Scanning cycle: NHANES 2015-2016
   [Found] Demographic Variables and Sample Weights (2015-2016)
   [Added] Demographics dataset for NHANES 2015-2016
   [Found] Dietary Interview - Individual Foods, First Day (2015-2016)
   [Found] Dietary Interview - Individual Foods, Second Day (2015-2016)
   [Found] Dietary Interview - Total Nutrient Intakes, First Day (2015-2016)
   [Found] Dietary Interview - Total Nutrient Intakes, Second Day (2015-2016)
   [Found] Dietary Interview Technical Support File - Food Codes (2015-2016)
   [Found] Dietary Supplement Use 24-Hour - Individual Dietary Supplements, First Day (2015-2016)
   [Found] Dietary Supplement Use 24-Hour - Individual Dietary Supplements, Second Day (2015-2016)
   [Found] Dietary Supplement Use 24-Hour - Total Dietary Supplements, First Day (2015-2016)
   [Found] Dietary Supplement Use 24-Hour - Total Dietary Supplements, Second Day (2015-2016)
   [Found] Dietary Supplement Use 30-Day - Individual Dietary Supplements (2015-2016)
  

Crawling NHANES cycles:  38%|███▊      | 5/13 [00:41<01:05,  8.24s/it]


🔹 Scanning cycle: NHANES 2013-2014
   [Found] Demographic Variables and Sample Weights (2013-2014)
   [Added] Demographics dataset for NHANES 2013-2014
   [Found] Dietary Interview - Individual Foods, First Day (2013-2014)
   [Found] Dietary Interview - Individual Foods, Second Day (2013-2014)
   [Found] Dietary Interview - Total Nutrient Intakes, First Day (2013-2014)
   [Found] Dietary Interview - Total Nutrient Intakes, Second Day (2013-2014)
   [Found] Dietary Interview Technical Support File - Food Codes (2013-2014)
   [Found] Dietary Supplement Use 24-Hour - Individual Dietary Supplements, First Day (2013-2014)
   [Found] Dietary Supplement Use 24-Hour - Individual Dietary Supplements, Second Day (2013-2014)
   [Found] Dietary Supplement Use 24-Hour - Total Dietary Supplements, First Day (2013-2014)
   [Found] Dietary Supplement Use 24-Hour - Total Dietary Supplements, Second Day (2013-2014)
   [Found] Dietary Supplement Use 30-Day - Individual Dietary Supplements (2013-2014)
  

Crawling NHANES cycles:  46%|████▌     | 6/13 [00:50<00:57,  8.28s/it]


🔹 Scanning cycle: NHANES 2011-2012
   [Found] Demographic Variables & Sample Weights (2011-2012)
   [Added] Demographics dataset for NHANES 2011-2012
   [Found] Dietary Interview - Individual Foods, First Day (2011-2012)
   [Found] Dietary Interview - Individual Foods, Second Day (2011-2012)
   [Found] Dietary Interview - Total Nutrient Intakes, First Day (2011-2012)
   [Found] Dietary Interview - Total Nutrient Intakes, Second Day (2011-2012)
   [Found] Dietary Interview Technical Support File - Food Codes (2011-2012)
   [Found] Dietary Interview Technical Support File - Modification Codes (2011-2012)
   [Found] Dietary Supplement Use 24-Hour - Individual Dietary Supplements, First Day (2011-2012)
   [Found] Dietary Supplement Use 24-Hour - Individual Dietary Supplements, Second Day (2011-2012)
   [Found] Dietary Supplement Use 24-Hour - Total Dietary Supplements, First Day (2011-2012)
   [Found] Dietary Supplement Use 24-Hour - Total Dietary Supplements, Second Day (2011-2012)
   [F

Crawling NHANES cycles:  54%|█████▍    | 7/13 [01:00<00:54,  9.01s/it]


🔹 Scanning cycle: NHANES 2009-2010
   [Found] Demographic Variables & Sample Weights (2009-2010)
   [Added] Demographics dataset for NHANES 2009-2010
   [Found] Dietary Interview - Individual Foods, First Day (2009-2010)
   [Found] Dietary Interview - Individual Foods, Second Day (2009-2010)
   [Found] Dietary Interview - Total Nutrient Intakes, First Day (2009-2010)
   [Found] Dietary Interview - Total Nutrient Intakes, Second Day (2009-2010)
   [Found] Dietary Interview Technical Support File - Food Codes (2009-2010)
   [Found] Dietary Interview Technical Support File - Modification Codes (2009-2010)
   [Found] Dietary Screener Questionnaire (2009-2010)
   [Found] Dietary Supplement Use 24-Hour - Individual Dietary Supplements, First Day (2009-2010)
   [Found] Dietary Supplement Use 24-Hour - Individual Dietary Supplements, Second Day (2009-2010)
   [Found] Dietary Supplement Use 24-Hour - Total Dietary Supplements, First Day (2009-2010)
   [Found] Dietary Supplement Use 24-Hour - T

Crawling NHANES cycles:  62%|██████▏   | 8/13 [01:11<00:48,  9.65s/it]


🔹 Scanning cycle: NHANES 2007-2008
   [Found] Demographic Variables & Sample Weights (2007-2008)
   [Added] Demographics dataset for NHANES 2007-2008
   [Found] Dietary Interview - Individual Foods, First Day (2007-2008)
   [Found] Dietary Interview - Individual Foods, Second Day (2007-2008)
   [Found] Dietary Interview - Total Nutrient Intakes, First Day (2007-2008)
   [Found] Dietary Interview - Total Nutrient Intakes, Second Day (2007-2008)
   [Found] Dietary Interview Technical Support File - Food Codes (2007-2008)
   [Found] Dietary Interview Technical Support File - Modification Codes (2007-2008)
   [Found] Dietary Supplement Use 24-Hour - Individual Dietary Supplements, First Day (2007-2008)
   [Found] Dietary Supplement Use 24-Hour - Individual Dietary Supplements, Second Day (2007-2008)
   [Found] Dietary Supplement Use 24-Hour - Total Dietary Supplements, First Day (2007-2008)
   [Found] Dietary Supplement Use 24-Hour - Total Dietary Supplements, Second Day (2007-2008)
   [F

Crawling NHANES cycles:  69%|██████▉   | 9/13 [01:20<00:37,  9.33s/it]


🔹 Scanning cycle: NHANES 2005-2006
   [Found] Demographic Variables & Sample Weights (2005-2006)
   [Added] Demographics dataset for NHANES 2005-2006
   [Found] Dietary Interview - Individual Foods, First Day (2005-2006)
   [Found] Dietary Interview - Individual Foods, Second Day (2005-2006)
   [Found] Dietary Interview - Total Nutrient Intakes, First Day (2005-2006)
   [Found] Dietary Interview - Total Nutrient Intakes, Second Day (2005-2006)
   [Found] Dietary Interview Technical Support File - Food Codes (2005-2006)
   [Found] Dietary Interview Technical Support File - Modification Codes (2005-2006)
   [Found] Dietary Supplement Use 30-Day - File 1, Supplement Counts (2005-2006)
   [Found] Dietary Supplement Use 30-Day - File 2, Participant's Use of Supplements (2005-2006)
   [Found] Food Frequency Questionnaire - Look-Up Table FOODLOOK (2005-2006)
   [Found] Food Frequency Questionnaire - Look-Up Table VARLOOK (2005-2006)
   [Found] Food Frequency Questionnaire - Output from DietC

Crawling NHANES cycles:  77%|███████▋  | 10/13 [01:29<00:27,  9.21s/it]


🔹 Scanning cycle: NHANES 2003-2004
   [Found] Demographic Variables & Sample Weights (2003-2004)
   [Added] Demographics dataset for NHANES 2003-2004
   [Found] Dietary Interview - Individual Foods, First Day (2003-2004)
   [Found] Dietary Interview - Individual Foods, Second Day (2003-2004)
   [Found] Dietary Interview - Total Nutrient Intakes, First Day (2003-2004)
   [Found] Dietary Interview - Total Nutrient Intakes, Second Day (2003-2004)
   [Found] Dietary Interview Technical Support File - Food Codes (2003-2004)
   [Found] Dietary Interview Technical Support File - Modification Codes (2003-2004)
   [Found] Dietary Supplement Use 30-Day - File 1, Supplement Counts (2003-2004)
   [Found] Dietary Supplement Use 30-Day - File 2, Participant's Use of Supplements (2003-2004)
   [Found] Food Frequency Questionnaire - Look-Up Table FOODLOOK (2003-2004)
   [Found] Food Frequency Questionnaire - Look-Up Table VARLOOK (2003-2004)
   [Found] Food Frequency Questionnaire - Output from DietC

Crawling NHANES cycles:  85%|████████▍ | 11/13 [01:38<00:18,  9.12s/it]


🔹 Scanning cycle: NHANES 2001-2002
   [Found] Demographic Variables & Sample Weights (2001-2002)
   [Added] Demographics dataset for NHANES 2001-2002
   [Found] Dietary Interview - Individual Foods (2001-2002)
   [Found] Dietary Interview - Total Nutrient Intakes (2001-2002)
   [Found] Dietary Interview Technical Support File - Food Code Format File (2001-2002)
   [Found] Dietary Supplement Use 30-Day - File 1, Supplement Counts (2001-2002)
   [Found] Dietary Supplement Use 30-Day - File 2, Participant's Use of Supplements (2001-2002)
   [Found] Audiometry (2001-2002)
   [Found] Audiometry - Acoustic Reflex (2001-2002)
   [Found] Audiometry - Tympanometry (2001-2002)
   [Found] Balance (2001-2002)
   [Found] Bioelectrical Impedance Analysis (2001-2002)
   [Found] Blood Pressure (2001-2002)
   [Found] Body Measures (2001-2002)
   [Found] Cardiovascular Fitness (2001-2002)
   [Found] Dual-Energy X-ray Absorptiometry - Whole Body, Second Exam (2001-2002)
   [Found] Lower Extremity Diseas

Crawling NHANES cycles:  92%|█████████▏| 12/13 [01:46<00:08,  8.91s/it]


🔹 Scanning cycle: NHANES 1999-2000
   [Found] Demographic Variables & Sample Weights (1999-2000)
   [Added] Demographics dataset for NHANES 1999-2000
   [Found] Dietary Interview - Individual Foods (1999-2000)
   [Found] Dietary Interview - Total Nutrient Intakes (1999-2000)
   [Found] Dietary Interview Technical Support File - Food Code Format File (1999-2000)
   [Found] Dietary Supplement Use 30-Day - File 1, Supplement Counts (1999-2000)
   [Found] Dietary Supplement Use 30-Day - File 2, Participant's Use of Supplements (1999-2000)
   [Found] Audiometry (1999-2000)
   [Found] Audiometry - Acoustic Reflex (1999-2000)
   [Found] Audiometry - Tympanometry (1999-2000)
   [Found] Balance (1999-2000)
   [Found] Bioelectrical Impedance Analysis (1999-2000)
   [Found] Blood Pressure (1999-2000)
   [Found] Body Measures (1999-2000)
   [Found] Cardiovascular Fitness (1999-2000)
   [Found] Lower Extremity Disease - Ankle Brachial Blood Pressure Index (1999-2000)
   [Found] Lower Extremity Dis

Crawling NHANES cycles: 100%|██████████| 13/13 [01:54<00:00,  8.84s/it]


✅ Crawling complete! Saved 1716 records to nhanes_catalog.json and nhanes_catalog.csv.





In [2]:
# =============================================================================
# CHUNK 2: AI FILTER - SEMANTIC & TF-IDF FILTERING
# =============================================================================
"""
Purpose: Narrow down 1,716 datasets to diabetes-relevant ones using AI
Output: nhanes_ai_semantic_dynamic.csv (158 datasets)
Time: ~2-3 minutes
"""
import requests
!pip install kneed
from bs4 import BeautifulSoup
import pandas as pd
import time
from tqdm import tqdm
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from kneed import KneeLocator
import numpy as np
import matplotlib.pyplot as plt

# Load the catalog from Chunk 1
df_catalog = pd.read_csv("nhanes_catalog.csv")

# Separate demographics (always keep) from other datasets (filter these)
df_demographics = df_catalog[df_catalog["component"] == "Demographics"].copy()
df_non_demographics = df_catalog[df_catalog["component"] != "Demographics"].copy()

print(f"🔹 Demographics datasets preserved: {len(df_demographics)}")
print(f"🔹 Non-demographics datasets to filter: {len(df_non_demographics)}")

# Step 1: Biomedical keyword expansion
SEED_KEYWORDS = [
    "diabetes", "prediabetes", "glucose", "fasting glucose", "hba1c",
    "hemoglobin a1c", "insulin", "c-peptide", "triglyceride", "cholesterol",
    "hdl", "ldl", "body mass index", "bmi", "waist", "weight", "obesity",
    "hypertension","body measures", "anthropometry", "height", "standing height"
]

BIOMED_SYNONYMS = {
    "diabetes": ["type 2 diabetes", "type 1 diabetes", "hyperglycemia", "impaired glucose tolerance", "diabetic", "glucose intolerance"],
    "prediabetes": ["impaired fasting glucose", "impaired glucose tolerance", "borderline diabetes"],
    "glucose": ["blood sugar", "serum glucose", "fasting glucose", "plasma glucose"],
    "hba1c": ["hemoglobin a1c", "glycated hemoglobin", "glycohemoglobin", "a1c"],
    "insulin": ["fasting insulin", "serum insulin", "insulin resistance", "c-peptide"],
    "cholesterol": ["hdl", "ldl", "triglyceride", "lipids", "total cholesterol"],
    "obesity": ["overweight", "adiposity", "body mass index", "bmi", "waist circumference"],
    "hypertension": ["high blood pressure", "systolic", "diastolic"],
    "bmi": ["body mass index", "obesity indicator"],
    "triglyceride": ["blood lipids", "fatty acids", "serum triglycerides"],
    "hdl": ["good cholesterol"],
    "ldl": ["bad cholesterol"],
}

def expand_biomedical_keywords(seed_keywords, synonym_map):
    """Expand seed keywords with biomedical synonyms"""
    expanded = set(seed_keywords)
    for kw in seed_keywords:
        if kw.lower() in synonym_map:
            expanded.update(synonym_map[kw.lower()])
    return list(expanded)

expanded_keywords = expand_biomedical_keywords(SEED_KEYWORDS, BIOMED_SYNONYMS)
print(f"✅ Biomedical expansion complete: {len(expanded_keywords)} keywords")

# Step 2: TF-IDF filtering with noise reduction
NOISE_WORDS = set([
    "exam", "second", "file", "test", "data", "sheet", "survey",
    "questionnaire", "study", "participant", "demographics", "sample",
    "component"
])

dataset_texts = (df_non_demographics["dataset_name"] + " " + df_non_demographics["years"]).tolist()

# First-pass TF-IDF
vectorizer = TfidfVectorizer(ngram_range=(1,1))
tfidf_matrix = vectorizer.fit_transform(dataset_texts + [" ".join(expanded_keywords)])
seed_vector = tfidf_matrix[-1].toarray().flatten()
vocab = vectorizer.get_feature_names_out()
top_indices = seed_vector.argsort()[-50:][::-1]
first_pass_candidates = [vocab[i] for i in top_indices if vocab[i].lower() not in NOISE_WORDS]
first_pass_candidates = list(dict.fromkeys(first_pass_candidates))

# Second-pass TF-IDF
second_pass_doc = " ".join(expanded_keywords + first_pass_candidates)
vectorizer2 = TfidfVectorizer(ngram_range=(1,1))
tfidf_matrix2 = vectorizer2.fit_transform(dataset_texts + [second_pass_doc])
second_vector = tfidf_matrix2[-1].toarray().flatten()
vocab2 = vectorizer2.get_feature_names_out()
top_indices2 = second_vector.argsort()[-50:][::-1]
second_pass_candidates = [vocab2[i] for i in top_indices2 if vocab2[i].lower() not in NOISE_WORDS]
second_pass_candidates = list(dict.fromkeys(second_pass_candidates))

# Combine and compute final relevance scores
final_keywords = list(dict.fromkeys(expanded_keywords + first_pass_candidates + second_pass_candidates))
keyword_doc = " ".join(expanded_keywords * 3 + first_pass_candidates + second_pass_candidates)
vectorizer_final = TfidfVectorizer(ngram_range=(1,1))
tfidf_final = vectorizer_final.fit_transform(dataset_texts + [keyword_doc])
dataset_matrix_final = tfidf_final[:-1]
keyword_vector_final = tfidf_final[-1]
similarities = cosine_similarity(dataset_matrix_final, keyword_vector_final)
df_non_demographics["relevance_score"] = similarities.flatten()

# Apply threshold filter
threshold = 0.05
df_ai_filtered = df_non_demographics[df_non_demographics["relevance_score"] >= threshold].sort_values(by="relevance_score", ascending=False)
print(f"\n✅ AI filter applied: {len(df_ai_filtered)} datasets selected (excluding demographics)")

# Step 3: Semantic refinement with sentence transformers
print("🧠 Computing sentence embeddings...")
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
dataset_texts_step2 = (df_ai_filtered["dataset_name"] + " " + df_ai_filtered["years"]).tolist()
embeddings = model.encode(dataset_texts_step2, batch_size=32, show_progress_bar=True)
keyword_embedding = model.encode(" ".join(final_keywords))

cosine_similarities = np.dot(embeddings, keyword_embedding) / (
    np.linalg.norm(embeddings, axis=1) * np.linalg.norm(keyword_embedding)
)
df_ai_filtered["semantic_score"] = cosine_similarities

# Dynamic threshold using KneeLocator
scores_sorted = np.sort(df_ai_filtered["semantic_score"].values)[::-1]
x = np.arange(len(scores_sorted))
y = scores_sorted
knee = KneeLocator(x, y, curve='convex', direction='decreasing')

if knee.knee is not None:
    dynamic_threshold = y[knee.knee]
else:
    dynamic_threshold = 0.0

df_dynamic_filtered = df_ai_filtered[df_ai_filtered["semantic_score"] >= dynamic_threshold]

# Ensure minimum dataset count
min_datasets = 145
if len(df_dynamic_filtered) < min_datasets:
    df_dynamic_filtered = df_ai_filtered.sort_values(by="semantic_score", ascending=False).iloc[:min_datasets]

df_dynamic_filtered = df_dynamic_filtered.sort_values(by="semantic_score", ascending=False)

# Merge demographics back in
df_final = pd.concat([df_dynamic_filtered, df_demographics], ignore_index=True)
df_final.to_csv("nhanes_ai_semantic_dynamic.csv", index=False)
print(f"✅ Dynamic semantic refinement complete: {len(df_final)} datasets saved → nhanes_ai_semantic_dynamic.csv")

Collecting kneed
  Downloading kneed-0.8.5-py3-none-any.whl.metadata (5.5 kB)
Downloading kneed-0.8.5-py3-none-any.whl (10 kB)
Installing collected packages: kneed
Successfully installed kneed-0.8.5
🔹 Demographics datasets preserved: 13
🔹 Non-demographics datasets to filter: 1703
✅ Biomedical expansion complete: 53 keywords

✅ AI filter applied: 197 datasets selected (excluding demographics)
🧠 Computing sentence embeddings...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

✅ Dynamic semantic refinement complete: 158 datasets saved → nhanes_ai_semantic_dynamic.csv


In [3]:
# =============================================================================
# CHUNK 2.1: ADD ALL DEMOGRAPHICS TO FILTERED CATALOG
# =============================================================================
# Purpose: Ensure ALL demographics datasets are included, not just AI-filtered ones
# Input:  nhanes_catalog.csv (from Chunk 1)
#         nhanes_ai_semantic_dynamic.csv (from Chunk 2)
# Output: nhanes_ai_semantic_dynamic.csv (updated with all demographics)
# =============================================================================

import pandas as pd

print("="*70)
print("🚀 CHUNK 2.1: DEMOGRAPHICS INTEGRATION")
print("="*70)
print("Purpose: Add ALL demographics datasets to filtered catalog")
print("="*70 + "\n")

# Load original catalog (all datasets)
df_catalog = pd.read_csv("nhanes_catalog.csv")

# Load AI-filtered catalog
df_filtered = pd.read_csv("nhanes_ai_semantic_dynamic.csv")

print(f"📊 Current filtered catalog: {len(df_filtered)} datasets")

# Get ALL demographics datasets from original catalog
df_demographics_all = df_catalog[df_catalog['component'] == 'Demographics'].copy()

print(f"📊 Total demographics datasets available: {len(df_demographics_all)}")

# Check how many demographics are already in filtered catalog
demographics_in_filtered = df_filtered[df_filtered['component'] == 'Demographics']
print(f"📊 Demographics already in filtered catalog: {len(demographics_in_filtered)}")

# Find demographics that are MISSING from filtered catalog
demographics_cycles_in_filtered = set(demographics_in_filtered['cycle'].unique())
demographics_cycles_all = set(df_demographics_all['cycle'].unique())

missing_cycles = demographics_cycles_all - demographics_cycles_in_filtered

print(f"\n⚠️  Missing demographics from {len(missing_cycles)} cycles:")
for cycle in sorted(missing_cycles):
    print(f"   - {cycle}")

# Get the missing demographics datasets
df_demographics_missing = df_demographics_all[
    df_demographics_all['cycle'].isin(missing_cycles)
].copy()

# Add placeholder scores for demographics (they're mandatory regardless of score)
df_demographics_missing['relevance_score'] = 1.0
df_demographics_missing['semantic_score'] = 1.0

# Combine: Remove old demographics, add ALL demographics
df_filtered_no_demographics = df_filtered[df_filtered['component'] != 'Demographics']

df_final = pd.concat([
    df_filtered_no_demographics,
    df_demographics_all  # ALL demographics, not just filtered ones
], ignore_index=True)

# Sort by cycle and component for readability
df_final = df_final.sort_values(['cycle', 'component', 'dataset_name']).reset_index(drop=True)

# Add scores to demographics if missing
df_final.loc[df_final['component'] == 'Demographics', 'relevance_score'] = df_final.loc[
    df_final['component'] == 'Demographics', 'relevance_score'
].fillna(1.0)

df_final.loc[df_final['component'] == 'Demographics', 'semantic_score'] = df_final.loc[
    df_final['component'] == 'Demographics', 'semantic_score'
].fillna(1.0)

# Save updated catalog
df_final.to_csv("nhanes_ai_semantic_dynamic.csv", index=False)

print(f"\n✅ Updated filtered catalog: {len(df_final)} datasets")
print(f"   Demographics: {len(df_final[df_final['component'] == 'Demographics'])}")
print(f"   Other filtered: {len(df_final[df_final['component'] != 'Demographics'])}")

# Show breakdown by component
print("\n📋 Final catalog breakdown by component:")
component_counts = df_final['component'].value_counts()
for component, count in component_counts.items():
    print(f"   {component}: {count}")

# Show cycles with demographics
demographics_cycles = sorted(df_final[df_final['component'] == 'Demographics']['cycle'].unique())
print(f"\n📋 Cycles with demographics data: {len(demographics_cycles)}")
for cycle in demographics_cycles:
    demog_count = len(df_final[(df_final['cycle'] == cycle) & (df_final['component'] == 'Demographics')])
    print(f"   {cycle}: {demog_count} demographics dataset(s)")

print("\n" + "="*70)
print("✅ CHUNK 2.1 COMPLETE!")
print("="*70)
print(f"📁 Output: nhanes_ai_semantic_dynamic.csv (updated)")
print("="*70)

🚀 CHUNK 2.1: DEMOGRAPHICS INTEGRATION
Purpose: Add ALL demographics datasets to filtered catalog

📊 Current filtered catalog: 158 datasets
📊 Total demographics datasets available: 13
📊 Demographics already in filtered catalog: 13

⚠️  Missing demographics from 0 cycles:

✅ Updated filtered catalog: 158 datasets
   Demographics: 13
   Other filtered: 145

📋 Final catalog breakdown by component:
   Laboratory: 99
   Questionnaire: 26
   Examination: 20
   Demographics: 13

📋 Cycles with demographics data: 12
   NHANES 08/2021-08/2023: 1 demographics dataset(s)
   NHANES 1999-2000: 1 demographics dataset(s)
   NHANES 2001-2002: 1 demographics dataset(s)
   NHANES 2003-2004: 1 demographics dataset(s)
   NHANES 2005-2006: 1 demographics dataset(s)
   NHANES 2007-2008: 1 demographics dataset(s)
   NHANES 2009-2010: 1 demographics dataset(s)
   NHANES 2011-2012: 1 demographics dataset(s)
   NHANES 2013-2014: 1 demographics dataset(s)
   NHANES 2015-2016: 1 demographics dataset(s)
   NHANES 20

In [4]:
# =============================================================================
# CHUNK 2.2: DEDUPLICATE FILTERED CATALOG
# =============================================================================
# Purpose: Remove duplicate entries from Chunk 2 output
# Input:  nhanes_ai_semantic_dynamic.csv (from Chunk 2)
# Output: nhanes_ai_semantic_dynamic.csv (cleaned, deduplicated)
# =============================================================================

import pandas as pd

print("="*70)
print("🚀 CHUNK 2.1: DEDUPLICATE FILTERED CATALOG")
print("="*70)
print("Purpose: Remove duplicate entries before downloading")
print("="*70 + "\n")

# Load the filtered catalog
df_filtered = pd.read_csv("nhanes_ai_semantic_dynamic.csv")

print(f"📊 Original filtered catalog:")
print(f"   Total entries: {len(df_filtered)}")

# Identify duplicates
duplicates = df_filtered[df_filtered.duplicated(
    subset=['cycle', 'component', 'dataset_name'],
    keep=False
)]

if len(duplicates) > 0:
    print(f"\n⚠️  Found {len(duplicates)} duplicate entries!\n")

    # Show duplicates by cycle
    duplicate_cycles = duplicates.groupby('cycle').size().sort_values(ascending=False)
    print("📋 Duplicates by cycle:")
    for cycle, count in duplicate_cycles.items():
        print(f"   {cycle}: {count} duplicate entries")

    print("\n📋 Duplicate datasets:")
    duplicate_datasets = duplicates[['cycle', 'component', 'dataset_name']].drop_duplicates()
    for _, row in duplicate_datasets.iterrows():
        print(f"   - {row['cycle']} | {row['component']} | {row['dataset_name']}")

    # Remove duplicates - keep first occurrence (usually has better scores)
    print(f"\n🔧 Removing duplicates (keeping first occurrence)...")
    df_cleaned = df_filtered.drop_duplicates(
        subset=['cycle', 'component', 'dataset_name'],
        keep='first'
    ).reset_index(drop=True)

    print(f"\n✅ Cleaned catalog:")
    print(f"   Total entries: {len(df_cleaned)}")
    print(f"   Removed: {len(df_filtered) - len(df_cleaned)} duplicates")

    # Verify by component
    print(f"\n📊 Breakdown by component:")
    print(f"   Before → After")
    for comp in ['Demographics', 'Laboratory', 'Questionnaire', 'Examination', 'Dietary']:
        before = len(df_filtered[df_filtered['component'] == comp])
        after = len(df_cleaned[df_cleaned['component'] == comp])
        diff = before - after
        if diff > 0:
            print(f"   {comp}: {before} → {after} (-{diff})")
        else:
            print(f"   {comp}: {before} → {after}")

    # Save cleaned catalog (overwrite original)
    df_cleaned.to_csv("nhanes_ai_semantic_dynamic.csv", index=False)

    print(f"\n💾 Saved cleaned catalog:")
    print(f"   File: nhanes_ai_semantic_dynamic.csv")
    print(f"   Entries: {len(df_cleaned)}")

else:
    print("✅ No duplicates found!")
    print("   Catalog is already clean.")
    df_cleaned = df_filtered

# Final summary
print("\n" + "="*70)
print("✅ CHUNK 2.1 COMPLETE!")
print("="*70)
print(f"📊 Final catalog stats:")
print(f"   Total unique datasets: {len(df_cleaned)}")
print(f"   Demographics: {len(df_cleaned[df_cleaned['component'] == 'Demographics'])}")
print(f"   Laboratory: {len(df_cleaned[df_cleaned['component'] == 'Laboratory'])}")
print(f"   Questionnaire: {len(df_cleaned[df_cleaned['component'] == 'Questionnaire'])}")
print(f"   Examination: {len(df_cleaned[df_cleaned['component'] == 'Examination'])}")
print(f"   Dietary: {len(df_cleaned[df_cleaned['component'] == 'Dietary'])}")

print(f"\n📋 Cycles covered:")
cycles = sorted(df_cleaned['cycle'].unique())
print(f"   {', '.join(cycles)}")

print("\n✅ Ready for Chunk 3 (downloading)")
print("="*70)

🚀 CHUNK 2.1: DEDUPLICATE FILTERED CATALOG
Purpose: Remove duplicate entries before downloading

📊 Original filtered catalog:
   Total entries: 158

⚠️  Found 22 duplicate entries!

📋 Duplicates by cycle:
   NHANES 2017-March 2020: 22 duplicate entries

📋 Duplicate datasets:
   - NHANES 2017-March 2020 | Demographics | Demographic Variables and Sample Weights
   - NHANES 2017-March 2020 | Examination | Body Measures
   - NHANES 2017-March 2020 | Laboratory | Cholesterol - High - Density Lipoprotein (HDL)
   - NHANES 2017-March 2020 | Laboratory | Cholesterol - Low-Density Lipoproteins (LDL) & Triglycerides
   - NHANES 2017-March 2020 | Laboratory | Cholesterol - Total
   - NHANES 2017-March 2020 | Laboratory | Fasting Questionnaire
   - NHANES 2017-March 2020 | Laboratory | Glycohemoglobin
   - NHANES 2017-March 2020 | Laboratory | Insulin
   - NHANES 2017-March 2020 | Laboratory | Plasma Fasting Glucose
   - NHANES 2017-March 2020 | Questionnaire | Blood Pressure & Cholesterol
   - NHA

In [5]:
# =============================================================================
# CHUNK 3: XPT FILE DOWNLOADER (FULLY OPTIMIZED)
# =============================================================================
# 🚀 OPTIMIZATIONS:
#   - Parallel downloads (10 workers)
#   - Exponential backoff retry logic
#   - Automatic resume (skips existing files)
#   - Progress tracking with statistics
#   - Robust error handling
#
# ⚡ EXPECTED TIME: 3-6 minutes (vs 15-30 minutes sequential)
# =============================================================================

import os
!pip install pyreadstat
import requests
import pyreadstat
import pandas as pd
from tqdm import tqdm
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from threading import Lock

print("="*70)
print("🚀 CHUNK 3: XPT FILE DOWNLOADER (OPTIMIZED)")
print("="*70)
print("⚡ Parallel downloads with 10 workers")
print("⚡ Smart retry with exponential backoff")
print("⚡ Automatic resume capability")
print("="*70 + "\n")

# =============================================================================
# CONFIGURATION
# =============================================================================

OUTPUT_DIR = "nhanes_data"
os.makedirs(OUTPUT_DIR, exist_ok=True)

MAX_RETRIES = 5
INITIAL_RETRY_DELAY = 2  # Doubles each retry
MAX_WORKERS = 10
REQUEST_TIMEOUT = 30

# Thread-safe statistics
stats_lock = Lock()
stats = {
    'successful': 0,
    'skipped': 0,
    'failed': 0
}

# =============================================================================
# FUNCTIONS
# =============================================================================

def update_stats(category):
    """Thread-safe stats update"""
    with stats_lock:
        stats[category] += 1

def download_and_convert_xpt(row):
    """
    Download a single XPT file and convert to CSV
    Returns: (success: bool, filename: str, message: str)
    """
    dataset_name = row["dataset_name"]
    xpt_url = row["data_url"]

    # Validate URL
    if not isinstance(xpt_url, str) or not xpt_url.strip():
        update_stats('failed')
        return (False, dataset_name, "No URL provided")

    # Clean URL
    xpt_url = xpt_url.strip().replace("\n", "").replace("\r", "")
    if xpt_url.startswith("/"):
        xpt_url = "https://wwwn.cdc.gov" + xpt_url

    # Create unique filename
    cycle = row.get("cycle", "UnknownCycle")
    component = row.get("component", "UnknownComponent")
    filename = f"{cycle}_{component}_{dataset_name}".replace("/", "_").replace(" ", "_") + ".csv"
    filepath = os.path.join(OUTPUT_DIR, filename)

    # Skip if already exists
    if os.path.exists(filepath):
        update_stats('skipped')
        return (True, filename, "Already exists")

    # Download with exponential backoff retry
    retry_delay = INITIAL_RETRY_DELAY

    for attempt in range(1, MAX_RETRIES + 1):
        temp_xpt = f"temp_{os.getpid()}_{time.time()}.xpt"  # Unique temp file per thread

        try:
            # Download XPT file
            resp = requests.get(xpt_url, timeout=REQUEST_TIMEOUT, allow_redirects=True)
            resp.raise_for_status()

            # Save temporarily
            with open(temp_xpt, "wb") as f:
                f.write(resp.content)

            # Convert XPT to DataFrame
            df_xpt, meta = pyreadstat.read_xport(temp_xpt, encoding="latin1")

            # Add metadata columns
            df_xpt["NHANES_Cycle"] = cycle
            df_xpt["Component"] = component
            df_xpt["Dataset_Name"] = dataset_name
            df_xpt["Doc_URL"] = row.get("doc_url", "")

            # Save as CSV
            df_xpt.to_csv(filepath, index=False)

            # Cleanup temp file
            if os.path.exists(temp_xpt):
                os.remove(temp_xpt)

            update_stats('successful')
            return (True, filename, f"Downloaded (attempt {attempt})")

        except Exception as e:
            # Cleanup temp file on error
            if os.path.exists(temp_xpt):
                try:
                    os.remove(temp_xpt)
                except:
                    pass

            if attempt < MAX_RETRIES:
                # Wait with exponential backoff before retry
                time.sleep(retry_delay)
                retry_delay *= 2  # Double the delay
            else:
                # All attempts failed
                update_stats('failed')
                error_msg = str(e)[:80]
                return (False, filename, f"Failed after {MAX_RETRIES} attempts: {error_msg}")

    # Should never reach here
    update_stats('failed')
    return (False, filename, "Unknown error")

# =============================================================================
# MAIN EXECUTION
# =============================================================================

# Load filtered dataset list from Chunk 2
try:
    df_filtered = pd.read_csv("nhanes_ai_semantic_dynamic.csv")
except FileNotFoundError:
    print("❌ ERROR: nhanes_ai_semantic_dynamic.csv not found!")
    print("   Make sure you ran Chunk 2 first.")
    raise

total_files = len(df_filtered)

print(f"📥 Preparing to download {total_files} datasets")
print(f"⚙️  Using {MAX_WORKERS} parallel workers\n")

start_time = time.time()

# Track failed files
failed_files = []

# Parallel download with thread pool
with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    # Submit all download tasks
    future_to_row = {
        executor.submit(download_and_convert_xpt, row): row
        for _, row in df_filtered.iterrows()
    }

    # Process completed downloads with progress bar
    with tqdm(total=total_files, desc="Downloading datasets", unit="file") as pbar:
        for future in as_completed(future_to_row):
            success, filename, message = future.result()

            if not success:
                failed_files.append((filename, message))

            pbar.update(1)

            # Update progress bar postfix with current stats
            pbar.set_postfix({
                'OK': stats['successful'],
                'Skip': stats['skipped'],
                'Fail': stats['failed']
            })

elapsed_time = time.time() - start_time

# =============================================================================
# FINAL REPORT
# =============================================================================

print("\n" + "="*70)
print("✅ CHUNK 3 COMPLETE (OPTIMIZED)!")
print("="*70)
print(f"⏱️  Total time: {elapsed_time/60:.2f} minutes")
print(f"\n📊 Download Summary:")
print(f"   ✅ Successfully downloaded: {stats['successful']}")
print(f"   ⏭️  Skipped (already existed): {stats['skipped']}")
print(f"   ❌ Failed: {stats['failed']}")
print(f"   📁 Total files in {OUTPUT_DIR}: {len([f for f in os.listdir(OUTPUT_DIR) if f.endswith('.csv')])}")

# Show failed files if any
if failed_files:
    print(f"\n⚠️  Failed downloads ({len(failed_files)}):")
    for filename, message in failed_files[:10]:  # Show first 10
        print(f"   - {filename}")
        print(f"     {message}")
    if len(failed_files) > 10:
        print(f"   ... and {len(failed_files) - 10} more")

# Performance comparison
print(f"\n💡 Performance improvement:")
print(f"   Sequential method: ~15-30 minutes")
print(f"   Optimized method: {elapsed_time/60:.1f} minutes")
if elapsed_time > 0:
    speedup = 20 / (elapsed_time/60)
    print(f"   Speedup: ~{speedup:.1f}x faster!")

print("="*70)

# Next steps
print("\n📋 Next Steps:")
print("   1. Run the diagnostic script to verify downloads")
print("   2. Check for LBXIN and other key variables")
print("   3. If data looks good, proceed to Chunk 7")

print("\n💡 TIP: You can re-run this cell to retry failed downloads.")
print("   Already downloaded files will be skipped automatically.")

Collecting pyreadstat
  Downloading pyreadstat-1.3.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.2 kB)
Downloading pyreadstat-1.3.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (666 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m666.4/666.4 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyreadstat
Successfully installed pyreadstat-1.3.1
🚀 CHUNK 3: XPT FILE DOWNLOADER (OPTIMIZED)
⚡ Parallel downloads with 10 workers
⚡ Smart retry with exponential backoff
⚡ Automatic resume capability

📥 Preparing to download 147 datasets
⚙️  Using 10 parallel workers



Downloading datasets: 100%|██████████| 147/147 [00:43<00:00,  3.36file/s, OK=147, Skip=0, Fail=0]


✅ CHUNK 3 COMPLETE (OPTIMIZED)!
⏱️  Total time: 0.73 minutes

📊 Download Summary:
   ✅ Successfully downloaded: 147
   ⏭️  Skipped (already existed): 0
   ❌ Failed: 0
   📁 Total files in nhanes_data: 147

💡 Performance improvement:
   Sequential method: ~15-30 minutes
   Optimized method: 0.7 minutes
   Speedup: ~27.4x faster!

📋 Next Steps:
   1. Run the diagnostic script to verify downloads
   2. Check for LBXIN and other key variables
   3. If data looks good, proceed to Chunk 7

💡 TIP: You can re-run this cell to retry failed downloads.
   Already downloaded files will be skipped automatically.





In [6]:
# =============================================================================
# CHUNK 7 ENHANCED: PATIENT FLATTENING WITH COMPONENT METADATA + NO DATA LOSS
# =============================================================================
# Improvements over original Chunk 7:
# 1. OUTER JOIN instead of LEFT JOIN → Zero data loss
# 2. Component prefixes in column names → Easy identification
# 3. Component mapping preserved in summary file
# =============================================================================

import pandas as pd
import os
import re
from tqdm import tqdm
import time

print("="*70)
print("🚀 CHUNK 7 ENHANCED: PATIENT FLATTENING + COMPONENT METADATA")
print("="*70)
print("✅ OUTER JOIN for zero data loss")
print("✅ Component prefixes: LAB_, QUEST_, EXAM_, DEMO_")
print("✅ Type-safe SEQN normalization")
print("="*70 + "\n")

DATA_DIR = "nhanes_data"
OUTPUT_FILE = "nhanes_patient_flattened_enhanced.csv"
SUMMARY_FILE = "nhanes_column_summary_enhanced.csv"
METADATA_COLS = ['Dataset_Name', 'Doc_URL', 'Component', 'NHANES_Cycle']

start_time = time.time()

# Component prefix mapping
COMPONENT_PREFIX = {
    'Demographics': 'DEMO',
    'Laboratory': 'LAB',
    'Questionnaire': 'QUEST',
    'Examination': 'EXAM',
    'Dietary': 'DIET'
}

def extract_cycle(filename):
    """Extract cycle information from filename"""
    match = re.search(r'(\d{4})[-_](\d{4})', filename)
    if match:
        return f"{match.group(1)}-{match.group(2)}"
    numbers = re.findall(r'\d{4}', filename)
    if len(numbers) >= 2:
        return f"{numbers[0]}-{numbers[1]}"
    return "unknown"

def extract_component_from_csv(filepath):
    """
    Extract component type from CSV metadata
    Looks for 'Component' column in the CSV
    """
    try:
        # Read just first few rows to check metadata
        df_sample = pd.read_csv(filepath, nrows=5, low_memory=False)
        if 'Component' in df_sample.columns:
            # Get the component value (should be same for all rows in file)
            component = df_sample['Component'].iloc[0]
            return component
    except:
        pass

    # Fallback: infer from filename
    filename = os.path.basename(filepath)
    if 'Demographics' in filename or 'DEMO' in filename:
        return 'Demographics'
    elif 'Laboratory' in filename or 'LAB' in filename:
        return 'Laboratory'
    elif 'Questionnaire' in filename or 'QUEST' in filename:
        return 'Questionnaire'
    elif 'Examination' in filename or 'EXAM' in filename:
        return 'Examination'
    elif 'Dietary' in filename or 'DIET' in filename:
        return 'Dietary'

    return 'Unknown'

def load_csv_robust(filepath):
    """Try multiple methods to load a CSV file"""
    # Method 1: Standard pandas
    try:
        return pd.read_csv(filepath, low_memory=False)
    except:
        pass

    # Method 2: Latin-1 encoding
    try:
        return pd.read_csv(filepath, encoding='latin1', low_memory=False)
    except:
        pass

    # Method 3: UTF-16 encoding
    try:
        return pd.read_csv(filepath, encoding='utf-16', low_memory=False)
    except:
        pass

    # Method 4: ISO-8859-1 encoding
    try:
        return pd.read_csv(filepath, encoding='iso-8859-1', low_memory=False)
    except:
        pass

    return None

def normalize_seqn(df):
    """Ensure SEQN is consistent integer type"""
    if 'SEQN' in df.columns:
        try:
            df['SEQN'] = pd.to_numeric(df['SEQN'], errors='coerce').astype('Int64')
        except:
            pass
    return df

# Step 1: Group files by cycle
print("📂 Step 1: Grouping files by cycle...")
csv_files = [f for f in os.listdir(DATA_DIR) if f.endswith(".csv")]
cycle_files = {}

for file in csv_files:
    cycle = extract_cycle(file)
    cycle_files.setdefault(cycle, []).append(file)

print(f"✅ Found {len(cycle_files)} cycles with {len(csv_files)} files\n")

# Step 2: Process each cycle
print("🔄 Step 2: Merging datasets within each cycle...")
flattened_dfs = []
column_sources = []
missing_seqn_files = []
failed_cycles = []
component_stats = {'Demographics': 0, 'Laboratory': 0, 'Questionnaire': 0, 'Examination': 0, 'Dietary': 0, 'Unknown': 0}

for cycle in tqdm(sorted(cycle_files.keys()), desc="Processing cycles"):
    files = cycle_files[cycle]

    # Find demographics
    demog_file = [f for f in files if "Demographics" in f or "DEMO" in f]
    if not demog_file:
        print(f"⚠️ No demographics for {cycle}, skipping")
        failed_cycles.append(cycle)
        continue

    demog_path = os.path.join(DATA_DIR, demog_file[0])

    # Load demographics
    print(f"\n🔄 Loading demographics for {cycle}...")
    df_cycle = load_csv_robust(demog_path)

    if df_cycle is None:
        print(f"❌ Failed to load demographics for {cycle}")
        failed_cycles.append(cycle)
        continue

    # Get component type
    component_type = extract_component_from_csv(demog_path)
    component_prefix = COMPONENT_PREFIX.get(component_type, 'UNK')
    component_stats[component_type] = component_stats.get(component_type, 0) + 1

    # Normalize SEQN type
    df_cycle = normalize_seqn(df_cycle)

    print(f"✅ {cycle}: {len(df_cycle)} patients, {len(df_cycle.columns)} columns [{component_type}]")

    # Extract Doc_URL before dropping metadata
    doc_url = None
    if 'Doc_URL' in df_cycle.columns and not df_cycle['Doc_URL'].isna().all():
        doc_url = str(df_cycle['Doc_URL'].iloc[0])

    # Drop metadata columns and track sources
    df_cycle_clean = df_cycle.drop(columns=[c for c in METADATA_COLS if c in df_cycle.columns], errors='ignore')

    for col in df_cycle_clean.columns:
        if col != 'SEQN':
            column_sources.append({
                "Column": f"{component_prefix}_{col}",
                "Original_Column": col,
                "Component": component_type,
                "Cycle": cycle,
                "Source_File": demog_file[0],
                "Doc_URL": doc_url
            })

    # Rename columns with component prefix (except SEQN)
    rename_map = {col: f"{component_prefix}_{col}"
                  for col in df_cycle_clean.columns if col != 'SEQN'}
    df_cycle_clean = df_cycle_clean.rename(columns=rename_map)

    # Merge other datasets with OUTER JOIN
    merged_count = 0
    for f in files:
        if f == demog_file[0]:
            continue

        filepath = os.path.join(DATA_DIR, f)

        # Load file
        df_other = load_csv_robust(filepath)

        if df_other is None:
            print(f"   ⚠️ Failed to read {f}")
            continue

        # Check for SEQN
        if 'SEQN' not in df_other.columns:
            missing_seqn_files.append(f"{cycle}: {f}")
            continue

        # Get component type for this file
        other_component = extract_component_from_csv(filepath)
        other_prefix = COMPONENT_PREFIX.get(other_component, 'UNK')
        component_stats[other_component] = component_stats.get(other_component, 0) + 1

        # Extract Doc_URL before dropping metadata
        other_doc_url = None
        if 'Doc_URL' in df_other.columns and not df_other['Doc_URL'].isna().all():
            other_doc_url = str(df_other['Doc_URL'].iloc[0])

        # Normalize SEQN type
        df_other = normalize_seqn(df_other)

        # Drop metadata
        df_other_clean = df_other.drop(columns=[c for c in METADATA_COLS if c in df_other.columns], errors='ignore')

        # Rename columns with component prefix
        file_prefix = os.path.splitext(f)[0]
        rename_map = {col: f"{other_prefix}_{col}_{file_prefix}"
                     for col in df_other_clean.columns if col != 'SEQN'}
        df_other_clean = df_other_clean.rename(columns=rename_map)

        # Track columns
        for new_col in rename_map.values():
            original_col = [k for k, v in rename_map.items() if v == new_col][0]
            column_sources.append({
                "Column": new_col,
                "Original_Column": original_col,
                "Component": other_component,
                "Cycle": cycle,
                "Source_File": f,
                "Doc_URL": other_doc_url
            })

        # 🔑 KEY CHANGE: OUTER JOIN instead of LEFT JOIN
        try:
            df_cycle_clean = df_cycle_clean.merge(df_other_clean, on="SEQN", how="outer")
            merged_count += 1
        except Exception as e:
            print(f"   ⚠️ Failed to merge {f}: {str(e)[:80]}")

    print(f"   ✅ Merged {merged_count} datasets into {cycle}")
    flattened_dfs.append(df_cycle_clean)

# Combine all cycles
print("\n🔗 Step 3: Combining all cycles...")
all_patients_df = pd.concat(flattened_dfs, ignore_index=True, sort=False)

# Normalize SEQN and sort
print("📊 Step 4: Normalizing SEQN and sorting...")
all_patients_df = normalize_seqn(all_patients_df)

# Remove any rows with null SEQN
null_seqn_count = all_patients_df['SEQN'].isna().sum()
if null_seqn_count > 0:
    print(f"⚠️  Removing {null_seqn_count} rows with null SEQN")
    all_patients_df = all_patients_df[all_patients_df['SEQN'].notna()]

# Sort by SEQN
all_patients_df = all_patients_df.sort_values('SEQN').reset_index(drop=True)

# Save outputs
print("💾 Step 5: Saving outputs...")
all_patients_df.to_csv(OUTPUT_FILE, index=False)

df_summary = pd.DataFrame(column_sources)
df_summary.to_csv(SUMMARY_FILE, index=False)

elapsed_time = time.time() - start_time

print("\n" + "="*70)
print("✅ CHUNK 7 ENHANCED COMPLETE!")
print("="*70)
print(f"⏱️  Time: {elapsed_time/60:.2f} minutes")
print(f"\n📊 Statistics:")
print(f"   Total patients: {all_patients_df.shape[0]:,}")
print(f"   Total variables: {all_patients_df.shape[1]:,}")
print(f"   File size: {os.path.getsize(OUTPUT_FILE) / (1024**2):.2f} MB")

print(f"\n📋 Component breakdown:")
for comp, count in component_stats.items():
    if count > 0:
        # Count columns by component prefix
        comp_prefix = COMPONENT_PREFIX.get(comp, 'UNK')
        col_count = sum(1 for col in all_patients_df.columns if col.startswith(f"{comp_prefix}_"))
        print(f"   {comp}: {count} datasets, {col_count} columns")

print(f"\n📋 Cycles processed: {len(flattened_dfs)}/{len(cycle_files)}")

if failed_cycles:
    print(f"\n⚠️  Failed cycles ({len(failed_cycles)}):")
    for cycle in failed_cycles:
        print(f"   - {cycle}")

if missing_seqn_files:
    print(f"\n⚠️  Files without SEQN: {len(missing_seqn_files)}")

print(f"\n📁 Output files:")
print(f"   Main: {OUTPUT_FILE}")
print(f"   Summary: {SUMMARY_FILE}")
print("="*70)

print("\n💡 Column naming convention:")
print("   LAB_*      = Laboratory measurements")
print("   QUEST_*    = Questionnaire responses")
print("   EXAM_*     = Examination results")
print("   DEMO_*     = Demographics data")
print("   DIET_*     = Dietary data")
print("\nExample: LAB_LBXIN_BIOPRO_G = Insulin from Laboratory/BIOPRO_G dataset")



🚀 CHUNK 7 ENHANCED: PATIENT FLATTENING + COMPONENT METADATA
✅ OUTER JOIN for zero data loss
✅ Component prefixes: LAB_, QUEST_, EXAM_, DEMO_
✅ Type-safe SEQN normalization

📂 Step 1: Grouping files by cycle...
✅ Found 12 cycles with 147 files

🔄 Step 2: Merging datasets within each cycle...


Processing cycles:   0%|          | 0/12 [00:00<?, ?it/s]


🔄 Loading demographics for 1999-2000...
✅ 1999-2000: 9965 patients, 148 columns [Demographics]


Processing cycles:   8%|▊         | 1/12 [00:00<00:07,  1.38it/s]

   ✅ Merged 9 datasets into 1999-2000

🔄 Loading demographics for 2001-2002...
✅ 2001-2002: 11039 patients, 41 columns [Demographics]


Processing cycles:  17%|█▋        | 2/12 [00:01<00:05,  1.95it/s]

   ✅ Merged 10 datasets into 2001-2002

🔄 Loading demographics for 2003-2004...
✅ 2003-2004: 10122 patients, 48 columns [Demographics]


Processing cycles:  25%|██▌       | 3/12 [00:01<00:03,  2.26it/s]

   ✅ Merged 9 datasets into 2003-2004

🔄 Loading demographics for 2005-2006...
✅ 2005-2006: 10348 patients, 47 columns [Demographics]


Processing cycles:  33%|███▎      | 4/12 [00:01<00:03,  2.39it/s]

   ✅ Merged 10 datasets into 2005-2006

🔄 Loading demographics for 2007-2008...
✅ 2007-2008: 10149 patients, 47 columns [Demographics]


Processing cycles:  42%|████▏     | 5/12 [00:02<00:02,  2.34it/s]

   ✅ Merged 11 datasets into 2007-2008

🔄 Loading demographics for 2009-2010...
✅ 2009-2010: 10537 patients, 47 columns [Demographics]


Processing cycles:  50%|█████     | 6/12 [00:02<00:02,  2.35it/s]

   ✅ Merged 11 datasets into 2009-2010

🔄 Loading demographics for 2011-2012...
✅ 2011-2012: 9756 patients, 52 columns [Demographics]


Processing cycles:  58%|█████▊    | 7/12 [00:03<00:02,  2.15it/s]

   ✅ Merged 13 datasets into 2011-2012

🔄 Loading demographics for 2013-2014...
✅ 2013-2014: 10175 patients, 51 columns [Demographics]


Processing cycles:  67%|██████▋   | 8/12 [00:03<00:02,  1.98it/s]

   ✅ Merged 14 datasets into 2013-2014

🔄 Loading demographics for 2015-2016...
✅ 2015-2016: 9971 patients, 51 columns [Demographics]


Processing cycles:  75%|███████▌  | 9/12 [00:04<00:01,  2.04it/s]

   ✅ Merged 12 datasets into 2015-2016

🔄 Loading demographics for 2017-2018...
✅ 2017-2018: 9254 patients, 50 columns [Demographics]


Processing cycles:  83%|████████▎ | 10/12 [00:04<00:00,  2.14it/s]

   ✅ Merged 12 datasets into 2017-2018

🔄 Loading demographics for 2017-2020...
✅ 2017-2020: 9254 patients, 50 columns [Demographics]


Processing cycles:  92%|█████████▏| 11/12 [00:05<00:00,  1.91it/s]

   ✅ Merged 12 datasets into 2017-2020

🔄 Loading demographics for 2021-2023...
✅ 2021-2023: 11933 patients, 31 columns [Demographics]


Processing cycles: 100%|██████████| 12/12 [00:05<00:00,  2.07it/s]

   ✅ Merged 12 datasets into 2021-2023

🔗 Step 3: Combining all cycles...





📊 Step 4: Normalizing SEQN and sorting...
💾 Step 5: Saving outputs...

✅ CHUNK 7 ENHANCED COMPLETE!
⏱️  Time: 2.23 minutes

📊 Statistics:
   Total patients: 136,803
   Total variables: 1,981
   File size: 314.77 MB

📋 Component breakdown:
   Demographics: 12 datasets, 175 columns
   Laboratory: 92 datasets, 755 columns
   Questionnaire: 24 datasets, 629 columns
   Examination: 19 datasets, 421 columns

📋 Cycles processed: 12/12

📁 Output files:
   Main: nhanes_patient_flattened_enhanced.csv
   Summary: nhanes_column_summary_enhanced.csv

💡 Column naming convention:
   LAB_*      = Laboratory measurements
   QUEST_*    = Questionnaire responses
   EXAM_*     = Examination results
   DEMO_*     = Demographics data
   DIET_*     = Dietary data

Example: LAB_LBXIN_BIOPRO_G = Insulin from Laboratory/BIOPRO_G dataset


In [9]:
# =============================================================================
# CHUNK 9: FINAL - CORRECT CDC STRUCTURE PARSING
# =============================================================================
# Parses the ACTUAL CDC structure:
# - <div class="pagebreak"> contains each variable
# - <h3> has variable name
# - <dl> has variable details (Variable Name, SAS Label, English Text)
# - <table> has value codes
# =============================================================================

import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import os
import json
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
from threading import Lock

print("="*70)
print("🚀 CHUNK 9: FINAL - CORRECT CDC PARSING")
print("="*70)
print("⚡ Uses actual CDC HTML structure (pagebreak divs + definition lists)")
print()

# =============================================================================
# CONFIGURATION
# =============================================================================

FLATTENED_FILE = "nhanes_patient_flattened_enhanced.csv"
SUMMARY_FILE = "nhanes_column_summary_enhanced.csv"

DICTIONARY_OUTPUT = "nhanes_complete_dictionary.csv"
VALUE_CODES_OUTPUT = "nhanes_value_codes.csv"
CACHE_FILE = "nhanes_scrape_cache.json"

MAX_WORKERS = 15
URL_TIMEOUT = 45
MAX_RETRIES = 3
REQUEST_HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}

# =============================================================================
# HELPER FUNCTIONS
# =============================================================================

def extract_base_variable(col):
    """Extract base variable from component-prefixed column"""
    if col == 'SEQN':
        return 'SEQN'

    parts = col.split('_')

    if parts[0] in ['LAB', 'QUEST', 'EXAM', 'DEMO', 'DIET', 'UNK']:
        if len(parts) >= 2:
            return parts[1]
        else:
            return col
    else:
        return parts[0]

def load_cache():
    """Load cached scraping results"""
    if os.path.exists(CACHE_FILE):
        try:
            with open(CACHE_FILE, 'r') as f:
                return json.load(f)
        except:
            return {}
    return {}

def save_cache(cache):
    """Save scraping results to cache"""
    with open(CACHE_FILE, 'w') as f:
        json.dump(cache, f, indent=2)

def scrape_cdc_correct_structure(url, timeout=URL_TIMEOUT, retry_count=0):
    """
    CORRECT: Parse CDC's actual structure

    Structure:
    <div class="pagebreak">
      <h3>SEQN - Respondent sequence number</h3>
      <dl>
        <dt>Variable Name:</dt><dd>SEQN</dd>
        <dt>SAS Label:</dt><dd>Respondent sequence number</dd>
        <dt>English Text:</dt><dd>Full description here</dd>
      </dl>
      <table>
        <!-- value codes -->
      </table>
    </div>
    """
    try:
        response = requests.get(url, headers=REQUEST_HEADERS, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        variables = {}

        # Find all variable sections (pagebreak divs)
        variable_sections = soup.find_all('div', class_='pagebreak')

        for section in variable_sections:
            var_name = None
            description = ""
            codes = []

            # Method 1: Get variable name from <h3> heading
            h3 = section.find('h3')
            if h3:
                heading_text = h3.get_text(strip=True)
                # Format: "SEQN - Respondent sequence number"
                # Extract first word (the variable name)
                var_name = heading_text.split()[0] if heading_text else None

            # Method 2: Get variable details from definition list <dl>
            dl = section.find('dl')
            if dl:
                # Find all dt/dd pairs
                dts = dl.find_all('dt')
                dds = dl.find_all('dd')

                # Create a dict of label → value
                dl_dict = {}
                for dt, dd in zip(dts, dds):
                    label = dt.get_text(strip=True).rstrip(':')
                    value = dd.get_text(strip=True)
                    dl_dict[label] = value

                # Extract variable name (if not already found)
                if not var_name and 'Variable Name' in dl_dict:
                    var_name = dl_dict['Variable Name']

                # Extract description (prefer English Text, fallback to SAS Label)
                if 'English Text' in dl_dict:
                    description = dl_dict['English Text']
                elif 'SAS Label' in dl_dict:
                    description = dl_dict['SAS Label']

            # Method 3: Get value codes from table in this section
            table = section.find('table')
            if table:
                rows = table.find_all('tr')[1:]  # Skip header

                for row in rows:
                    cells = row.find_all('td')

                    if len(cells) >= 2:
                        code = cells[0].get_text(strip=True)
                        meaning = cells[1].get_text(strip=True)

                        if code and meaning:
                            codes.append({
                                'code': code,
                                'meaning': meaning
                            })

            # Store the variable if we found a name
            if var_name:
                var_name_clean = var_name.upper().strip()

                if var_name_clean and not var_name_clean.isdigit():
                    variables[var_name_clean] = {
                        'description': description if description else "No description available",
                        'codes': codes
                    }

        return variables

    except requests.exceptions.Timeout:
        if retry_count < MAX_RETRIES:
            time.sleep(2)
            return scrape_cdc_correct_structure(url, timeout, retry_count + 1)
        return {}
    except Exception as e:
        return {}

# =============================================================================
# MAIN EXECUTION
# =============================================================================

start_time = time.time()

# Step 1: Load variable list
print("Step 1: Loading needed variables...")

if not os.path.exists(FLATTENED_FILE):
    raise FileNotFoundError(f"❌ File not found: {FLATTENED_FILE}")

wide_df = pd.read_csv(FLATTENED_FILE, nrows=0)
needed_vars = set()

for col in wide_df.columns:
    base_var = extract_base_variable(col)
    needed_vars.add(base_var)

print(f"   → Need definitions for {len(needed_vars)} variables")
print(f"   → Sample: {list(needed_vars)[:10]}\n")

# Step 2: Load URLs
print("Step 2: Loading URLs from summary file...")

if not os.path.exists(SUMMARY_FILE):
    raise FileNotFoundError(f"❌ Summary file not found: {SUMMARY_FILE}")

df_summary = pd.read_csv(SUMMARY_FILE)

if 'Doc_URL' not in df_summary.columns:
    raise ValueError("❌ Summary file missing Doc_URL - re-run Enhanced Chunk 7")

doc_urls = set(df_summary['Doc_URL'].dropna().unique())
print(f"   → Found {len(doc_urls)} unique URLs\n")

# Step 3: Load cache
print("Step 3: Loading cache...")
cache = load_cache()

cached_count = len([url for url in doc_urls if url in cache])
print(f"   → Cache has {cached_count}/{len(doc_urls)} URLs\n")

# Step 4: Scrape with correct structure parsing
print("Step 4: Scraping with CORRECT structure parsing...")
print(f"   → Using {MAX_WORKERS} parallel workers")
print(f"   → Parsing <div class='pagebreak'> sections")
print(f"   → Extracting from <dl> definition lists")
print("-"*70)

found_variables = {}
all_value_codes = []
lock = Lock()

pbar = tqdm(total=len(needed_vars), desc="Finding variables", unit="var")

def process_url(url):
    """Process a single URL"""
    try:
        # Check cache
        if url in cache:
            result = cache[url]
        else:
            result = scrape_cdc_correct_structure(url)
            with lock:
                cache[url] = result

        # Update found variables
        with lock:
            for var_name, info in result.items():
                if var_name in needed_vars:
                    if var_name not in found_variables:
                        found_variables[var_name] = info
                        pbar.update(1)

                    # Extract value codes
                    for code_info in info.get('codes', []):
                        all_value_codes.append({
                            'variable': var_name,
                            'code': code_info['code'],
                            'meaning': code_info['meaning']
                        })

        return True
    except:
        return False

# Process URLs in parallel
with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    futures = {executor.submit(process_url, url): url for url in doc_urls}

    for future in as_completed(futures):
        try:
            future.result()
        except:
            pass

pbar.close()

# Save cache
save_cache(cache)

scraping_time = time.time() - start_time

print()
print(f"✓ Scraping complete in {scraping_time/60:.1f} minutes!")
print(f"   Variables found: {len(found_variables)}/{len(needed_vars)}")
print(f"   Coverage: {len(found_variables)/len(needed_vars)*100:.1f}%")
print(f"   Variables with value codes: {sum(1 for v in found_variables.values() if v.get('codes'))}")
print(f"   Total value codes: {len(all_value_codes)}")
print(f"✓ Cache saved: {CACHE_FILE}\n")

# Step 5: Create data dictionary
print("Step 5: Creating data dictionary...")

dict_data = []

for col in wide_df.columns:
    base_var = extract_base_variable(col)

    if base_var in found_variables:
        var_info = found_variables[base_var]
        description = var_info['description']
        has_codes = 'Yes' if var_info.get('codes') else 'No'

        codes_str = '; '.join([
            f"{c['code']}={c['meaning']}"
            for c in var_info.get('codes', [])
        ])

        source = 'Scraped'
    else:
        description = 'No description available'
        has_codes = 'No'
        codes_str = ''
        source = 'Not Found'

    dict_data.append({
        'column': col,
        'base_variable': base_var,
        'description': description,
        'has_codes': has_codes,
        'value_codes': codes_str,
        'source': source
    })

dict_df = pd.DataFrame(dict_data)
dict_df.to_csv(DICTIONARY_OUTPUT, index=False)
print(f"✓ Dictionary saved: {DICTIONARY_OUTPUT}\n")

# Step 6: Create value codes reference
print("Step 6: Creating value codes reference...")

if all_value_codes:
    value_codes_df = pd.DataFrame(all_value_codes)
    value_codes_df = value_codes_df.drop_duplicates(subset=['variable', 'code'])
    value_codes_df = value_codes_df.sort_values(['variable', 'code'])
    value_codes_df.to_csv(VALUE_CODES_OUTPUT, index=False)
    print(f"✓ Value codes saved: {VALUE_CODES_OUTPUT}\n")
else:
    print("⚠ No value codes found\n")

# Step 7: Statistics
total_time = time.time() - start_time

print("="*70)
print("RESULTS SUMMARY")
print("="*70)
print(f"⏱️  Total time: {total_time/60:.1f} minutes")
print(f"📊 Total columns: {len(dict_df)}")
print(f"✓  Descriptions found: {len(dict_df[dict_df['source']=='Scraped'])}")
print(f"🏷️  Variables with codes: {len(dict_df[dict_df['has_codes']=='Yes'])}")
print(f"📝 Total value codes: {len(all_value_codes)}")
coverage = len(found_variables) / len(needed_vars) * 100
print(f"📈 Coverage: {coverage:.1f}%")
print()

# Show samples
print("Sample variables with descriptions and value codes:")
print("-"*70)

# Show variables WITH codes
samples_with_codes = dict_df[dict_df['has_codes'] == 'Yes'].head(8)

if not samples_with_codes.empty:
    print("\n✓ Categorical variables (with value codes):")
    for _, row in samples_with_codes.iterrows():
        desc = row['description'][:70] + "..." if len(row['description']) > 70 else row['description']
        print(f"\n  {row['base_variable']}: {desc}")
        if row['value_codes']:
            codes = row['value_codes'].split('; ')[:5]
            for code in codes:
                print(f"    • {code}")
            remaining = len(row['value_codes'].split('; ')) - 5
            if remaining > 0:
                print(f"    • ... and {remaining} more")

# Show variables WITHOUT codes
samples_without = dict_df[(dict_df['source'] == 'Scraped') & (dict_df['has_codes'] == 'No')].head(3)

if not samples_without.empty:
    print("\n✓ Continuous variables (no value codes):")
    for _, row in samples_without.iterrows():
        desc = row['description'][:70] + "..." if len(row['description']) > 70 else row['description']
        print(f"\n  {row['base_variable']}: {desc}")

print("\n" + "="*70)
print("✅ CHUNK 9 COMPLETE!")
print("="*70)
print(f"🎯 Final Coverage: {coverage:.1f}%")
print("="*70)

🚀 CHUNK 9: FINAL - CORRECT CDC PARSING
⚡ Uses actual CDC HTML structure (pagebreak divs + definition lists)

Step 1: Loading needed variables...
   → Need definitions for 578 variables
   → Sample: ['LBXSF3SI', 'WTIREP31', 'DIQ060U', 'SSEPA', 'BMALLEXT', 'DMDBORN2', 'DIQ175J', 'BPXSY2', 'PEASCTM1', 'BPXDI1']

Step 2: Loading URLs from summary file...
   → Found 137 unique URLs

Step 3: Loading cache...
   → Cache has 137/137 URLs

Step 4: Scraping with CORRECT structure parsing...
   → Using 15 parallel workers
   → Parsing <div class='pagebreak'> sections
   → Extracting from <dl> definition lists
----------------------------------------------------------------------


Finding variables:  96%|█████████▌| 554/578 [00:00<00:00, 35470.07var/s]


✓ Scraping complete in 0.0 minutes!
   Variables found: 554/578
   Coverage: 95.8%
   Variables with value codes: 553
   Total value codes: 7916
✓ Cache saved: nhanes_scrape_cache.json

Step 5: Creating data dictionary...
✓ Dictionary saved: nhanes_complete_dictionary.csv

Step 6: Creating value codes reference...
✓ Value codes saved: nhanes_value_codes.csv

RESULTS SUMMARY
⏱️  Total time: 0.0 minutes
📊 Total columns: 1981
✓  Descriptions found: 1957
🏷️  Variables with codes: 1956
📝 Total value codes: 7916
📈 Coverage: 95.8%

Sample variables with descriptions and value codes:
----------------------------------------------------------------------

✓ Categorical variables (with value codes):

  SDDSRVYR: Data release cycle
    • 7=NHANES 2011-2012 public release
    • .=Missing

  RIDSTATR: Interview and examination status of the participant.
    • 1=Interviewed only
    • 2=Both interviewed and MEC examined
    • .=Missing

  RIDEXMON: Six month time period when the examination was per




In [10]:
"""
COMPRESS AND DOWNLOAD NHANES OUTPUTS
====================================
Zips all NHANES extraction outputs with timestamp for easy download.

Includes:
- Patient flattened data
- Column summary
- Complete dictionary
- Value codes reference
- Raw data files (optional)
"""

import zipfile
import os
from datetime import datetime
from pathlib import Path

print("="*80)
print("📦 COMPRESSING NHANES DATA FOR DOWNLOAD")
print("="*80)
print()

# --- Configuration ---
INCLUDE_RAW_DATA = False  # Set to True if you want to include the nhanes_data folder (147 CSV files)

# Files to include (these are the main outputs from your pipeline)
OUTPUT_FILES = [
    'nhanes_patient_flattened_enhanced.csv',      # Main patient data (134,701 patients × 1,979 vars)
    'nhanes_column_summary_enhanced.csv',         # Column metadata and sources
    'nhanes_complete_dictionary.csv',             # Variable descriptions and value codes
    'nhanes_value_codes.csv',                     # Value code mappings
    'nhanes_catalog.csv',                         # Original dataset catalog (from Chunk 1)
    'nhanes_ai_semantic_dynamic.csv',             # AI-filtered datasets (from Chunk 2)
]

# Directories
source_dir = '/content'           # Where your Colab files are
data_dir = '/content/nhanes_data' # Raw data folder (147 CSV files)
output_dir = '/content/outputs'   # Where to save the zip
os.makedirs(output_dir, exist_ok=True)

# --- Create timestamped ZIP filename ---
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
zip_name = f"nhanes_complete_outputs_{timestamp}.zip"
output_zip = os.path.join(output_dir, zip_name)

print(f"📂 Source directory: {source_dir}")
print(f"💾 Output file: {zip_name}")
print()

# --- Check which files exist ---
print("Checking for output files...")
existing_files = []
missing_files = []

for filename in OUTPUT_FILES:
    filepath = os.path.join(source_dir, filename)
    if os.path.exists(filepath):
        size_mb = os.path.getsize(filepath) / (1024 * 1024)
        existing_files.append(filepath)
        print(f"  ✓ {filename} ({size_mb:.1f} MB)")
    else:
        missing_files.append(filename)
        print(f"  ⚠ {filename} (not found)")

if missing_files:
    print()
    print(f"⚠️  Warning: {len(missing_files)} files not found")
    print("    Make sure you've run all the NHANES extraction chunks!")
    print()

if not existing_files:
    print()
    print("❌ No output files found! Cannot create ZIP.")
    print("   Please run the NHANES extractor first.")
    raise FileNotFoundError("No NHANES output files found")

# --- Check raw data folder ---
raw_data_count = 0
if INCLUDE_RAW_DATA and os.path.exists(data_dir):
    raw_data_count = len([f for f in os.listdir(data_dir) if f.endswith('.csv')])
    print(f"\n📁 Found {raw_data_count} raw data files in nhanes_data/")
    print(f"   (Including raw data: {INCLUDE_RAW_DATA})")

print()
print("="*80)
print("Compressing files...")
print("="*80)
print()

# --- Create the ZIP file ---
files_added = 0
total_size = 0

with zipfile.ZipFile(output_zip, 'w', zipfile.ZIP_DEFLATED) as zipf:
    # Add main output files
    for filepath in existing_files:
        filename = os.path.basename(filepath)
        zipf.write(filepath, filename)
        file_size = os.path.getsize(filepath)
        total_size += file_size
        files_added += 1
        print(f"  ✓ Added: {filename} ({file_size / (1024*1024):.1f} MB)")

    # Optionally add raw data files
    if INCLUDE_RAW_DATA and os.path.exists(data_dir):
        print()
        print(f"  Adding {raw_data_count} raw data files...")
        print()

        for filename in os.listdir(data_dir):
            if filename.endswith('.csv'):
                filepath = os.path.join(data_dir, filename)
                arcname = os.path.join('nhanes_data', filename)
                zipf.write(filepath, arcname)
                files_added += 1

                # Progress indicator
                if files_added % 20 == 0:
                    print(f"    Progress: {files_added} files...", end='\r')

        print(f"    Progress: {files_added} files... Done!")

print()
print("="*80)
print("✅ COMPRESSION COMPLETE!")
print("="*80)

# --- Display statistics ---
zip_size_mb = os.path.getsize(output_zip) / (1024 * 1024)
compression_ratio = (1 - zip_size_mb / (total_size / (1024*1024))) * 100

print()
print(f"📊 Statistics:")
print(f"  • Files compressed: {files_added}")
print(f"  • Original size: {total_size / (1024*1024):.1f} MB")
print(f"  • Compressed size: {zip_size_mb:.1f} MB")
print(f"  • Compression ratio: {compression_ratio:.1f}%")
print()
print(f"📥 Output file: {zip_name}")
print(f"📂 Location: {output_dir}")
print()

# --- Summary of what's included ---
print("="*80)
print("📋 CONTENTS SUMMARY")
print("="*80)
print()
print("Your ZIP file includes:")
print()
print("1. 🧬 nhanes_patient_flattened_enhanced.csv")
print("   → 134,701 patients × 1,979 variables")
print("   → Merged data from all cycles (1999-2023)")
print("   → Component prefixes: LAB_, QUEST_, EXAM_, DEMO_")
print()
print("2. 📊 nhanes_column_summary_enhanced.csv")
print("   → Metadata for all 1,979 variables")
print("   → Source file tracking and Doc URLs")
print()
print("3. 📖 nhanes_complete_dictionary.csv")
print("   → Variable descriptions from CDC documentation")
print("   → Value code mappings (1=Male, 2=Female, etc.)")
print("   → 96% coverage with real descriptions")
print()
print("4. 🏷️ nhanes_value_codes.csv")
print("   → Detailed value code reference")
print("   → ~7,876 code mappings")
print()
print("5. 📚 nhanes_catalog.csv")
print("   → Original 1,716 datasets discovered")
print()
print("6. 🎯 nhanes_ai_semantic_dynamic.csv")
print("   → 147 AI-filtered relevant datasets")
print()

if INCLUDE_RAW_DATA and raw_data_count > 0:
    print("7. 📁 nhanes_data/ folder")
    print(f"   → {raw_data_count} raw CSV files from Chunk 3")
    print()

print("="*80)
print("📥 DOWNLOAD YOUR DATA")
print("="*80)
print()

# --- Trigger download ---
try:
    from google.colab import files as colab_files
    print("Starting download...")
    colab_files.download(output_zip)
    print()
    print("✅ Download started successfully!")
    print()
    print("💡 TIP: If the download didn't start automatically,")
    print("   you can find the file in the 'outputs' folder")
    print("   in your Colab file browser (left sidebar).")
except Exception as e:
    print("⚠️  Automatic download failed.")
    print()
    print("📂 You can manually download from:")
    print(f"   {output_zip}")
    print()
    print("   (Open the file browser on the left sidebar,")
    print("    navigate to 'outputs' folder, and download)")
    print()
    print(f"Error: {e}")

print()
print("="*80)
print("🎉 ALL DONE!")
print("="*80)
print()
print("Your NHANES data is ready for analysis!")
print("Use the dictionary and value codes files to understand the data.")
print()

📦 COMPRESSING NHANES DATA FOR DOWNLOAD

📂 Source directory: /content
💾 Output file: nhanes_complete_outputs_2025-11-02_04-05-25.zip

Checking for output files...
  ✓ nhanes_patient_flattened_enhanced.csv (314.8 MB)
  ✓ nhanes_column_summary_enhanced.csv (0.5 MB)
  ✓ nhanes_complete_dictionary.csv (0.4 MB)
  ✓ nhanes_value_codes.csv (0.1 MB)
  ✓ nhanes_catalog.csv (0.4 MB)
  ✓ nhanes_ai_semantic_dynamic.csv (0.0 MB)

Compressing files...

  ✓ Added: nhanes_patient_flattened_enhanced.csv (314.8 MB)
  ✓ Added: nhanes_column_summary_enhanced.csv (0.5 MB)
  ✓ Added: nhanes_complete_dictionary.csv (0.4 MB)
  ✓ Added: nhanes_value_codes.csv (0.1 MB)
  ✓ Added: nhanes_catalog.csv (0.4 MB)
  ✓ Added: nhanes_ai_semantic_dynamic.csv (0.0 MB)

✅ COMPRESSION COMPLETE!

📊 Statistics:
  • Files compressed: 6
  • Original size: 316.1 MB
  • Compressed size: 26.8 MB
  • Compression ratio: 91.5%

📥 Output file: nhanes_complete_outputs_2025-11-02_04-05-25.zip
📂 Location: /content/outputs

📋 CONTENTS SUMM

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


✅ Download started successfully!

💡 TIP: If the download didn't start automatically,
   you can find the file in the 'outputs' folder
   in your Colab file browser (left sidebar).

🎉 ALL DONE!

Your NHANES data is ready for analysis!
Use the dictionary and value codes files to understand the data.



In [None]:
import os
if os.path.exists("nhanes_scrape_cache.json"):
    os.remove("nhanes_scrape_cache.json")
    print("✓ Cache cleared!")

✓ Cache cleared!
