<a href="https://colab.research.google.com/github/ConstructoDestructo/Diabetes_AI_Instrument/blob/main/CDC_NHANES_Extractor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import requests
!pip install kneed
from bs4 import BeautifulSoup
import pandas as pd
import time
from tqdm import tqdm
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from kneed import KneeLocator
import numpy as np
import matplotlib.pyplot as plt

# --- CONFIG ---
# Index page that get_available_cycles() scrapes for NHANES cycle links.
BASE_CYCLE_INDEX = "https://wwwn.cdc.gov/nchs/nhanes/continuousnhanes/default.aspx"
# Component names substituted into the data-page query string, in crawl order.
COMPONENTS = ["Demographics", "Dietary", "Examination", "Laboratory", "Questionnaire"]
# Output paths for the full crawled catalog written by crawl_nhanes().
OUTPUT_JSON = "nhanes_catalog.json"
OUTPUT_CSV = "nhanes_catalog.csv"
# Headers sent with the crawler's requests to the cycle index and data pages.
HEADERS = {"User-Agent": "Mozilla/5.0 (Data Research Bot; +https://example.com)"}
# Links whose text contains any of these substrings are not treated as cycles.
EXCLUDE = ["What We Eat in America", "Notice to Users"]

# --- Helper functions ---
def get_available_cycles(timeout=30):
    """Return the NHANES cycle labels found on the cycle index page.

    A link counts as a cycle when its href contains ``BeginYear=`` or
    ``Cycle=``, its visible text contains ``NHANES``, and the text contains
    none of the ``EXCLUDE`` substrings. Labels are returned in page order
    without duplicates.

    Args:
        timeout: Seconds to wait for the index page before requests raises
            a timeout error (without it the call could block indefinitely).

    Returns:
        A list of link-text strings; empty when the page does not answer
        with HTTP 200.
    """
    res = requests.get(BASE_CYCLE_INDEX, headers=HEADERS, timeout=timeout)
    if res.status_code != 200:
        # Same reporting style as get_component_datasets: warn and return
        # nothing rather than scrape an error page.
        print(f"⚠️ Failed to fetch NHANES cycle index (status {res.status_code})")
        return []

    soup = BeautifulSoup(res.text, "html.parser")
    cycles = []
    for link in soup.find_all("a", href=True):
        href = link["href"]
        link_text = link.text.strip()
        if "BeginYear=" not in href and "Cycle=" not in href:
            continue
        if "NHANES" not in link_text:
            continue
        if any(exclude in link_text for exclude in EXCLUDE):
            continue
        if link_text not in cycles:
            cycles.append(link_text)
    return cycles

def _first_link_url(cell):
    """Return the href of the first <a> in a table cell, or None if absent.

    Site-relative hrefs (starting with "/") are prefixed with the CDC host.
    """
    anchor = cell.find("a")
    href = anchor.get("href") if anchor is not None else None
    if href and href.startswith("/"):
        href = "https://wwwn.cdc.gov" + href
    return href


def get_component_datasets(cycle, component, timeout=30):
    """Scrape the dataset table for one component of one NHANES cycle.

    Args:
        cycle: Cycle label as returned by get_available_cycles(); it is
            placed in the query string and its four-digit years are used
            for range filtering.
        component: Component name (one of COMPONENTS).
        timeout: Seconds to wait for the page before requests raises.

    Returns:
        A list of dicts with keys ``cycle``, ``component``,
        ``dataset_name``, ``years``, ``data_url`` and ``doc_url``. Empty
        when the page does not answer with HTTP 200 or has no table with
        id ``GridView1``.
    """
    url = f"https://wwwn.cdc.gov/nchs/nhanes/search/datapage.aspx?Component={component}&Cycle={cycle}"
    res = requests.get(url, headers=HEADERS, timeout=timeout)
    if res.status_code != 200:
        print(f"⚠️ Failed to fetch {component} data for {cycle} (status {res.status_code})")
        return []

    soup = BeautifulSoup(res.text, "html.parser")
    table = soup.find("table", {"id": "GridView1"})
    if table is None:
        return []

    # Derive the cycle's year bounds from whatever 4-digit numbers the label holds.
    cycle_years = [int(year) for year in re.findall(r'\d{4}', cycle)]
    if len(cycle_years) == 2:
        cycle_start_year, cycle_end_year = cycle_years
    elif len(cycle_years) == 1:
        cycle_start_year = cycle_end_year = cycle_years[0]
    else:
        cycle_start_year = cycle_end_year = None

    records = []
    for row in table.find_all("tr")[1:]:  # skip header
        cols = row.find_all("td")
        if len(cols) < 4:
            continue
        dataset_name = cols[0].text.strip()
        years = cols[1].text.strip()

        # The link in the third cell is stored as doc_url and the one in the
        # fourth cell as data_url (the download step reads data_url as the
        # XPT file). Naming them by destination removes the need for a
        # swap when the record is built.
        doc_url = _first_link_url(cols[2])
        data_url = _first_link_url(cols[3])

        # If the first cell looks like a year range, the two text cells are
        # in the opposite order from what the names above assume.
        if re.match(r'\d{4}-\d{4}', dataset_name):
            dataset_name, years = years, dataset_name

        # Drop rows whose year range extends outside the cycle's range.
        # Rows without a YYYY-YYYY range are kept unfiltered.
        dataset_years_match = re.search(r'(\d{4})-(\d{4})', years)
        if dataset_years_match and cycle_start_year is not None and cycle_end_year is not None:
            dataset_start_year = int(dataset_years_match.group(1))
            dataset_end_year = int(dataset_years_match.group(2))
            if dataset_start_year < cycle_start_year or dataset_end_year > cycle_end_year:
                continue

        print(f"   [Found] {dataset_name} ({years})")
        records.append({
            "cycle": cycle,
            "component": component,
            "dataset_name": dataset_name,
            "years": years,
            "data_url": data_url,
            "doc_url": doc_url
        })
    return records

# --- Crawl NHANES ---
def crawl_nhanes():
    """Crawl every available cycle and component and save the catalog.

    Demographics is fetched first for each cycle, then the remaining
    COMPONENTS in their listed order. A failure on one cycle/component is
    printed and skipped so the rest of the crawl continues. The combined
    records are written to OUTPUT_JSON and OUTPUT_CSV.

    Returns:
        A DataFrame with columns ``cycle``, ``component``,
        ``dataset_name``, ``years``, ``data_url`` and ``doc_url``. The
        columns are present even when nothing was found, so downstream
        column lookups do not raise KeyError on an empty crawl.
    """
    record_columns = ["cycle", "component", "dataset_name", "years", "data_url", "doc_url"]
    all_records = []
    cycles = get_available_cycles()
    print(f"✅ Found {len(cycles)} valid NHANES cycles: {cycles}")

    # Always include Demographics first, then the rest.
    ordered_components = ["Demographics"] + [c for c in COMPONENTS if c != "Demographics"]

    for cycle in tqdm(cycles, desc="Crawling NHANES cycles"):
        print(f"\n🔹 Scanning cycle: {cycle}")

        for component in ordered_components:
            is_demographics = component == "Demographics"
            try:
                datasets = get_component_datasets(cycle, component)
                all_records.extend(datasets)
                if is_demographics and datasets:
                    print(f"   [Added] Demographics dataset for {cycle}")
                time.sleep(1)  # pause between successful page requests
            except Exception as e:
                # Best-effort crawl: report the failure and move on.
                if is_demographics:
                    print(f"⚠️ Error fetching demographics for {cycle}: {e}")
                else:
                    print(f"⚠️ Error on {cycle} - {component}: {e}")

    df = pd.DataFrame(all_records, columns=record_columns)
    df.to_json(OUTPUT_JSON, orient="records", indent=2)
    df.to_csv(OUTPUT_CSV, index=False)
    print(f"\n✅ Crawling complete! Saved {len(df)} records to {OUTPUT_JSON} and {OUTPUT_CSV}.")
    return df

# --- Run crawler ---
if __name__ == "__main__":
    df_catalog = crawl_nhanes()

# --- Separate demographics first ---
# One boolean mask drives both halves of the split so they cannot drift apart.
demographics_mask = df_catalog["component"] == "Demographics"
df_demographics = df_catalog.loc[demographics_mask].copy()
df_non_demographics = df_catalog.loc[~demographics_mask].copy()

print(f"🔹 Demographics datasets preserved: {len(df_demographics)}")
print(f"🔹 Non-demographics datasets to filter: {len(df_non_demographics)}")

# --- Step 1: AI Filter: Biomedical Expansion on non-demographics datasets ---
# Seed terms for the relevance filter. Each seed is looked up (lowercased) in
# BIOMED_SYNONYMS by expand_biomedical_keywords(); seeds with no entry there
# are kept as-is without expansion.
SEED_KEYWORDS = [
    "diabetes", "prediabetes", "glucose", "fasting glucose", "hba1c",
    "hemoglobin a1c", "insulin", "c-peptide", "triglyceride", "cholesterol",
    "hdl", "ldl", "body mass index", "bmi", "waist", "weight", "obesity",
    "hypertension"
]

# Maps a lowercase seed keyword to the related phrases that are added to the
# keyword list when that seed is present.
BIOMED_SYNONYMS = {
    "diabetes": ["type 2 diabetes", "type 1 diabetes", "hyperglycemia", "impaired glucose tolerance", "diabetic", "glucose intolerance"],
    "prediabetes": ["impaired fasting glucose", "impaired glucose tolerance", "borderline diabetes"],
    "glucose": ["blood sugar", "serum glucose", "fasting glucose", "plasma glucose"],
    "hba1c": ["hemoglobin a1c", "glycated hemoglobin", "glycohemoglobin", "a1c"],
    "insulin": ["fasting insulin", "serum insulin", "insulin resistance", "c-peptide"],
    "cholesterol": ["hdl", "ldl", "triglyceride", "lipids", "total cholesterol"],
    "obesity": ["overweight", "adiposity", "body mass index", "bmi", "waist circumference"],
    "hypertension": ["high blood pressure", "systolic", "diastolic"],
    "bmi": ["body mass index", "obesity indicator"],
    "triglyceride": ["blood lipids", "fatty acids", "serum triglycerides"],
    "hdl": ["good cholesterol"],
    "ldl": ["bad cholesterol"],
}

def expand_biomedical_keywords(seed_keywords, synonym_map):
    """Return the seed keywords plus their mapped synonyms, without duplicates.

    The result order is deterministic: seeds first in their given order,
    then each seed's synonyms in map order, skipping anything already
    present. (A set-based version returns a hash-dependent order, which
    changes any order-sensitive text built from the list between runs.)

    Args:
        seed_keywords: Iterable of keyword strings.
        synonym_map: Dict mapping a lowercase keyword to a list of synonyms.
            Seeds are lowercased for the lookup only; they are returned in
            their original case.

    Returns:
        A list of unique keyword strings.
    """
    seeds = list(seed_keywords)
    # dict keys give insertion-ordered de-duplication.
    expanded = dict.fromkeys(seeds)
    for kw in seeds:
        expanded.update(dict.fromkeys(synonym_map.get(kw.lower(), ())))
    return list(expanded)

expanded_keywords = expand_biomedical_keywords(SEED_KEYWORDS, BIOMED_SYNONYMS)
print(f"✅ Biomedical expansion complete: {len(expanded_keywords)} keywords")

# Generic words that must never be promoted to keyword candidates.
NOISE_WORDS = {
    "exam", "second", "file", "test", "data", "sheet", "survey",
    "questionnaire", "study", "participant", "demographics", "sample",
    "component"
}
# One text per non-demographics dataset: its name followed by its years.
dataset_texts = (df_non_demographics["dataset_name"] + " " + df_non_demographics["years"]).tolist()

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


def _top_tfidf_terms(corpus_texts, query_doc, top_n=50):
    """Return the highest-weighted unigrams of query_doc, best first.

    TF-IDF is fitted on corpus_texts plus query_doc; the terms of the
    query_doc row are ranked by weight. Terms in NOISE_WORDS are dropped,
    and so are terms with zero weight: those do not occur in query_doc at
    all and would only be picked up when query_doc has fewer than top_n
    distinct terms.
    """
    vectorizer = TfidfVectorizer(ngram_range=(1, 1))
    matrix = vectorizer.fit_transform(corpus_texts + [query_doc])
    weights = matrix[-1].toarray().flatten()
    vocab = vectorizer.get_feature_names_out()
    ranked = weights.argsort()[-top_n:][::-1]
    terms = [
        vocab[i] for i in ranked
        if weights[i] > 0 and vocab[i].lower() not in NOISE_WORDS
    ]
    return list(dict.fromkeys(terms))


# --- First-pass TF-IDF ---
first_pass_candidates = _top_tfidf_terms(dataset_texts, " ".join(expanded_keywords))

# --- Second-pass TF-IDF ---
second_pass_doc = " ".join(expanded_keywords + first_pass_candidates)
second_pass_candidates = _top_tfidf_terms(dataset_texts, second_pass_doc)

# --- Combine candidates with expanded keywords ---
final_keywords = list(dict.fromkeys(expanded_keywords + first_pass_candidates + second_pass_candidates))
# The curated keywords are repeated three times so they outweigh the candidates.
keyword_doc = " ".join(expanded_keywords * 3 + first_pass_candidates + second_pass_candidates)
vectorizer_final = TfidfVectorizer(ngram_range=(1, 1))
tfidf_final = vectorizer_final.fit_transform(dataset_texts + [keyword_doc])
dataset_matrix_final = tfidf_final[:-1]  # one row per dataset
keyword_vector_final = tfidf_final[-1]   # the keyword document
similarities = cosine_similarity(dataset_matrix_final, keyword_vector_final)
df_non_demographics["relevance_score"] = similarities.flatten()

# --- Filter datasets using a threshold ---
threshold = 0.05
df_ai_filtered = df_non_demographics[df_non_demographics["relevance_score"] >= threshold].sort_values(by="relevance_score", ascending=False)
print(f"\n✅ AI filter applied: {len(df_ai_filtered)} datasets selected (excluding demographics)")

# --- Step 2: Semantic Refinement ---
from sentence_transformers import SentenceTransformer
from kneed import KneeLocator
import matplotlib.pyplot as plt
import numpy as np

apply_semantic_refinement = True
min_datasets = 145     # keep at least this many datasets after refinement
plot_threshold = True  # NOTE(review): not read anywhere in this cell

if apply_semantic_refinement and not df_ai_filtered.empty:
    df_step2 = df_ai_filtered.copy()

    print("🧠 Computing sentence embeddings...")
    model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
    dataset_texts_step2 = (df_step2["dataset_name"] + " " + df_step2["years"]).tolist()
    embeddings = model.encode(dataset_texts_step2, batch_size=32, show_progress_bar=True)
    keyword_embedding = model.encode(" ".join(final_keywords))

    # Cosine similarity between every dataset embedding and the keyword embedding.
    cosine_similarities = np.dot(embeddings, keyword_embedding) / (
        np.linalg.norm(embeddings, axis=1) * np.linalg.norm(keyword_embedding)
    )
    df_step2["semantic_score"] = cosine_similarities

    # Dynamic threshold via KneeLocator: the score at the knee of the
    # descending score curve, or 0.0 when no knee is found.
    scores_sorted = np.sort(df_step2["semantic_score"].values)[::-1]
    x = np.arange(len(scores_sorted))
    y = scores_sorted
    knee = KneeLocator(x, y, curve='convex', direction='decreasing')

    if knee.knee is not None:
        dynamic_threshold = y[int(knee.knee)]
    else:
        dynamic_threshold = 0.0

    df_dynamic_filtered = df_step2[df_step2["semantic_score"] >= dynamic_threshold]

    # If the knee cut is too aggressive, fall back to the top-N by score.
    if len(df_dynamic_filtered) < min_datasets:
        df_dynamic_filtered = df_step2.sort_values(by="semantic_score", ascending=False).iloc[:min_datasets]

    df_dynamic_filtered = df_dynamic_filtered.sort_values(by="semantic_score", ascending=False)
else:
    # Refinement disabled or nothing to refine: pass the TF-IDF selection
    # through so the merge below always has a frame to work with.
    df_dynamic_filtered = df_ai_filtered.copy()

# --- Merge demographics back ---
df_final = pd.concat([df_dynamic_filtered, df_demographics], ignore_index=True)
df_final.to_csv("nhanes_ai_semantic_dynamic.csv", index=False)
print(f"✅ Dynamic semantic refinement complete: {len(df_final)} datasets saved → nhanes_ai_semantic_dynamic.csv")





















Collecting kneed
  Downloading kneed-0.8.5-py3-none-any.whl.metadata (5.5 kB)
Downloading kneed-0.8.5-py3-none-any.whl (10 kB)
Installing collected packages: kneed
Successfully installed kneed-0.8.5
✅ Found 13 valid NHANES cycles: ['NHANES 08/2021-08/2023', 'NHANES 2017-March 2020', 'NHANES 2019-2020', 'NHANES 2017-2018', 'NHANES 2015-2016', 'NHANES 2013-2014', 'NHANES 2011-2012', 'NHANES 2009-2010', 'NHANES 2007-2008', 'NHANES 2005-2006', 'NHANES 2003-2004', 'NHANES 2001-2002', 'NHANES 1999-2000']


Crawling NHANES cycles:   0%|          | 0/13 [00:00<?, ?it/s]


🔹 Scanning cycle: NHANES 08/2021-08/2023
   [Found] Demographic Variables and Sample Weights (2021-2023)
   [Added] Demographics dataset for NHANES 08/2021-08/2023
   [Found] Dietary Interview - Individual Foods, First Day (2021-2023)
   [Found] Dietary Interview - Individual Foods, Second Day (2021-2023)
   [Found] Dietary Interview - Total Nutrient Intakes, First Day (2021-2023)
   [Found] Dietary Interview - Total Nutrient Intakes, Second Day (2021-2023)
   [Found] Dietary Interview Technical Support File - Food Codes (2021-2023)
   [Found] Dietary Supplement Use 30-Day - Individual Dietary Supplements (2021-2023)
   [Found] Dietary Supplement Use 30-Day - Total Dietary Supplements (2021-2023)
   [Found] Balance (2021-2023)
   [Found] Blood Pressure - Oscillometric Measurements (2021-2023)
   [Found] Body Measures (2021-2023)
   [Found] Liver Ultrasound Transient Elastography (2021-2023)
   [Found] Albumin & Creatinine - Urine (2021-2023)
   [Found] alpha-1-Acid Glycoprotein (2021-

Crawling NHANES cycles:   8%|▊         | 1/13 [00:07<01:29,  7.45s/it]


🔹 Scanning cycle: NHANES 2017-March 2020
   [Found] Demographic Variables and Sample Weights (2017-2018)
   [Found] Demographic Variables and Sample Weights (2017-2020)
   [Added] Demographics dataset for NHANES 2017-March 2020
   [Found] Dietary Interview - Individual Foods, First Day (2017-2018)
   [Found] Dietary Interview - Individual Foods, First Day (2017-2020)
   [Found] Dietary Interview - Individual Foods, Second Day (2017-2018)
   [Found] Dietary Interview - Individual Foods, Second Day (2017-2020)
   [Found] Dietary Interview - Total Nutrient Intakes, First Day (2017-2018)
   [Found] Dietary Interview - Total Nutrient Intakes, First Day (2017-2020)
   [Found] Dietary Interview - Total Nutrient Intakes, Second Day (2017-2018)
   [Found] Dietary Interview - Total Nutrient Intakes, Second Day (2017-2020)
   [Found] Dietary Interview Technical Support File - Food Codes (2017-2018)
   [Found] Dietary Interview Technical Support File - Food Codes (2017-2020)
   [Found] Dietary Su

Crawling NHANES cycles:  15%|█▌        | 2/13 [00:15<01:26,  7.82s/it]


🔹 Scanning cycle: NHANES 2019-2020


Crawling NHANES cycles:  23%|██▎       | 3/13 [00:24<01:23,  8.32s/it]


🔹 Scanning cycle: NHANES 2017-2018
   [Found] Demographic Variables and Sample Weights (2017-2018)
   [Added] Demographics dataset for NHANES 2017-2018
   [Found] Dietary Interview - Individual Foods, First Day (2017-2018)
   [Found] Dietary Interview - Individual Foods, Second Day (2017-2018)
   [Found] Dietary Interview - Total Nutrient Intakes, First Day (2017-2018)
   [Found] Dietary Interview - Total Nutrient Intakes, Second Day (2017-2018)
   [Found] Dietary Interview Technical Support File - Food Codes (2017-2018)
   [Found] Dietary Supplement Use 24-Hour - Individual Dietary Supplements, First Day (2017-2018)
   [Found] Dietary Supplement Use 24-Hour - Individual Dietary Supplements, Second Day (2017-2018)
   [Found] Dietary Supplement Use 24-Hour - Total Dietary Supplements, First Day (2017-2018)
   [Found] Dietary Supplement Use 24-Hour - Total Dietary Supplements, Second Day (2017-2018)
   [Found] Dietary Supplement Use 30-Day - Individual Dietary Supplements (2017-2018)
  

Crawling NHANES cycles:  31%|███       | 4/13 [00:32<01:12,  8.05s/it]


🔹 Scanning cycle: NHANES 2015-2016
   [Found] Demographic Variables and Sample Weights (2015-2016)
   [Added] Demographics dataset for NHANES 2015-2016
   [Found] Dietary Interview - Individual Foods, First Day (2015-2016)
   [Found] Dietary Interview - Individual Foods, Second Day (2015-2016)
   [Found] Dietary Interview - Total Nutrient Intakes, First Day (2015-2016)
   [Found] Dietary Interview - Total Nutrient Intakes, Second Day (2015-2016)
   [Found] Dietary Interview Technical Support File - Food Codes (2015-2016)
   [Found] Dietary Supplement Use 24-Hour - Individual Dietary Supplements, First Day (2015-2016)
   [Found] Dietary Supplement Use 24-Hour - Individual Dietary Supplements, Second Day (2015-2016)
   [Found] Dietary Supplement Use 24-Hour - Total Dietary Supplements, First Day (2015-2016)
   [Found] Dietary Supplement Use 24-Hour - Total Dietary Supplements, Second Day (2015-2016)
   [Found] Dietary Supplement Use 30-Day - Individual Dietary Supplements (2015-2016)
  

Crawling NHANES cycles:  38%|███▊      | 5/13 [00:40<01:05,  8.23s/it]


🔹 Scanning cycle: NHANES 2013-2014
   [Found] Demographic Variables and Sample Weights (2013-2014)
   [Added] Demographics dataset for NHANES 2013-2014
   [Found] Dietary Interview - Individual Foods, First Day (2013-2014)
   [Found] Dietary Interview - Individual Foods, Second Day (2013-2014)
   [Found] Dietary Interview - Total Nutrient Intakes, First Day (2013-2014)
   [Found] Dietary Interview - Total Nutrient Intakes, Second Day (2013-2014)
   [Found] Dietary Interview Technical Support File - Food Codes (2013-2014)
   [Found] Dietary Supplement Use 24-Hour - Individual Dietary Supplements, First Day (2013-2014)
   [Found] Dietary Supplement Use 24-Hour - Individual Dietary Supplements, Second Day (2013-2014)
   [Found] Dietary Supplement Use 24-Hour - Total Dietary Supplements, First Day (2013-2014)
   [Found] Dietary Supplement Use 24-Hour - Total Dietary Supplements, Second Day (2013-2014)
   [Found] Dietary Supplement Use 30-Day - Individual Dietary Supplements (2013-2014)
  

Crawling NHANES cycles:  46%|████▌     | 6/13 [00:49<00:58,  8.32s/it]


🔹 Scanning cycle: NHANES 2011-2012
   [Found] Demographic Variables & Sample Weights (2011-2012)
   [Added] Demographics dataset for NHANES 2011-2012
   [Found] Dietary Interview - Individual Foods, First Day (2011-2012)
   [Found] Dietary Interview - Individual Foods, Second Day (2011-2012)
   [Found] Dietary Interview - Total Nutrient Intakes, First Day (2011-2012)
   [Found] Dietary Interview - Total Nutrient Intakes, Second Day (2011-2012)
   [Found] Dietary Interview Technical Support File - Food Codes (2011-2012)
   [Found] Dietary Interview Technical Support File - Modification Codes (2011-2012)
   [Found] Dietary Supplement Use 24-Hour - Individual Dietary Supplements, First Day (2011-2012)
   [Found] Dietary Supplement Use 24-Hour - Individual Dietary Supplements, Second Day (2011-2012)
   [Found] Dietary Supplement Use 24-Hour - Total Dietary Supplements, First Day (2011-2012)
   [Found] Dietary Supplement Use 24-Hour - Total Dietary Supplements, Second Day (2011-2012)
   [F

Crawling NHANES cycles:  54%|█████▍    | 7/13 [00:58<00:51,  8.52s/it]


🔹 Scanning cycle: NHANES 2009-2010
   [Found] Demographic Variables & Sample Weights (2009-2010)
   [Added] Demographics dataset for NHANES 2009-2010
   [Found] Dietary Interview - Individual Foods, First Day (2009-2010)
   [Found] Dietary Interview - Individual Foods, Second Day (2009-2010)
   [Found] Dietary Interview - Total Nutrient Intakes, First Day (2009-2010)
   [Found] Dietary Interview - Total Nutrient Intakes, Second Day (2009-2010)
   [Found] Dietary Interview Technical Support File - Food Codes (2009-2010)
   [Found] Dietary Interview Technical Support File - Modification Codes (2009-2010)
   [Found] Dietary Screener Questionnaire (2009-2010)
   [Found] Dietary Supplement Use 24-Hour - Individual Dietary Supplements, First Day (2009-2010)
   [Found] Dietary Supplement Use 24-Hour - Individual Dietary Supplements, Second Day (2009-2010)
   [Found] Dietary Supplement Use 24-Hour - Total Dietary Supplements, First Day (2009-2010)
   [Found] Dietary Supplement Use 24-Hour - T

Crawling NHANES cycles:  62%|██████▏   | 8/13 [01:06<00:42,  8.51s/it]


🔹 Scanning cycle: NHANES 2007-2008
   [Found] Demographic Variables & Sample Weights (2007-2008)
   [Added] Demographics dataset for NHANES 2007-2008
   [Found] Dietary Interview - Individual Foods, First Day (2007-2008)
   [Found] Dietary Interview - Individual Foods, Second Day (2007-2008)
   [Found] Dietary Interview - Total Nutrient Intakes, First Day (2007-2008)
   [Found] Dietary Interview - Total Nutrient Intakes, Second Day (2007-2008)
   [Found] Dietary Interview Technical Support File - Food Codes (2007-2008)
   [Found] Dietary Interview Technical Support File - Modification Codes (2007-2008)
   [Found] Dietary Supplement Use 24-Hour - Individual Dietary Supplements, First Day (2007-2008)
   [Found] Dietary Supplement Use 24-Hour - Individual Dietary Supplements, Second Day (2007-2008)
   [Found] Dietary Supplement Use 24-Hour - Total Dietary Supplements, First Day (2007-2008)
   [Found] Dietary Supplement Use 24-Hour - Total Dietary Supplements, Second Day (2007-2008)
   [F

Crawling NHANES cycles:  69%|██████▉   | 9/13 [01:15<00:33,  8.50s/it]


🔹 Scanning cycle: NHANES 2005-2006
   [Found] Demographic Variables & Sample Weights (2005-2006)
   [Added] Demographics dataset for NHANES 2005-2006
   [Found] Dietary Interview - Individual Foods, First Day (2005-2006)
   [Found] Dietary Interview - Individual Foods, Second Day (2005-2006)
   [Found] Dietary Interview - Total Nutrient Intakes, First Day (2005-2006)
   [Found] Dietary Interview - Total Nutrient Intakes, Second Day (2005-2006)
   [Found] Dietary Interview Technical Support File - Food Codes (2005-2006)
   [Found] Dietary Interview Technical Support File - Modification Codes (2005-2006)
   [Found] Dietary Supplement Use 30-Day - File 1, Supplement Counts (2005-2006)
   [Found] Dietary Supplement Use 30-Day - File 2, Participant's Use of Supplements (2005-2006)
   [Found] Food Frequency Questionnaire - Look-Up Table FOODLOOK (2005-2006)
   [Found] Food Frequency Questionnaire - Look-Up Table VARLOOK (2005-2006)
   [Found] Food Frequency Questionnaire - Output from DietC

Crawling NHANES cycles:  77%|███████▋  | 10/13 [01:23<00:25,  8.40s/it]


🔹 Scanning cycle: NHANES 2003-2004
   [Found] Demographic Variables & Sample Weights (2003-2004)
   [Added] Demographics dataset for NHANES 2003-2004
   [Found] Dietary Interview - Individual Foods, First Day (2003-2004)
   [Found] Dietary Interview - Individual Foods, Second Day (2003-2004)
   [Found] Dietary Interview - Total Nutrient Intakes, First Day (2003-2004)
   [Found] Dietary Interview - Total Nutrient Intakes, Second Day (2003-2004)
   [Found] Dietary Interview Technical Support File - Food Codes (2003-2004)
   [Found] Dietary Interview Technical Support File - Modification Codes (2003-2004)
   [Found] Dietary Supplement Use 30-Day - File 1, Supplement Counts (2003-2004)
   [Found] Dietary Supplement Use 30-Day - File 2, Participant's Use of Supplements (2003-2004)
   [Found] Food Frequency Questionnaire - Look-Up Table FOODLOOK (2003-2004)
   [Found] Food Frequency Questionnaire - Look-Up Table VARLOOK (2003-2004)
   [Found] Food Frequency Questionnaire - Output from DietC

Crawling NHANES cycles:  85%|████████▍ | 11/13 [01:31<00:16,  8.23s/it]


🔹 Scanning cycle: NHANES 2001-2002
   [Found] Demographic Variables & Sample Weights (2001-2002)
   [Added] Demographics dataset for NHANES 2001-2002
   [Found] Dietary Interview - Individual Foods (2001-2002)
   [Found] Dietary Interview - Total Nutrient Intakes (2001-2002)
   [Found] Dietary Interview Technical Support File - Food Code Format File (2001-2002)
   [Found] Dietary Supplement Use 30-Day - File 1, Supplement Counts (2001-2002)
   [Found] Dietary Supplement Use 30-Day - File 2, Participant's Use of Supplements (2001-2002)
   [Found] Audiometry (2001-2002)
   [Found] Audiometry - Acoustic Reflex (2001-2002)
   [Found] Audiometry - Tympanometry (2001-2002)
   [Found] Balance (2001-2002)
   [Found] Bioelectrical Impedance Analysis (2001-2002)
   [Found] Blood Pressure (2001-2002)
   [Found] Body Measures (2001-2002)
   [Found] Cardiovascular Fitness (2001-2002)
   [Found] Dual-Energy X-ray Absorptiometry - Whole Body, Second Exam (2001-2002)
   [Found] Lower Extremity Diseas

Crawling NHANES cycles:  92%|█████████▏| 12/13 [01:39<00:08,  8.15s/it]


🔹 Scanning cycle: NHANES 1999-2000
   [Found] Demographic Variables & Sample Weights (1999-2000)
   [Added] Demographics dataset for NHANES 1999-2000
   [Found] Dietary Interview - Individual Foods (1999-2000)
   [Found] Dietary Interview - Total Nutrient Intakes (1999-2000)
   [Found] Dietary Interview Technical Support File - Food Code Format File (1999-2000)
   [Found] Dietary Supplement Use 30-Day - File 1, Supplement Counts (1999-2000)
   [Found] Dietary Supplement Use 30-Day - File 2, Participant's Use of Supplements (1999-2000)
   [Found] Audiometry (1999-2000)
   [Found] Audiometry - Acoustic Reflex (1999-2000)
   [Found] Audiometry - Tympanometry (1999-2000)
   [Found] Balance (1999-2000)
   [Found] Bioelectrical Impedance Analysis (1999-2000)
   [Found] Blood Pressure (1999-2000)
   [Found] Body Measures (1999-2000)
   [Found] Cardiovascular Fitness (1999-2000)
   [Found] Lower Extremity Disease - Ankle Brachial Blood Pressure Index (1999-2000)
   [Found] Lower Extremity Dis

Crawling NHANES cycles: 100%|██████████| 13/13 [01:46<00:00,  8.23s/it]



✅ Crawling complete! Saved 1716 records to nhanes_catalog.json and nhanes_catalog.csv.
🔹 Demographics datasets preserved: 13
🔹 Non-demographics datasets to filter: 1703
✅ Biomedical expansion complete: 49 keywords

✅ AI filter applied: 189 datasets selected (excluding demographics)
🧠 Computing sentence embeddings...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/6 [00:00<?, ?it/s]

✅ Dynamic semantic refinement complete: 158 datasets saved → nhanes_ai_semantic_dynamic.csv


In [None]:
import os
!pip install pyreadstat
import requests
import pyreadstat
import pandas as pd
from tqdm import tqdm
import time

# Download every dataset listed in the filtered catalog, convert each SAS
# transport (.XPT) file to CSV, and store it in OUTPUT_DIR. Files that already
# exist are skipped, so the cell can be re-run to resume an interrupted download.
OUTPUT_DIR = "nhanes_data"
os.makedirs(OUTPUT_DIR, exist_ok=True)

df_filtered = pd.read_csv("nhanes_ai_semantic_dynamic.csv")

MAX_RETRIES = 5
RETRY_DELAY = 2  # seconds to wait between failed attempts
TEMP_XPT = "temp.xpt"  # scratch file; pyreadstat reads from a path on disk

for idx, row in tqdm(df_filtered.iterrows(), total=len(df_filtered), desc="Downloading NHANES datasets"):
    dataset_name = row["dataset_name"]
    xpt_url = row["data_url"]

    # A missing URL comes back from pandas as NaN (a float), hence the type check.
    if not isinstance(xpt_url, str) or not xpt_url.strip():
        print(f"⚠️ Skipping {dataset_name}: No URL provided")
        continue

    # Clean URL thoroughly
    xpt_url = xpt_url.strip().replace("\n", "").replace("\r", "")
    if xpt_url.startswith("/"):
        xpt_url = "https://wwwn.cdc.gov" + xpt_url

    # Use cycle/component in filename to prevent collisions
    cycle = row.get("cycle", "UnknownCycle")
    component = row.get("component", "UnknownComponent")
    filename = f"{cycle}_{component}_{dataset_name}".replace("/", "_").replace(" ", "_") + ".csv"
    filepath = os.path.join(OUTPUT_DIR, filename)

    # Already downloaded on a previous run.
    if os.path.exists(filepath):
        continue

    success = False
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            resp = requests.get(xpt_url, timeout=30, allow_redirects=True)
            resp.raise_for_status()

            # Save temp .XPT. The file must be closed (and therefore flushed)
            # before it is parsed, so the read happens after the `with` block.
            with open(TEMP_XPT, "wb") as f:
                f.write(resp.content)

            # Convert to CSV
            df_xpt, meta = pyreadstat.read_xport(TEMP_XPT, encoding="latin1")

            # Add metadata
            df_xpt["NHANES_Cycle"] = cycle
            df_xpt["Component"] = component
            df_xpt["Dataset_Name"] = dataset_name
            df_xpt["Doc_URL"] = row.get("doc_url", "")

            df_xpt.to_csv(filepath, index=False)

            print(f"✅ Saved {filename}")
            success = True
            time.sleep(0.2)
            break

        except Exception as e:
            print(f"⚠️ Attempt {attempt} failed for {dataset_name}: {e}")
            # filepath did not exist before this dataset was attempted, so any
            # file there now is a partial write; remove it so a later run does
            # not mistake it for a finished download.
            if os.path.exists(filepath):
                os.remove(filepath)
            # No point in waiting after the final attempt.
            if attempt < MAX_RETRIES:
                time.sleep(RETRY_DELAY)

        finally:
            # Never leave the scratch file behind, whether the attempt
            # succeeded or failed.
            if os.path.exists(TEMP_XPT):
                os.remove(TEMP_XPT)

    if not success:
        print(f"❌ Failed to download {dataset_name}")

print(f"\n✅ Download complete. Files saved in '{OUTPUT_DIR}'")








Collecting pyreadstat
  Downloading pyreadstat-1.3.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.2 kB)
Downloading pyreadstat-1.3.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (666 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m666.4/666.4 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyreadstat
Successfully installed pyreadstat-1.3.1


Downloading NHANES datasets:   0%|          | 0/158 [00:00<?, ?it/s]

✅ Saved NHANES_2001-2002_Laboratory_Glycohemoglobin,_Plasma_Glucose,_Serum_C-peptide,_&_Insulin,_Second_Exam.csv


Downloading NHANES datasets:   1%|▏         | 2/158 [00:00<01:15,  2.05it/s]

✅ Saved NHANES_2001-2002_Laboratory_Plasma_Fasting_Glucose,_Serum_C-peptide_&_Insulin.csv


Downloading NHANES datasets:   2%|▏         | 3/158 [00:01<01:05,  2.35it/s]

✅ Saved NHANES_2001-2002_Laboratory_Cholesterol_-_Total,_HDL,_LDL__&_Triglycerides,_Second_Exam.csv


Downloading NHANES datasets:   3%|▎         | 4/158 [00:01<01:07,  2.29it/s]

✅ Saved NHANES_2003-2004_Laboratory_Plasma_Fasting_Glucose,_Serum_C-peptide_&_Insulin.csv


Downloading NHANES datasets:   3%|▎         | 5/158 [00:02<01:11,  2.14it/s]

✅ Saved NHANES_1999-2000_Laboratory_Plasma_Fasting_Glucose,_Serum_C-peptide_&_Insulin.csv


Downloading NHANES datasets:   4%|▍         | 6/158 [00:02<01:11,  2.12it/s]

✅ Saved NHANES_2013-2014_Laboratory_Cholesterol_-_LDL_&_Triglycerides.csv
✅ Saved NHANES_2009-2010_Laboratory_Cholesterol_-_LDL_&_Triglycerides.csv


Downloading NHANES datasets:   5%|▌         | 8/158 [00:03<01:07,  2.24it/s]

✅ Saved NHANES_2011-2012_Laboratory_Cholesterol_-_LDL_&_Triglycerides.csv


Downloading NHANES datasets:   6%|▌         | 9/158 [00:04<01:05,  2.29it/s]

✅ Saved NHANES_2007-2008_Laboratory_Cholesterol_-_LDL_&_Triglycerides.csv


Downloading NHANES datasets:   6%|▋         | 10/158 [00:04<01:05,  2.27it/s]

✅ Saved NHANES_2011-2012_Laboratory_Plasma_Fasting_Glucose_&_Insulin.csv


Downloading NHANES datasets:   7%|▋         | 11/158 [00:04<01:05,  2.24it/s]

✅ Saved NHANES_2009-2010_Laboratory_Plasma_Fasting_Glucose_&_Insulin.csv


Downloading NHANES datasets:   8%|▊         | 12/158 [00:05<01:04,  2.27it/s]

✅ Saved NHANES_2005-2006_Laboratory_Cholesterol_-_LDL,_Triglyceride_&_Apoliprotein_(ApoB).csv


Downloading NHANES datasets:   8%|▊         | 13/158 [00:05<01:03,  2.29it/s]

✅ Saved NHANES_2005-2006_Laboratory_Plasma_Fasting_Glucose_&_Insulin.csv


Downloading NHANES datasets:   9%|▉         | 14/158 [00:06<01:05,  2.21it/s]

✅ Saved NHANES_2007-2008_Laboratory_Plasma_Fasting_Glucose_&_Insulin.csv


Downloading NHANES datasets:   9%|▉         | 15/158 [00:07<01:16,  1.88it/s]

✅ Saved NHANES_2005-2006_Laboratory_Fasting_Questionnaire.csv


Downloading NHANES datasets:  10%|█         | 16/158 [00:07<01:12,  1.97it/s]

✅ Saved NHANES_2001-2002_Laboratory_Cholesterol_-_LDL_&_Triglycerides.csv


Downloading NHANES datasets:  11%|█         | 17/158 [00:08<01:15,  1.88it/s]

✅ Saved NHANES_2017-2018_Laboratory_Fasting_Questionnaire.csv


Downloading NHANES datasets:  11%|█▏        | 18/158 [00:08<01:18,  1.79it/s]

✅ Saved NHANES_2017-March_2020_Laboratory_Fasting_Questionnaire.csv


Downloading NHANES datasets:  12%|█▏        | 19/158 [00:09<01:23,  1.67it/s]

✅ Saved NHANES_08_2021-08_2023_Laboratory_Fasting_Questionnaire.csv


Downloading NHANES datasets:  13%|█▎        | 20/158 [00:10<01:30,  1.52it/s]

✅ Saved NHANES_2011-2012_Laboratory_Fasting_Questionnaire.csv


Downloading NHANES datasets:  13%|█▎        | 21/158 [00:11<02:06,  1.08it/s]

✅ Saved NHANES_1999-2000_Laboratory_Fasting_Questionnaire.csv


Downloading NHANES datasets:  14%|█▍        | 22/158 [00:12<02:02,  1.11it/s]

✅ Saved NHANES_2007-2008_Laboratory_Fasting_Questionnaire.csv


Downloading NHANES datasets:  15%|█▍        | 23/158 [00:13<01:50,  1.22it/s]

✅ Saved NHANES_2015-2016_Laboratory_Fasting_Questionnaire.csv


Downloading NHANES datasets:  15%|█▌        | 24/158 [00:13<01:35,  1.41it/s]

✅ Saved NHANES_1999-2000_Laboratory_Cholesterol_-_LDL_&_Triglycerides.csv


Downloading NHANES datasets:  16%|█▌        | 25/158 [00:14<01:25,  1.55it/s]

✅ Saved NHANES_2003-2004_Laboratory_Cholesterol_-_LDL_&_Triglycerides.csv


Downloading NHANES datasets:  16%|█▋        | 26/158 [00:14<01:26,  1.53it/s]

✅ Saved NHANES_2013-2014_Laboratory_Plasma_Fasting_Glucose.csv


Downloading NHANES datasets:  17%|█▋        | 27/158 [00:15<01:34,  1.38it/s]

✅ Saved NHANES_2009-2010_Laboratory_Fasting_Questionnaire.csv


Downloading NHANES datasets:  18%|█▊        | 28/158 [00:17<02:13,  1.02s/it]

✅ Saved NHANES_2013-2014_Laboratory_Fasting_Questionnaire.csv


Downloading NHANES datasets:  18%|█▊        | 29/158 [00:18<02:11,  1.02s/it]

✅ Saved NHANES_2001-2002_Laboratory_Fasting_Questionnaire.csv


Downloading NHANES datasets:  20%|█▉        | 31/158 [00:18<01:23,  1.51it/s]

✅ Saved NHANES_08_2021-08_2023_Laboratory_Cholesterol_-_Low-Density_Lipoproteins_(LDL)_&_Triglycerides.csv


Downloading NHANES datasets:  20%|██        | 32/158 [00:19<01:22,  1.53it/s]

✅ Saved NHANES_2003-2004_Laboratory_Fasting_Questionnaire.csv


Downloading NHANES datasets:  21%|██        | 33/158 [00:20<01:37,  1.28it/s]

✅ Saved NHANES_2013-2014_Questionnaire_Diabetes.csv
✅ Saved NHANES_2015-2016_Laboratory_Cholesterol_-_Low_-_Density_Lipoprotein_(LDL)_&_Triglycerides.csv


Downloading NHANES datasets:  22%|██▏       | 35/158 [00:22<01:33,  1.31it/s]

✅ Saved NHANES_2007-2008_Questionnaire_Diabetes.csv


Downloading NHANES datasets:  23%|██▎       | 36/158 [00:22<01:34,  1.30it/s]

✅ Saved NHANES_2013-2014_Questionnaire_Blood_Pressure_&_Cholesterol.csv


Downloading NHANES datasets:  23%|██▎       | 37/158 [00:24<01:51,  1.08it/s]

✅ Saved NHANES_2011-2012_Questionnaire_Diabetes.csv
✅ Saved NHANES_2017-2018_Laboratory_Plasma_Fasting_Glucose.csv


Downloading NHANES datasets:  25%|██▍       | 39/158 [00:25<01:20,  1.49it/s]

✅ Saved NHANES_2017-March_2020_Laboratory_Plasma_Fasting_Glucose.csv


Downloading NHANES datasets:  25%|██▌       | 40/158 [00:25<01:09,  1.69it/s]

✅ Saved NHANES_2015-2016_Laboratory_Plasma_Fasting_Glucose.csv


Downloading NHANES datasets:  27%|██▋       | 42/158 [00:25<00:48,  2.40it/s]

✅ Saved NHANES_2017-March_2020_Laboratory_Cholesterol_-_Low-Density_Lipoproteins_(LDL)_&_Triglycerides.csv


Downloading NHANES datasets:  27%|██▋       | 43/158 [00:26<00:48,  2.40it/s]

✅ Saved NHANES_2017-2018_Laboratory_Cholesterol_-_Low-Density_Lipoproteins_(LDL)_&_Triglycerides.csv


Downloading NHANES datasets:  28%|██▊       | 44/158 [00:26<00:51,  2.23it/s]

✅ Saved NHANES_2011-2012_Questionnaire_Blood_Pressure_&_Cholesterol.csv
✅ Saved NHANES_2005-2006_Questionnaire_Diabetes.csv


Downloading NHANES datasets:  29%|██▉       | 46/158 [00:28<01:01,  1.82it/s]

✅ Saved NHANES_08_2021-08_2023_Laboratory_Plasma_Fasting_Glucose.csv


Downloading NHANES datasets:  30%|███       | 48/158 [00:28<00:48,  2.29it/s]

✅ Saved NHANES_2009-2010_Questionnaire_Blood_Pressure_&_Cholesterol.csv


Downloading NHANES datasets:  31%|███       | 49/158 [00:29<00:53,  2.04it/s]

✅ Saved NHANES_2009-2010_Questionnaire_Diabetes.csv


Downloading NHANES datasets:  32%|███▏      | 50/158 [00:29<00:50,  2.15it/s]

✅ Saved NHANES_2013-2014_Laboratory_Insulin.csv


Downloading NHANES datasets:  32%|███▏      | 51/158 [00:30<00:51,  2.07it/s]

✅ Saved NHANES_2007-2008_Questionnaire_Blood_Pressure_&_Cholesterol.csv


Downloading NHANES datasets:  33%|███▎      | 52/158 [00:31<01:05,  1.61it/s]

✅ Saved NHANES_2017-March_2020_Questionnaire_Diabetes.csv


Downloading NHANES datasets:  34%|███▎      | 53/158 [00:32<01:14,  1.41it/s]

✅ Saved NHANES_2017-2018_Questionnaire_Diabetes.csv


Downloading NHANES datasets:  34%|███▍      | 54/158 [00:32<01:07,  1.53it/s]

✅ Saved NHANES_2005-2006_Questionnaire_Blood_Pressure_&_Cholesterol.csv


Downloading NHANES datasets:  35%|███▍      | 55/158 [00:33<01:01,  1.68it/s]

✅ Saved NHANES_2013-2014_Laboratory_Cholesterol_-_Total.csv
✅ Saved NHANES_2011-2012_Laboratory_Cholesterol_-_Total.csv


Downloading NHANES datasets:  37%|███▋      | 58/158 [00:35<01:12,  1.37it/s]

✅ Saved NHANES_2001-2002_Questionnaire_Blood_Pressure_&_Cholesterol.csv


Downloading NHANES datasets:  37%|███▋      | 59/158 [00:36<01:14,  1.33it/s]

✅ Saved NHANES_1999-2000_Questionnaire_Blood_Pressure_&_Cholesterol.csv


Downloading NHANES datasets:  38%|███▊      | 60/158 [00:36<01:10,  1.38it/s]

✅ Saved NHANES_2015-2016_Questionnaire_Blood_Pressure_&_Cholesterol.csv


Downloading NHANES datasets:  39%|███▊      | 61/158 [00:38<01:24,  1.15it/s]

✅ Saved NHANES_2015-2016_Questionnaire_Diabetes.csv
✅ Saved NHANES_2013-2014_Laboratory_Fatty_Acids_-_Serum.csv


Downloading NHANES datasets:  40%|███▉      | 63/158 [00:39<01:15,  1.27it/s]

✅ Saved NHANES_1999-2000_Laboratory_Cholesterol_-_Total_&_HDL.csv
✅ Saved NHANES_2009-2010_Laboratory_Cholesterol_-_Total.csv


Downloading NHANES datasets:  41%|████      | 64/158 [00:40<01:05,  1.44it/s]

✅ Saved NHANES_2013-2014_Laboratory_Cholesterol_-_HDL.csv


Downloading NHANES datasets:  42%|████▏     | 66/158 [00:40<00:51,  1.77it/s]

✅ Saved NHANES_2005-2006_Laboratory_Cholesterol_-_Total.csv


Downloading NHANES datasets:  42%|████▏     | 67/158 [00:41<00:51,  1.78it/s]

✅ Saved NHANES_08_2021-08_2023_Questionnaire_Blood_Pressure_&_Cholesterol.csv


Downloading NHANES datasets:  43%|████▎     | 68/158 [00:41<00:47,  1.90it/s]

✅ Saved NHANES_2007-2008_Laboratory_Cholesterol_-_Total.csv
✅ Saved NHANES_2003-2004_Laboratory_Fatty_Acids_-_Plasma_(Surplus).csv


Downloading NHANES datasets:  44%|████▍     | 70/158 [00:42<00:44,  1.98it/s]

✅ Saved NHANES_2017-2018_Questionnaire_Blood_Pressure_&_Cholesterol.csv


Downloading NHANES datasets:  45%|████▍     | 71/158 [00:43<00:44,  1.97it/s]

✅ Saved NHANES_2017-March_2020_Questionnaire_Blood_Pressure_&_Cholesterol.csv


Downloading NHANES datasets:  46%|████▌     | 72/158 [00:44<00:47,  1.81it/s]

✅ Saved NHANES_2003-2004_Questionnaire_Blood_Pressure_&_Cholesterol.csv


Downloading NHANES datasets:  46%|████▌     | 73/158 [00:44<00:43,  1.96it/s]

✅ Saved NHANES_2011-2012_Laboratory_Cholesterol_-_HDL.csv


Downloading NHANES datasets:  47%|████▋     | 74/158 [00:45<00:44,  1.90it/s]

✅ Saved NHANES_1999-2000_Questionnaire_Diabetes.csv


Downloading NHANES datasets:  47%|████▋     | 75/158 [00:45<00:42,  1.94it/s]

✅ Saved NHANES_2001-2002_Laboratory_Cholesterol_-_Total_&_HDL.csv


Downloading NHANES datasets:  48%|████▊     | 76/158 [00:46<00:43,  1.86it/s]

✅ Saved NHANES_2001-2002_Questionnaire_Diabetes.csv
✅ Saved NHANES_2009-2010_Laboratory_Cholesterol_-_HDL.csv


Downloading NHANES datasets:  49%|████▉     | 78/158 [00:47<00:50,  1.57it/s]

✅ Saved NHANES_2011-2012_Laboratory_Fatty_Acids_-_Serum.csv


Downloading NHANES datasets:  50%|█████     | 79/158 [00:47<00:45,  1.73it/s]

✅ Saved NHANES_2005-2006_Laboratory_Cholesterol_-_HDL.csv


Downloading NHANES datasets:  51%|█████     | 80/158 [00:48<00:41,  1.88it/s]

✅ Saved NHANES_2007-2008_Laboratory_Oral_Glucose_Tolerance_Test.csv


Downloading NHANES datasets:  51%|█████▏    | 81/158 [00:49<00:44,  1.73it/s]

✅ Saved NHANES_2013-2014_Laboratory_Oral_Glucose_Tolerance_Test.csv
✅ Saved NHANES_2007-2008_Laboratory_Cholesterol_-_HDL.csv


Downloading NHANES datasets:  53%|█████▎    | 83/158 [00:49<00:32,  2.33it/s]

✅ Saved NHANES_2003-2004_Laboratory_Cholesterol_-_Total_&_HDL.csv


Downloading NHANES datasets:  53%|█████▎    | 84/158 [00:50<00:33,  2.19it/s]

✅ Saved NHANES_2009-2010_Laboratory_Oral_Glucose_Tolerance_Test.csv


Downloading NHANES datasets:  54%|█████▍    | 86/158 [00:51<00:41,  1.73it/s]

✅ Saved NHANES_2017-March_2020_Laboratory_Insulin.csv


Downloading NHANES datasets:  55%|█████▌    | 87/158 [00:52<00:37,  1.89it/s]

✅ Saved NHANES_2017-2018_Laboratory_Insulin.csv


Downloading NHANES datasets:  56%|█████▋    | 89/158 [00:52<00:26,  2.60it/s]

✅ Saved NHANES_2015-2016_Laboratory_Insulin.csv
✅ Saved NHANES_2003-2004_Questionnaire_Diabetes.csv


Downloading NHANES datasets:  58%|█████▊    | 91/158 [00:53<00:30,  2.18it/s]

✅ Saved NHANES_2015-2016_Laboratory_Cholesterol_-_High-Density_Lipoprotein_(HDL).csv


Downloading NHANES datasets:  58%|█████▊    | 92/158 [00:54<00:29,  2.24it/s]

✅ Saved NHANES_2005-2006_Laboratory_Oral_Glucose_Tolerance_Test.csv


Downloading NHANES datasets:  59%|█████▉    | 93/158 [00:54<00:28,  2.25it/s]

✅ Saved NHANES_2011-2012_Laboratory_Oral_Glucose_Tolerance_Test.csv


Downloading NHANES datasets:  59%|█████▉    | 94/158 [00:55<00:29,  2.13it/s]

✅ Saved NHANES_2015-2016_Laboratory_Cholesterol_-_Total.csv
✅ Saved NHANES_08_2021-08_2023_Questionnaire_Diabetes.csv


Downloading NHANES datasets:  60%|██████    | 95/158 [00:55<00:30,  2.05it/s]

✅ Saved NHANES_08_2021-08_2023_Laboratory_Cholesterol_–_High-Density_Lipoprotein.csv


Downloading NHANES datasets:  61%|██████    | 96/158 [00:56<00:32,  1.91it/s]

✅ Saved NHANES_2015-2016_Laboratory_Oral_Glucose_Tolerance_Test.csv


Downloading NHANES datasets:  61%|██████▏   | 97/158 [00:56<00:30,  2.02it/s]

✅ Saved NHANES_2017-2018_Laboratory_Cholesterol_-_Total.csv


Downloading NHANES datasets:  63%|██████▎   | 99/158 [00:57<00:27,  2.16it/s]

✅ Saved NHANES_2017-March_2020_Laboratory_Cholesterol_-_Total.csv


Downloading NHANES datasets:  63%|██████▎   | 100/158 [00:57<00:26,  2.19it/s]

✅ Saved NHANES_08_2021-08_2023_Laboratory_Cholesterol_-_Total.csv


Downloading NHANES datasets:  64%|██████▍   | 101/158 [00:58<00:25,  2.22it/s]

✅ Saved NHANES_2017-2018_Laboratory_Cholesterol_-_High_-_Density_Lipoprotein_(HDL).csv


Downloading NHANES datasets:  65%|██████▍   | 102/158 [00:58<00:28,  1.98it/s]

✅ Saved NHANES_2017-March_2020_Laboratory_Cholesterol_-_High_-_Density_Lipoprotein_(HDL).csv
✅ Saved NHANES_2005-2006_Laboratory_Glycohemoglobin.csv


Downloading NHANES datasets:  67%|██████▋   | 106/158 [00:59<00:16,  3.08it/s]

✅ Saved NHANES_2007-2008_Laboratory_Glycohemoglobin.csv


Downloading NHANES datasets:  68%|██████▊   | 107/158 [01:00<00:17,  2.89it/s]

✅ Saved NHANES_08_2021-08_2023_Laboratory_Insulin.csv


Downloading NHANES datasets:  68%|██████▊   | 108/158 [01:00<00:18,  2.65it/s]

✅ Saved NHANES_2013-2014_Laboratory_Glycohemoglobin.csv


Downloading NHANES datasets:  69%|██████▉   | 109/158 [01:01<00:18,  2.61it/s]

✅ Saved NHANES_2009-2010_Laboratory_Glycohemoglobin.csv


Downloading NHANES datasets:  70%|██████▉   | 110/158 [01:01<00:22,  2.11it/s]

✅ Saved NHANES_2013-2014_Laboratory_Folate_Forms_-_Total_&_Individual_-_Serum.csv


Downloading NHANES datasets:  70%|███████   | 111/158 [01:02<00:22,  2.11it/s]

✅ Saved NHANES_2011-2012_Laboratory_Glycohemoglobin.csv


Downloading NHANES datasets:  71%|███████   | 112/158 [01:03<00:25,  1.81it/s]

✅ Saved NHANES_08_2021-08_2023_Examination_Blood_Pressure_-_Oscillometric_Measurements.csv


Downloading NHANES datasets:  72%|███████▏  | 113/158 [01:03<00:24,  1.86it/s]

✅ Saved NHANES_08_2021-08_2023_Laboratory_Glycohemoglobin.csv


Downloading NHANES datasets:  72%|███████▏  | 114/158 [01:04<00:29,  1.48it/s]

✅ Saved NHANES_2017-March_2020_Examination_Blood_Pressure_-_Oscillometric_Measurements.csv


Downloading NHANES datasets:  73%|███████▎  | 115/158 [01:05<00:27,  1.55it/s]

✅ Saved NHANES_2017-2018_Examination_Blood_Pressure_-_Oscillometric_Measurements.csv


Downloading NHANES datasets:  73%|███████▎  | 116/158 [01:05<00:24,  1.71it/s]

✅ Saved NHANES_2017-March_2020_Laboratory_Glycohemoglobin.csv


Downloading NHANES datasets:  74%|███████▍  | 117/158 [01:06<00:25,  1.58it/s]

✅ Saved NHANES_2013-2014_Examination_Blood_Pressure.csv


Downloading NHANES datasets:  75%|███████▍  | 118/158 [01:06<00:22,  1.78it/s]

✅ Saved NHANES_2015-2016_Laboratory_Glycohemoglobin.csv
✅ Saved NHANES_2017-2018_Laboratory_Glycohemoglobin.csv


Downloading NHANES datasets:  77%|███████▋  | 121/158 [01:07<00:13,  2.68it/s]

✅ Saved NHANES_2003-2004_Laboratory_Glycohemoglobin.csv


Downloading NHANES datasets:  77%|███████▋  | 122/158 [01:08<00:17,  2.00it/s]

✅ Saved NHANES_2011-2012_Examination_Blood_Pressure.csv


Downloading NHANES datasets:  78%|███████▊  | 123/158 [01:08<00:17,  2.02it/s]

✅ Saved NHANES_2009-2010_Laboratory_Trans_Fatty_Acids.csv


Downloading NHANES datasets:  78%|███████▊  | 124/158 [01:09<00:16,  2.09it/s]

✅ Saved NHANES_1999-2000_Laboratory_Trans_Fatty_Acids.csv


Downloading NHANES datasets:  79%|███████▉  | 125/158 [01:09<00:17,  1.87it/s]

✅ Saved NHANES_08_2021-08_2023_Laboratory_Serum_Folate_Forms_-_Total_&_Individual_-_Serum.csv


Downloading NHANES datasets:  80%|███████▉  | 126/158 [01:10<00:15,  2.01it/s]

✅ Saved NHANES_2001-2002_Laboratory_Glycohemoglobin.csv


Downloading NHANES datasets:  80%|████████  | 127/158 [01:11<00:16,  1.88it/s]

✅ Saved NHANES_2011-2012_Laboratory_Folate_Forms_-_Total_&_Individual_-_Serum.csv


Downloading NHANES datasets:  81%|████████  | 128/158 [01:11<00:18,  1.61it/s]

✅ Saved NHANES_2007-2008_Examination_Blood_Pressure.csv


Downloading NHANES datasets:  82%|████████▏ | 129/158 [01:12<00:16,  1.80it/s]

✅ Saved NHANES_1999-2000_Laboratory_Glycohemoglobin.csv


Downloading NHANES datasets:  82%|████████▏ | 130/158 [01:13<00:17,  1.61it/s]

✅ Saved NHANES_2017-March_2020_Questionnaire_Weight_History.csv


Downloading NHANES datasets:  83%|████████▎ | 131/158 [01:13<00:18,  1.48it/s]

✅ Saved NHANES_2017-2018_Questionnaire_Weight_History.csv


Downloading NHANES datasets:  84%|████████▎ | 132/158 [01:14<00:19,  1.34it/s]

✅ Saved NHANES_2005-2006_Examination_Blood_Pressure.csv


Downloading NHANES datasets:  84%|████████▍ | 133/158 [01:15<00:20,  1.25it/s]

✅ Saved NHANES_2017-March_2020_Examination_Blood_Pressure_-_Oscillometric_Measurement.csv


Downloading NHANES datasets:  85%|████████▍ | 134/158 [01:16<00:22,  1.08it/s]

✅ Saved NHANES_2009-2010_Examination_Blood_Pressure.csv
✅ Saved NHANES_2015-2016_Laboratory_Folate_Forms_-_Total_&_Individual_-_Serum.csv


Downloading NHANES datasets:  86%|████████▌ | 136/158 [01:18<00:17,  1.27it/s]

✅ Saved NHANES_2017-March_2020_Laboratory_Folate_Forms_-_Total_&_Individual_-_Serum.csv


Downloading NHANES datasets:  87%|████████▋ | 137/158 [01:18<00:14,  1.41it/s]

✅ Saved NHANES_2017-2018_Laboratory_Folate_Forms_-_Total_&_Individual_-_Serum.csv


Downloading NHANES datasets:  88%|████████▊ | 139/158 [01:19<00:11,  1.72it/s]

✅ Saved NHANES_1999-2000_Examination_Blood_Pressure.csv


Downloading NHANES datasets:  89%|████████▊ | 140/158 [01:20<00:11,  1.62it/s]

✅ Saved NHANES_2015-2016_Examination_Blood_Pressure.csv


Downloading NHANES datasets:  89%|████████▉ | 141/158 [01:21<00:12,  1.35it/s]

✅ Saved NHANES_2001-2002_Examination_Blood_Pressure.csv


Downloading NHANES datasets:  90%|████████▉ | 142/158 [01:22<00:11,  1.38it/s]

✅ Saved NHANES_2017-2018_Examination_Blood_Pressure.csv


Downloading NHANES datasets:  91%|█████████ | 143/158 [01:22<00:10,  1.38it/s]

✅ Saved NHANES_2017-March_2020_Examination_Blood_Pressure.csv


Downloading NHANES datasets:  91%|█████████ | 144/158 [01:23<00:08,  1.59it/s]

✅ Saved NHANES_2001-2002_Laboratory_Complete_Blood_Count_with_5-part_Differential_-_Whole_Blood,_Second_Exam.csv
✅ Saved NHANES_2013-2014_Laboratory_Complete_Blood_Count_with_5-part_Differential_-_Whole_Blood.csv


Downloading NHANES datasets:  92%|█████████▏| 145/158 [01:24<00:10,  1.21it/s]

✅ Saved NHANES_08_2021-08_2023_Demographics_Demographic_Variables_and_Sample_Weights.csv


Downloading NHANES datasets:  93%|█████████▎| 147/158 [01:26<00:11,  1.00s/it]

✅ Saved NHANES_2017-March_2020_Demographics_Demographic_Variables_and_Sample_Weights.csv


Downloading NHANES datasets:  94%|█████████▍| 149/158 [01:28<00:08,  1.02it/s]

✅ Saved NHANES_2017-2018_Demographics_Demographic_Variables_and_Sample_Weights.csv


Downloading NHANES datasets:  95%|█████████▍| 150/158 [01:30<00:09,  1.19s/it]

✅ Saved NHANES_2015-2016_Demographics_Demographic_Variables_and_Sample_Weights.csv


Downloading NHANES datasets:  96%|█████████▌| 151/158 [01:32<00:08,  1.27s/it]

✅ Saved NHANES_2013-2014_Demographics_Demographic_Variables_and_Sample_Weights.csv


Downloading NHANES datasets:  96%|█████████▌| 152/158 [01:33<00:07,  1.28s/it]

✅ Saved NHANES_2011-2012_Demographics_Demographic_Variables_&_Sample_Weights.csv


Downloading NHANES datasets:  97%|█████████▋| 153/158 [01:34<00:06,  1.28s/it]

✅ Saved NHANES_2009-2010_Demographics_Demographic_Variables_&_Sample_Weights.csv
✅ Saved NHANES_2007-2008_Demographics_Demographic_Variables_&_Sample_Weights.csv


Downloading NHANES datasets:  98%|█████████▊| 155/158 [01:37<00:03,  1.31s/it]

✅ Saved NHANES_2005-2006_Demographics_Demographic_Variables_&_Sample_Weights.csv


Downloading NHANES datasets:  99%|█████████▊| 156/158 [01:38<00:02,  1.30s/it]

✅ Saved NHANES_2003-2004_Demographics_Demographic_Variables_&_Sample_Weights.csv
✅ Saved NHANES_2001-2002_Demographics_Demographic_Variables_&_Sample_Weights.csv


Downloading NHANES datasets: 100%|██████████| 158/158 [01:46<00:00,  2.88s/it]

✅ Saved NHANES_1999-2000_Demographics_Demographic_Variables_&_Sample_Weights.csv


Downloading NHANES datasets: 100%|██████████| 158/158 [01:46<00:00,  1.48it/s]


✅ Download complete. Files saved in 'nhanes_data'





In [None]:
import os
import shutil

# Remove the folder of downloaded NHANES CSVs. ignore_errors=True keeps this
# cell from raising when the folder is absent or cannot be fully removed, so
# check the outcome before reporting success.
shutil.rmtree("nhanes_data", ignore_errors=True)
if os.path.exists("nhanes_data"):
    print("⚠️ Could not fully delete 'nhanes_data'; some contents remain.")
else:
    print("✅ Deleted 'nhanes_data' folder and all its contents.")


✅ Deleted 'nhanes_data' folder and all its contents.


In [None]:
import pandas as pd
import os
import re
from tqdm import tqdm

# Folder holding the per-dataset CSVs, and the two files this cell writes.
DATA_DIR = "nhanes_data"
OUTPUT_FILE = "nhanes_patient_flattened.csv"
SUMMARY_FILE = "nhanes_column_summary.csv"

# Bookkeeping columns removed from each non-demographics table before merging.
METADATA_COLS = [
    "Dataset_Name",
    "Doc_URL",
    "Component",
    "NHANES_Cycle",
]

# Filled with "<cycle>: <file>" entries for files that have no SEQN column.
missing_seqn_files = []

# Function to extract NHANES cycle from filename
def extract_cycle(filename):
    """Return the survey cycle encoded in *filename* as ``"YYYY-YYYY"``.

    Two four-digit numbers joined directly by ``-`` or ``_`` are preferred.
    Otherwise the first two four-digit runs found anywhere in the name are
    used, which covers names such as ``2017-March_2020``. If fewer than two
    four-digit runs exist, ``"unknown"`` is returned.
    """
    adjacent_years = re.search(r'(\d{4})[-_](\d{4})', filename)
    if adjacent_years is not None:
        start_year, end_year = adjacent_years.groups()
        return f"{start_year}-{end_year}"

    # Fallback: the two years are separated by other text.
    year_tokens = re.findall(r'\d{4}', filename)
    return f"{year_tokens[0]}-{year_tokens[1]}" if len(year_tokens) >= 2 else "unknown"

# Build one wide, patient-level table: for every NHANES cycle, start from the
# demographics file and left-join every other dataset of that cycle on SEQN,
# then stack the per-cycle tables and write the result plus a column summary.

# 1️⃣ Group datasets by NHANES cycle
cycle_files = {}
# os.listdir returns entries in arbitrary order; sorting makes the merge order
# (and therefore the output column order) reproducible between runs.
for file in sorted(os.listdir(DATA_DIR)):
    if file.endswith(".csv"):
        cycle = extract_cycle(file)
        cycle_files.setdefault(cycle, []).append(file)

# 2️⃣ Process each cycle
flattened_dfs = []
missing_datasets = {}   # cycle -> datasets that could not be used
column_sources = []     # one record per output column: where it came from

for cycle, files in tqdm(cycle_files.items(), desc="Processing NHANES cycles"):
    missing_datasets[cycle] = []

    # Find demographics file
    demog_file = [f for f in files if "Demographics" in f]
    if not demog_file:
        print(f"⚠️ No demographics file found for cycle {cycle}. Skipping cycle.")
        missing_datasets[cycle].append("Demographics")
        continue

    # Read demographics; it is the left-hand base table for this cycle.
    df_cycle = pd.read_csv(os.path.join(DATA_DIR, demog_file[0]), low_memory=False)
    print(f"✅ Demographics loaded for {cycle}: {len(df_cycle)} patients, {len(df_cycle.columns)} columns")

    # Track column origins for demographics
    for col in df_cycle.columns:
        column_sources.append({
            "Column": col,
            "Cycle": cycle,
            "Source_File": demog_file[0]
        })

    # Merge all other datasets
    for f in files:
        if f == demog_file[0]:
            continue

        df_other = pd.read_csv(os.path.join(DATA_DIR, f), low_memory=False)

        if 'SEQN' not in df_other.columns:
            print(f"⚠️ {f} has no SEQN. Skipping merge.")
            missing_datasets[cycle].append(f)
            missing_seqn_files.append(f"{cycle}: {f}")
            continue

        # Drop metadata columns
        df_other_clean = df_other.drop(columns=[c for c in METADATA_COLS if c in df_other.columns])

        # Rename columns to avoid collisions
        rename_map = {col: f"{col}_{os.path.splitext(f)[0]}" for col in df_other_clean.columns if col != 'SEQN'}
        df_other_clean = df_other_clean.rename(columns=rename_map)

        try:
            # validate="many_to_one" makes pandas raise when SEQN repeats in
            # the incoming dataset; without it the left join would silently
            # duplicate patient rows.
            df_cycle = df_cycle.merge(df_other_clean, on="SEQN", how="left", validate="many_to_one")
        except Exception as e:
            print(f"⚠️ Failed to merge {f} into {cycle}: {e}")
            missing_datasets[cycle].append(f)
            continue

        print(f"🔹 Merged {f} into {cycle}: +{len(rename_map)} columns")

        # Track column origins only after the merge succeeded, so the summary
        # never lists columns that are absent from the flattened output.
        for col in df_other_clean.columns:
            if col != 'SEQN':
                column_sources.append({
                    "Column": col,
                    "Cycle": cycle,
                    "Source_File": f
                })

    flattened_dfs.append(df_cycle)

# 3️⃣ Combine all cycles
if not flattened_dfs:
    # pd.concat would raise an opaque "No objects to concatenate" here.
    raise ValueError(
        f"No cycle with a demographics file was found in '{DATA_DIR}'; nothing to flatten."
    )
all_patients_df = pd.concat(flattened_dfs, ignore_index=True, sort=False)

# 4️⃣ Save flattened dataset
all_patients_df.to_csv(OUTPUT_FILE, index=False)
print(f"\n✅ Flattened dataset ready: {OUTPUT_FILE}")
print(f"Shape: {all_patients_df.shape[0]} patients, {all_patients_df.shape[1]} variables")

# 5️⃣ Save column summary
df_summary = pd.DataFrame(column_sources)
df_summary.to_csv(SUMMARY_FILE, index=False)
print(f"📊 Column summary saved: {SUMMARY_FILE}")

# 6️⃣ Report missing datasets
print("\n📋 Missing datasets per cycle:")
for cycle, missing in missing_datasets.items():
    if missing:
        print(f"- {cycle}: {missing}")

# 7️⃣ Report files skipped due to missing SEQN
if missing_seqn_files:
    print("\n⚠️ Files skipped due to missing SEQN:")
    for f in missing_seqn_files:
        print(f"  - {f}")







Processing NHANES cycles:   0%|          | 0/12 [00:00<?, ?it/s]

✅ Demographics loaded for 2017-2018: 9254 patients, 50 columns
🔹 Merged NHANES_2017-2018_Laboratory_Plasma_Fasting_Glucose.csv into 2017-2018: +3 columns
🔹 Merged NHANES_2017-2018_Questionnaire_Blood_Pressure_&_Cholesterol.csv into 2017-2018: +10 columns
🔹 Merged NHANES_2017-2018_Laboratory_Folate_Forms_-_Total_&_Individual_-_Serum.csv into 2017-2018: +15 columns
🔹 Merged NHANES_2017-2018_Laboratory_Cholesterol_-_Total.csv into 2017-2018: +2 columns
🔹 Merged NHANES_2017-2018_Laboratory_Fasting_Questionnaire.csv into 2017-2018: +18 columns
🔹 Merged NHANES_2017-2018_Questionnaire_Diabetes.csv into 2017-2018: +53 columns


Processing NHANES cycles:   8%|▊         | 1/12 [00:00<00:05,  2.15it/s]

🔹 Merged NHANES_2017-2018_Examination_Blood_Pressure.csv into 2017-2018: +20 columns
🔹 Merged NHANES_2017-2018_Laboratory_Cholesterol_-_Low-Density_Lipoproteins_(LDL)_&_Triglycerides.csv into 2017-2018: +9 columns
🔹 Merged NHANES_2017-2018_Laboratory_Glycohemoglobin.csv into 2017-2018: +1 columns
🔹 Merged NHANES_2017-2018_Questionnaire_Weight_History.csv into 2017-2018: +36 columns
🔹 Merged NHANES_2017-2018_Examination_Blood_Pressure_-_Oscillometric_Measurements.csv into 2017-2018: +12 columns
🔹 Merged NHANES_2017-2018_Laboratory_Insulin.csv into 2017-2018: +4 columns
🔹 Merged NHANES_2017-2018_Laboratory_Cholesterol_-_High_-_Density_Lipoprotein_(HDL).csv into 2017-2018: +2 columns
✅ Demographics loaded for 2021-2023: 11933 patients, 31 columns
🔹 Merged NHANES_08_2021-08_2023_Examination_Blood_Pressure_-_Oscillometric_Measurements.csv into 2021-2023: +11 columns
🔹 Merged NHANES_08_2021-08_2023_Laboratory_Glycohemoglobin.csv into 2021-2023: +2 columns
🔹 Merged NHANES_08_2021-08_2023_Ques

Processing NHANES cycles:  17%|█▋        | 2/12 [00:00<00:03,  2.53it/s]

🔹 Merged NHANES_08_2021-08_2023_Laboratory_Cholesterol_–_High-Density_Lipoprotein.csv into 2021-2023: +3 columns
🔹 Merged NHANES_08_2021-08_2023_Laboratory_Serum_Folate_Forms_-_Total_&_Individual_-_Serum.csv into 2021-2023: +15 columns
🔹 Merged NHANES_08_2021-08_2023_Laboratory_Cholesterol_-_Low-Density_Lipoproteins_(LDL)_&_Triglycerides.csv into 2021-2023: +9 columns
✅ Demographics loaded for 2017-2020: 9254 patients, 50 columns
🔹 Merged NHANES_2017-March_2020_Questionnaire_Diabetes.csv into 2017-2020: +53 columns
🔹 Merged NHANES_2017-March_2020_Laboratory_Cholesterol_-_High_-_Density_Lipoprotein_(HDL).csv into 2017-2020: +2 columns
🔹 Merged NHANES_2017-March_2020_Laboratory_Fasting_Questionnaire.csv into 2017-2020: +18 columns
🔹 Merged NHANES_2017-March_2020_Laboratory_Folate_Forms_-_Total_&_Individual_-_Serum.csv into 2017-2020: +15 columns
🔹 Merged NHANES_2017-March_2020_Questionnaire_Weight_History.csv into 2017-2020: +36 columns
🔹 Merged NHANES_2017-March_2020_Laboratory_Glycohem

Processing NHANES cycles:  25%|██▌       | 3/12 [00:01<00:04,  2.13it/s]

🔹 Merged NHANES_2017-March_2020_Laboratory_Cholesterol_-_Total.csv into 2017-2020: +2 columns
🔹 Merged NHANES_2017-March_2020_Examination_Blood_Pressure_-_Oscillometric_Measurements.csv into 2017-2020: +12 columns
🔹 Merged NHANES_2017-March_2020_Examination_Blood_Pressure.csv into 2017-2020: +20 columns
🔹 Merged NHANES_2017-March_2020_Laboratory_Insulin.csv into 2017-2020: +4 columns
🔹 Merged NHANES_2017-March_2020_Questionnaire_Blood_Pressure_&_Cholesterol.csv into 2017-2020: +10 columns
✅ Demographics loaded for 2007-2008: 10149 patients, 47 columns
🔹 Merged NHANES_2007-2008_Laboratory_Glycohemoglobin.csv into 2007-2008: +1 columns
🔹 Merged NHANES_2007-2008_Examination_Blood_Pressure.csv into 2007-2008: +26 columns
🔹 Merged NHANES_2007-2008_Questionnaire_Blood_Pressure_&_Cholesterol.csv into 2007-2008: +18 columns
🔹 Merged NHANES_2007-2008_Laboratory_Cholesterol_-_HDL.csv into 2007-2008: +2 columns
🔹 Merged NHANES_2007-2008_Laboratory_Cholesterol_-_Total.csv into 2007-2008: +2 column

Processing NHANES cycles:  33%|███▎      | 4/12 [00:01<00:03,  2.38it/s]

🔹 Merged NHANES_2007-2008_Laboratory_Oral_Glucose_Tolerance_Test.csv into 2007-2008: +11 columns
🔹 Merged NHANES_2007-2008_Laboratory_Fasting_Questionnaire.csv into 2007-2008: +18 columns
🔹 Merged NHANES_2007-2008_Laboratory_Cholesterol_-_LDL_&_Triglycerides.csv into 2007-2008: +5 columns
✅ Demographics loaded for 2005-2006: 10348 patients, 47 columns
🔹 Merged NHANES_2005-2006_Laboratory_Cholesterol_-_Total.csv into 2005-2006: +2 columns
🔹 Merged NHANES_2005-2006_Examination_Blood_Pressure.csv into 2005-2006: +27 columns
🔹 Merged NHANES_2005-2006_Laboratory_Glycohemoglobin.csv into 2005-2006: +1 columns


Processing NHANES cycles:  42%|████▏     | 5/12 [00:02<00:02,  2.57it/s]

🔹 Merged NHANES_2005-2006_Questionnaire_Diabetes.csv into 2005-2006: +35 columns
🔹 Merged NHANES_2005-2006_Laboratory_Cholesterol_-_LDL,_Triglyceride_&_Apoliprotein_(ApoB).csv into 2005-2006: +7 columns
🔹 Merged NHANES_2005-2006_Laboratory_Cholesterol_-_HDL.csv into 2005-2006: +2 columns
🔹 Merged NHANES_2005-2006_Laboratory_Plasma_Fasting_Glucose_&_Insulin.csv into 2005-2006: +7 columns
🔹 Merged NHANES_2005-2006_Questionnaire_Blood_Pressure_&_Cholesterol.csv into 2005-2006: +15 columns
🔹 Merged NHANES_2005-2006_Laboratory_Oral_Glucose_Tolerance_Test.csv into 2005-2006: +5 columns
🔹 Merged NHANES_2005-2006_Laboratory_Fasting_Questionnaire.csv into 2005-2006: +18 columns
✅ Demographics loaded for 2009-2010: 10537 patients, 47 columns
🔹 Merged NHANES_2009-2010_Laboratory_Glycohemoglobin.csv into 2009-2010: +1 columns
🔹 Merged NHANES_2009-2010_Laboratory_Cholesterol_-_LDL_&_Triglycerides.csv into 2009-2010: +5 columns
🔹 Merged NHANES_2009-2010_Laboratory_Oral_Glucose_Tolerance_Test.csv int

Processing NHANES cycles:  50%|█████     | 6/12 [00:02<00:02,  2.28it/s]

🔹 Merged NHANES_2009-2010_Questionnaire_Diabetes.csv into 2009-2010: +19 columns
✅ Demographics loaded for 2011-2012: 9756 patients, 52 columns
🔹 Merged NHANES_2011-2012_Examination_Blood_Pressure.csv into 2011-2012: +26 columns
🔹 Merged NHANES_2011-2012_Questionnaire_Diabetes.csv into 2011-2012: +52 columns
🔹 Merged NHANES_2011-2012_Laboratory_Glycohemoglobin.csv into 2011-2012: +1 columns
🔹 Merged NHANES_2011-2012_Laboratory_Cholesterol_-_Total.csv into 2011-2012: +2 columns
🔹 Merged NHANES_2011-2012_Laboratory_Folate_Forms_-_Total_&_Individual_-_Serum.csv into 2011-2012: +14 columns
🔹 Merged NHANES_2011-2012_Laboratory_Plasma_Fasting_Glucose_&_Insulin.csv into 2011-2012: +7 columns
🔹 Merged NHANES_2011-2012_Laboratory_Oral_Glucose_Tolerance_Test.csv into 2011-2012: +11 columns
🔹 Merged NHANES_2011-2012_Laboratory_Fatty_Acids_-_Serum.csv into 2011-2012: +61 columns
🔹 Merged NHANES_2011-2012_Laboratory_Cholesterol_-_LDL_&_Triglycerides.csv into 2011-2012: +5 columns
🔹 Merged NHANES_20

Processing NHANES cycles:  58%|█████▊    | 7/12 [00:03<00:02,  1.87it/s]

🔹 Merged NHANES_2011-2012_Laboratory_Cholesterol_-_HDL.csv into 2011-2012: +2 columns
✅ Demographics loaded for 2001-2002: 11039 patients, 41 columns
🔹 Merged NHANES_2001-2002_Laboratory_Glycohemoglobin,_Plasma_Glucose,_Serum_C-peptide,_&_Insulin,_Second_Exam.csv into 2001-2002: +8 columns
🔹 Merged NHANES_2001-2002_Examination_Blood_Pressure.csv into 2001-2002: +29 columns
🔹 Merged NHANES_2001-2002_Laboratory_Glycohemoglobin.csv into 2001-2002: +1 columns
🔹 Merged NHANES_2001-2002_Laboratory_Complete_Blood_Count_with_5-part_Differential_-_Whole_Blood,_Second_Exam.csv into 2001-2002: +21 columns
🔹 Merged NHANES_2001-2002_Laboratory_Cholesterol_-_Total_&_HDL.csv into 2001-2002: +4 columns
🔹 Merged NHANES_2001-2002_Laboratory_Cholesterol_-_LDL_&_Triglycerides.csv into 2001-2002: +6 columns
🔹 Merged NHANES_2001-2002_Questionnaire_Blood_Pressure_&_Cholesterol.csv into 2001-2002: +35 columns
🔹 Merged NHANES_2001-2002_Laboratory_Cholesterol_-_Total,_HDL,_LDL__&_Triglycerides,_Second_Exam.csv 

Processing NHANES cycles:  67%|██████▋   | 8/12 [00:03<00:02,  1.85it/s]

🔹 Merged NHANES_2001-2002_Laboratory_Fasting_Questionnaire.csv into 2001-2002: +18 columns
🔹 Merged NHANES_2001-2002_Laboratory_Plasma_Fasting_Glucose,_Serum_C-peptide_&_Insulin.csv into 2001-2002: +7 columns
🔹 Merged NHANES_2001-2002_Questionnaire_Diabetes.csv into 2001-2002: +16 columns
✅ Demographics loaded for 2003-2004: 10122 patients, 48 columns
🔹 Merged NHANES_2003-2004_Laboratory_Cholesterol_-_Total_&_HDL.csv into 2003-2004: +4 columns
🔹 Merged NHANES_2003-2004_Laboratory_Plasma_Fasting_Glucose,_Serum_C-peptide_&_Insulin.csv into 2003-2004: +6 columns
🔹 Merged NHANES_2003-2004_Questionnaire_Diabetes.csv into 2003-2004: +16 columns
🔹 Merged NHANES_2003-2004_Laboratory_Cholesterol_-_LDL_&_Triglycerides.csv into 2003-2004: +5 columns
🔹 Merged NHANES_2003-2004_Questionnaire_Blood_Pressure_&_Cholesterol.csv into 2003-2004: +35 columns
🔹 Merged NHANES_2003-2004_Laboratory_Glycohemoglobin.csv into 2003-2004: +1 columns


Processing NHANES cycles:  75%|███████▌  | 9/12 [00:04<00:01,  2.00it/s]

🔹 Merged NHANES_2003-2004_Laboratory_Fasting_Questionnaire.csv into 2003-2004: +18 columns
🔹 Merged NHANES_2003-2004_Laboratory_Fatty_Acids_-_Plasma_(Surplus).csv into 2003-2004: +25 columns
✅ Demographics loaded for 2015-2016: 9971 patients, 51 columns
🔹 Merged NHANES_2015-2016_Laboratory_Cholesterol_-_High-Density_Lipoprotein_(HDL).csv into 2015-2016: +2 columns
🔹 Merged NHANES_2015-2016_Laboratory_Cholesterol_-_Total.csv into 2015-2016: +2 columns
🔹 Merged NHANES_2015-2016_Laboratory_Glycohemoglobin.csv into 2015-2016: +1 columns
🔹 Merged NHANES_2015-2016_Laboratory_Oral_Glucose_Tolerance_Test.csv into 2015-2016: +9 columns
🔹 Merged NHANES_2015-2016_Laboratory_Fasting_Questionnaire.csv into 2015-2016: +18 columns
🔹 Merged NHANES_2015-2016_Questionnaire_Diabetes.csv into 2015-2016: +53 columns
🔹 Merged NHANES_2015-2016_Laboratory_Insulin.csv into 2015-2016: +6 columns


Processing NHANES cycles:  83%|████████▎ | 10/12 [00:04<00:01,  1.88it/s]

🔹 Merged NHANES_2015-2016_Examination_Blood_Pressure.csv into 2015-2016: +20 columns
🔹 Merged NHANES_2015-2016_Laboratory_Cholesterol_-_Low_-_Density_Lipoprotein_(LDL)_&_Triglycerides.csv into 2015-2016: +5 columns
🔹 Merged NHANES_2015-2016_Laboratory_Plasma_Fasting_Glucose.csv into 2015-2016: +3 columns
🔹 Merged NHANES_2015-2016_Laboratory_Folate_Forms_-_Total_&_Individual_-_Serum.csv into 2015-2016: +14 columns
🔹 Merged NHANES_2015-2016_Questionnaire_Blood_Pressure_&_Cholesterol.csv into 2015-2016: +10 columns
✅ Demographics loaded for 1999-2000: 9965 patients, 148 columns
🔹 Merged NHANES_1999-2000_Examination_Blood_Pressure.csv into 1999-2000: +29 columns
🔹 Merged NHANES_1999-2000_Questionnaire_Blood_Pressure_&_Cholesterol.csv into 1999-2000: +35 columns
🔹 Merged NHANES_1999-2000_Laboratory_Cholesterol_-_LDL_&_Triglycerides.csv into 1999-2000: +6 columns
🔹 Merged NHANES_1999-2000_Laboratory_Glycohemoglobin.csv into 1999-2000: +1 columns
🔹 Merged NHANES_1999-2000_Laboratory_Trans_Fat

Processing NHANES cycles:  92%|█████████▏| 11/12 [00:06<00:00,  1.31it/s]

🔹 Merged NHANES_1999-2000_Laboratory_Plasma_Fasting_Glucose,_Serum_C-peptide_&_Insulin.csv into 1999-2000: +7 columns
✅ Demographics loaded for 2013-2014: 10175 patients, 51 columns
🔹 Merged NHANES_2013-2014_Laboratory_Plasma_Fasting_Glucose.csv into 2013-2014: +5 columns
🔹 Merged NHANES_2013-2014_Laboratory_Fasting_Questionnaire.csv into 2013-2014: +18 columns
🔹 Merged NHANES_2013-2014_Laboratory_Folate_Forms_-_Total_&_Individual_-_Serum.csv into 2013-2014: +14 columns
🔹 Merged NHANES_2013-2014_Laboratory_Complete_Blood_Count_with_5-part_Differential_-_Whole_Blood.csv into 2013-2014: +20 columns
🔹 Merged NHANES_2013-2014_Laboratory_Cholesterol_-_HDL.csv into 2013-2014: +2 columns
🔹 Merged NHANES_2013-2014_Questionnaire_Blood_Pressure_&_Cholesterol.csv into 2013-2014: +13 columns
🔹 Merged NHANES_2013-2014_Questionnaire_Diabetes.csv into 2013-2014: +53 columns
🔹 Merged NHANES_2013-2014_Laboratory_Cholesterol_-_LDL_&_Triglycerides.csv into 2013-2014: +5 columns
🔹 Merged NHANES_2013-2014_

Processing NHANES cycles: 100%|██████████| 12/12 [00:06<00:00,  1.74it/s]

🔹 Merged NHANES_2013-2014_Laboratory_Cholesterol_-_Total.csv into 2013-2014: +2 columns
🔹 Merged NHANES_2013-2014_Laboratory_Insulin.csv into 2013-2014: +5 columns






✅ Flattened dataset ready: nhanes_patient_flattened.csv
Shape: 122503 patients, 1983 variables
📊 Column summary saved: nhanes_column_summary.csv

📋 Missing datasets per cycle:


In [None]:
!pip install duckdb psutil
import duckdb
import pandas as pd
import time
import os
import re
from datetime import datetime

# ========== CONFIGURATION ==========
# Flattened per-patient CSV to read, and the aggregated CSV this cell writes.
INPUT_FILE = "nhanes_patient_flattened.csv"
OUTPUT_FILE = "nhanes_wide_aggregated.csv"

# ========== SETUP ==========
banner_rule = "=" * 60
started_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

print(banner_rule)
print("🚀 NHANES Wide Format Aggregation (One Row Per Patient)")
print(banner_rule)
print(f"Input:  {INPUT_FILE}")
print(f"Output: {OUTPUT_FILE}")
print(f"Started: {started_at}\n")

# Wall-clock reference used by the later timing reports.
start_time = time.time()

# ========== STEP 1: Check Input File ==========
input_present = os.path.exists(INPUT_FILE)
if not input_present:
    raise FileNotFoundError(f"❌ Input file not found: {INPUT_FILE}")

bytes_per_mb = 1024 ** 2
file_size_mb = os.path.getsize(INPUT_FILE) / bytes_per_mb
print(f"📁 Input file size: {file_size_mb:.2f} MB\n")

# ========== STEP 2: Analyze Column Structure ==========
print("🔍 Analyzing column structure...")
# In-memory DuckDB connection; it stays open for the later steps of this cell.
con = duckdb.connect()

# Only the header is needed here, so fetch a single row and read its columns.
probe_sql = f"""
    SELECT * FROM read_csv('{INPUT_FILE}', AUTO_DETECT=TRUE, SAMPLE_SIZE=1000)
    LIMIT 1
"""
sample = con.execute(probe_sql).df()

total_columns = sample.shape[1]
print(f"✅ Found {total_columns} total columns\n")

# ========== STEP 3: Extract Cycle and Base Variable from Column Names ==========
print("🔨 Extracting base variables and cycle information...")

# A survey cycle embedded in a column name: two four-digit years separated by
# '-' or '_', optionally with one alphabetic word (a month name) in front of
# the second year, e.g. "2017-2018", "2017_2018" or "2017-March_2020".
_CYCLE_PATTERN = re.compile(r'(\d{4})[-_](?:[A-Za-z]+[-_])?(\d{4})')


def parse_column_name(col):
    """
    Split a flattened column name into its base variable and survey cycle.

    Example input: LBXGLU_NHANES_2017-2018_Laboratory_Glucose

    Args:
        col: Column name from the flattened CSV.

    Returns:
        A tuple (base_variable, cycle, full_name):
        - base_variable: text before the first underscore (the whole name
          when there is no underscore).
        - cycle: "<first year>-<last year>" for the first cycle found in the
          name, 'unknown' when no cycle is present, or None for 'SEQN'.
          A month word between the years is dropped, so "2017-March_2020"
          yields "2017-2020".
        - full_name: the input column name, unchanged.
    """
    if col == 'SEQN':
        return ('SEQN', None, 'SEQN')

    cycle_match = _CYCLE_PATTERN.search(col)
    if cycle_match:
        cycle = f"{cycle_match.group(1)}-{cycle_match.group(2)}"
    else:
        cycle = 'unknown'

    # str.partition always yields the leading segment, even without a '_'.
    base_var = col.partition('_')[0]

    return (base_var, cycle, col)

# Parse all columns
# Group every non-SEQN column under its base variable, remembering the full
# column name and the cycle parsed from it.
column_info = {}
for col in sample.columns:
    if col == 'SEQN':
        continue
    base_var, cycle, full_name = parse_column_name(col)
    column_info.setdefault(base_var, []).append({
        'full_name': full_name,
        'cycle': cycle
    })

print(f"✅ Identified {len(column_info)} unique base variables")
print(f"   Example: {list(column_info.keys())[:5]}\n")

# ========== STEP 4: Build Aggregation Query ==========
print("⚙️  Building aggregation query...")
print("   Strategy: Group by SEQN, concatenate values with cycle info\n")


def _sql_identifier(name):
    """Quote `name` as a SQL identifier, doubling any embedded double quote."""
    return '"' + str(name).replace('"', '""') + '"'


def _sql_string(text):
    """Quote `text` as a SQL string literal, doubling any embedded single quote."""
    return "'" + str(text).replace("'", "''") + "'"


# Build one output expression per base variable.
# NOTE: the query below has no GROUP BY; each input row becomes one output row
# and the per-cycle columns of a variable are merged within that row.
# Merged format: "value1 [cycle1], value2 [cycle2]"
agg_statements = []

for base_var, columns in column_info.items():
    alias = _sql_identifier(base_var)
    if len(columns) == 1:
        # Only one column for this variable - just select it under the base name
        agg_statements.append(f"{_sql_identifier(columns[0]['full_name'])} AS {alias}")
        continue

    # Multiple columns - concatenate with cycle info
    # Build: CONCAT_WS(', ',
    #          CASE WHEN col1 IS NOT NULL THEN col1 || ' [cycle1]' END,
    #          CASE WHEN col2 IS NOT NULL THEN col2 || ' [cycle2]' END)
    concat_parts = []
    for col_info in columns:
        ident = _sql_identifier(col_info['full_name'])
        cycle_tag = _sql_string(f" [{col_info['cycle']}]")
        concat_parts.append(
            f'CASE WHEN {ident} IS NOT NULL THEN CAST({ident} AS VARCHAR) || {cycle_tag} END'
        )

    concat_expr = f"CONCAT_WS(', ', {', '.join(concat_parts)})"
    agg_statements.append(f'{concat_expr} AS {alias}')

# Build final SELECT statement
select_clause = "SEQN, " + ", ".join(agg_statements)

# SAMPLE_SIZE=-1 makes DuckDB sniff column types from the whole file.
# With a partial sample, a value that does not fit the sniffed type is a
# parse error, and IGNORE_ERRORS=TRUE would then drop that row silently.
query = f"""
    COPY (
        SELECT {select_clause}
        FROM read_csv({_sql_string(INPUT_FILE)},
                     AUTO_DETECT=TRUE,
                     SAMPLE_SIZE=-1,
                     IGNORE_ERRORS=TRUE)
        ORDER BY SEQN
    ) TO {_sql_string(OUTPUT_FILE)} (HEADER, DELIMITER ',')
"""

# ========== STEP 5: Execute Transformation ==========
print("⚙️  Executing transformation...")
print("    (This may take 10-30 minutes depending on dataset size)")
print("    Processing all patients into single-row format...\n")

try:
    con.execute(query)
    transform_time = time.time() - start_time
    print(f"✅ Transformation complete in {transform_time/60:.2f} minutes\n")

    # ========== STEP 6: Get Statistics ==========
    print("📊 Analyzing results...")

    # Only a handful of rows are needed for the column count and the samples.
    result_df = pd.read_csv(OUTPUT_FILE, nrows=5)

    print(f"   Output columns: {len(result_df.columns)}")
    print("   (One column per unique base variable)\n")

    output_size_mb = os.path.getsize(OUTPUT_FILE) / (1024 ** 2)
    print(f"💾 Output file size: {output_size_mb:.2f} MB\n")

    # ========== STEP 7: Show Sample Data ==========
    if result_df.empty:
        # Guard: indexing row 0 of an empty frame would raise IndexError.
        print("🔬 Output contains no patient rows; nothing to sample.")
    else:
        print("🔬 Sample of first patient (first 5 columns):")
        print(result_df.iloc[0, :5].to_string())
        print("\n📋 Example of multi-cycle values:")
        # Merged cells are written as "value [cycle]" (joined by ', ' when a
        # row has several), so look for the bracketed tag rather than a
        # comma, and scan every loaded row and column instead of only row 0.
        cycle_tag = re.compile(r' \[[^\]]+\]')
        example = next(
            (
                (col, val)
                for col in result_df.columns[1:]
                for val in result_df[col]
                if isinstance(val, str) and cycle_tag.search(val)
            ),
            None,
        )
        if example is not None:
            col, val = example
            print(f"   {col}: {val[:100]}...")
    print()

    # ========== STEP 8: Memory Usage Report ==========
    import psutil
    process = psutil.Process()
    # rss is the resident set size at this moment, not a high-water mark.
    memory_mb = process.memory_info().rss / (1024 ** 2)
    print(f"💾 Current memory usage (RSS): {memory_mb:.2f} MB")

except Exception as e:
    # Report the failure with its traceback, then re-raise.
    print(f"❌ Error during transformation: {e}")
    import traceback
    traceback.print_exc()
    raise

finally:
    # Release the DuckDB connection whether or not the transformation succeeded.
    con.close()

# ========== SUMMARY ==========
total_time = time.time() - start_time
print("\n" + "=" * 60)
print("✅ TRANSFORMATION COMPLETE!")
print("=" * 60)
print(f"⏱️  Total time: {total_time/60:.2f} minutes")
print(f"📁 Output file: {OUTPUT_FILE}")
print("📊 Format: One row per patient (SEQN)")
print(f"📋 Columns: {len(result_df.columns)} unique variables")
print("💡 Multi-cycle values formatted as: value1 [2017-2018], value2 [2019-2020]")
print("=" * 60)

print("\n💡 Excel Tips:")
print("   - This format is Excel-friendly (manageable column count)")
print("   - Each cell contains all values for that patient/variable")
print("   - Cycle info preserved in brackets: [2017-2018]")
print("   - AI can parse: 'value [cycle], value [cycle]' format")






🚀 NHANES Wide Format Aggregation (One Row Per Patient)
Input:  nhanes_patient_flattened.csv
Output: nhanes_wide_aggregated.csv
Started: 2025-10-20 17:42:03

📁 Input file size: 304.39 MB

🔍 Analyzing column structure...
✅ Found 1983 total columns

🔨 Extracting base variables and cycle information...
✅ Identified 607 unique base variables
   Example: ['SDDSRVYR', 'RIDSTATR', 'RIAGENDR', 'RIDAGEYR', 'RIDAGEMN']

⚙️  Building aggregation query...
   Strategy: Group by SEQN, concatenate values with cycle info

⚙️  Executing transformation...
    (This may take 10-30 minutes depending on dataset size)
    Processing all patients into single-row format...



FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

✅ Transformation complete in 0.56 minutes

📊 Analyzing results...
   Output columns: 608
   (One column per unique base variable)

💾 Output file size: 279.89 MB

🔬 Sample of first patient (first 5 columns):
SEQN        1.0
SDDSRVYR    1.0
RIDSTATR    2.0
RIAGENDR    2.0
RIDAGEYR    2.0

📋 Example of multi-cycle values:

💾 Peak memory usage: 4828.82 MB

✅ TRANSFORMATION COMPLETE!
⏱️  Total time: 0.57 minutes
📁 Output file: nhanes_wide_aggregated.csv
📊 Format: One row per patient (SEQN)
📋 Columns: 608 unique variables
💡 Multi-cycle values formatted as: value1 [2017-2018], value2 [2019-2020]

💡 Excel Tips:
   - This format is Excel-friendly (manageable column count)
   - Each cell contains all values for that patient/variable
   - Cycle info preserved in brackets: [2017-2018]
   - AI can parse: 'value [cycle], value [cycle]' format


In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import re
from tqdm import tqdm
import json
import os

# Configuration
WIDE_FILE = "nhanes_wide_aggregated.csv"     # aggregated CSV whose columns need descriptions
OUTPUT_DICT = "nhanes_auto_dictionary.csv"   # dictionary CSV written at the end of this cell
CACHE_FILE = "nhanes_scrape_cache.json"      # JSON cache of scraped pages, keyed by URL
DATA_DIR = "nhanes_data"                     # directory scanned for per-dataset CSV files

print("=" * 70)
print("SIMPLE NHANES DICTIONARY SCRAPER")
print("=" * 70)

# Step 1: Get list of variables we need
print("\nStep 1: Loading variable list from wide file...")
# nrows=0 reads only the header row; the column names are all that is needed.
wide_df = pd.read_csv(WIDE_FILE, nrows=0)
# The base variable is the text before the first underscore
# (str.split returns the whole name when there is no underscore).
needed_vars = {col.split('_')[0] for col in wide_df.columns}

print(f"   Need definitions for {len(needed_vars)} variables")
print()

# Step 2: Find URLs
print("Step 2: Finding documentation URLs...")
doc_urls = []

if os.path.isdir(DATA_DIR):
    for csv_file in os.listdir(DATA_DIR):
        if not csv_file.endswith('.csv'):
            continue
        try:
            df = pd.read_csv(os.path.join(DATA_DIR, csv_file), nrows=1)
        except (OSError, ValueError) as exc:
            # pandas parser/empty-data errors and decode errors are ValueErrors.
            # Skip the file but say why, instead of swallowing every exception.
            print(f"   Skipping unreadable file {csv_file}: {exc}")
            continue

        # Use the first documentation column that is present and non-empty.
        for col in ['Doc', 'Doc_URL', 'doc']:
            if col in df.columns and not df[col].isna().all():
                url = df[col].iloc[0]
                if url and str(url).startswith('http') and url not in doc_urls:
                    doc_urls.append(url)
                break

print(f"   Found {len(doc_urls)} URLs")
print()

# Step 3: Load cache
cache = {}
try:
    with open(CACHE_FILE, 'r') as f:
        loaded_cache = json.load(f)
except FileNotFoundError:
    print("No cache found")
except (OSError, ValueError) as exc:
    # Unreadable or malformed cache file (JSONDecodeError is a ValueError).
    print(f"No cache found (could not read {CACHE_FILE}: {exc})")
else:
    if isinstance(loaded_cache, dict):
        cache = loaded_cache
        print(f"Loaded cache with {len(cache)} pages")
    else:
        # The scraper indexes the cache by URL, so anything but a dict is unusable.
        print(f"No cache found ({CACHE_FILE} does not contain a JSON object)")

print()

# Step 4: Simple scraping approach
# "VARNAME - Description" lines (TOC entries or headings).
_HEADING_RE = re.compile(r'^([A-Z][A-Z0-9_]{2,15})\s*[-–—]\s*(.+)$')
# "Variable Name: VARNAME" with the name on the same line as the label.
_VARIABLE_NAME_RE = re.compile(r'Variable Name:\s*([A-Z][A-Z0-9_]+)', re.IGNORECASE)
# A line that consists of nothing but a variable name.
_BARE_NAME_RE = re.compile(r'[A-Z][A-Z0-9_]+', re.IGNORECASE)
# Description labels and the pattern that extracts the text following each.
_FIELD_VALUE_RES = {
    'English Text:': re.compile(r'English Text:\s*(.+)', re.IGNORECASE),
    'SAS Label:': re.compile(r'SAS Label:\s*(.+)', re.IGNORECASE),
}
# Labels that mark the start of a field; such a line is never taken as a value.
_FIELD_LABELS = ('Variable Name:', 'SAS Label:', 'English Text:', 'Target:')
# How many lines after "Variable Name:" are searched for a description.
_LOOKAHEAD_LINES = 9


def _field_value(lines, idx, label):
    """Return the text after `label` at lines[idx], or the next line if the label stands alone."""
    same_line = _FIELD_VALUE_RES[label].search(lines[idx])
    if same_line:
        return same_line.group(1).strip()

    following = idx + 1
    if lines[idx].endswith(label) and following < len(lines):
        candidate = lines[following]
        # A line that is itself a label means this field has no value.
        is_label = candidate.endswith(':') or any(
            other in candidate for other in _FIELD_LABELS
        )
        if not is_label:
            return candidate
    return None


def _variable_name(lines, idx):
    """Return the name announced by a "Variable Name:" line at lines[idx], or None."""
    same_line = _VARIABLE_NAME_RE.search(lines[idx])
    if same_line:
        return same_line.group(1).strip()

    following = idx + 1
    if lines[idx].endswith('Variable Name:') and following < len(lines):
        bare = _BARE_NAME_RE.fullmatch(lines[following])
        if bare:
            return bare.group(0)
    return None


def _parse_variable_lines(lines, url):
    """Build {variable: {'description', 'source_url'}} from stripped, non-empty page lines."""
    variables = {}

    for i, line in enumerate(lines):
        # Pattern 1: "VARNAME - Description" (in TOC or heading).
        # The first heading seen for a variable wins.
        heading = _HEADING_RE.match(line)
        if heading:
            var_name = heading.group(1).strip()
            if var_name not in variables:
                variables[var_name] = {
                    'description': heading.group(2).strip(),
                    'source_url': url
                }

        # Pattern 2: "Variable Name: VARNAME" followed by description
        if 'Variable Name:' not in line:
            continue
        var_name = _variable_name(lines, i)
        if not var_name:
            continue

        var_desc = None
        stop = min(i + 1 + _LOOKAHEAD_LINES, len(lines))
        for j in range(i + 1, stop):
            next_line = lines[j]

            # The next variable starts here; anything beyond belongs to it,
            # so never borrow its description for the current variable.
            if 'Variable Name:' in next_line:
                break

            # Get English Text (preferred)
            if 'English Text:' in next_line:
                text = _field_value(lines, j, 'English Text:')
                if text:
                    var_desc = text
                    break

            # Get SAS Label (fallback); keep looking for English Text
            if not var_desc and 'SAS Label:' in next_line:
                label_text = _field_value(lines, j, 'SAS Label:')
                if label_text:
                    var_desc = label_text

        if var_desc:
            # Overrides any Pattern 1 entry recorded for the same variable.
            variables[var_name] = {
                'description': var_desc,
                'source_url': url
            }

    return variables


def simple_scrape(url):
    """Fetch a documentation page and extract variable descriptions from its text.

    Results are memoised in the module-level `cache` dict, keyed by URL; a
    cached URL is returned without any network access. Failed fetches are not
    cached.

    Args:
        url: Address of the documentation page.

    Returns:
        Dict mapping variable name to {'description': ..., 'source_url': url}.
        An empty dict is returned (after printing the error) when fetching or
        parsing raises.
    """

    if url in cache:
        return cache[url]

    try:
        # Pause before each uncached request.
        time.sleep(1)
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Get all text from the page, as stripped non-empty lines
        page_text = soup.get_text()
        lines = [line.strip() for line in page_text.split('\n') if line.strip()]

        variables = _parse_variable_lines(lines, url)

        cache[url] = variables
        return variables

    except Exception as e:
        # Best effort per URL: report the problem and let the caller continue.
        print(f"Error: {url} - {e}")
        return {}

# Step 5: Scrape all URLs
print("Step 3: Scraping (simple text extraction)...")
all_variables = {}

# Later pages overwrite earlier ones when they define the same variable.
for doc_url in tqdm(doc_urls, desc="Scraping"):
    all_variables.update(simple_scrape(doc_url))

# Save cache
with open(CACHE_FILE, 'w') as cache_fh:
    json.dump(cache, cache_fh, indent=2)

print(f"\nFound {len(all_variables)} variable definitions")
print()

# Step 6: Map to columns
print("Step 4: Creating dictionary...")
records = []

for column in wide_df.columns:
    # Base variable = text before the first underscore (whole name if none).
    base = column.partition('_')[0]

    try:
        scraped = all_variables[base]
    except KeyError:
        # Nothing scraped for this variable: emit a placeholder description.
        entry = {
            'column_name': column,
            'base_variable': base,
            'description': f'NHANES variable {base}',
            'source_url': '',
            'data_source': 'Auto-generated'
        }
    else:
        entry = {
            'column_name': column,
            'base_variable': base,
            'description': scraped['description'],
            'source_url': scraped['source_url'],
            'data_source': 'CDC Documentation'
        }
    records.append(entry)

dict_df = pd.DataFrame(records)

# Save
dict_df.to_csv(OUTPUT_DICT, index=False)

total_columns = len(dict_df)
print(f"Saved: {OUTPUT_DICT}")
print(f"   Total: {total_columns} columns")

# Rows whose description came from a scraped page.
from_cdc = dict_df[dict_df['data_source'] == 'CDC Documentation']
print(f"   From CDC: {len(from_cdc)}")
print(f"   Coverage: {len(from_cdc) / total_columns * 100:.1f}%")
print()

# Show samples
print("Sample descriptions:")
print("-" * 70)
samples = from_cdc.head(15)
for name, text in zip(samples['base_variable'], samples['description']):
    print(f"{name:12} : {text[:80]}")

closing_rule = "=" * 70
print("\n" + closing_rule)
print("DONE!")
print(closing_rule)

SIMPLE NHANES DICTIONARY SCRAPER

Step 1: Loading variable list from wide file...
   Need definitions for 608 variables

Step 2: Finding documentation URLs...
   Found 134 URLs

Loaded cache with 134 pages

Step 3: Scraping (simple text extraction)...


Scraping: 100%|██████████| 134/134 [00:00<00:00, 54329.31it/s]


Found 605 variable definitions

Step 4: Creating dictionary...
Saved: nhanes_auto_dictionary.csv
   Total: 608 columns
   From CDC: 1
   Coverage: 0.2%

Sample descriptions:
----------------------------------------------------------------------
DMDEDUC      : Respondent sequence number.Target:Both males and females 0 YEARS -

DONE!



