<a href="https://colab.research.google.com/github/ConstructoDestructo/Diabetes_AI_Instrument/blob/main/BRFSS_Extractor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
"""
BRFSS DATA EXTRACTOR - FINAL COMPLETE VERSION
==============================================
Extracts BRFSS data (2011-2024) with complete documentation parsing.

Features:
- Downloads BRFSS data for any year
- Parses HTML codebooks (2019, 2022-2024)
- Parses PDF codebooks (2011-2018, 2020-2021)
- Outputs NHANES-format dictionary with real descriptions
- Extracts value codes (1=Yes, 2=No, etc.)
- Handles duplicates automatically
- Year-at-a-time processing to avoid RAM issues

Usage:
    # Extract 2019 data
    extractor = BRFSSExtractor(start_year=2019, end_year=2019,
                               output_dir='brfss_2019')
    data, dictionary, value_codes = extractor.extract_all()

    # With manual codebook (if CDC blocks automated download)
    extractor.extract_all(codebook_file='codeBook2019.html')
"""

import pandas as pd
import requests
from bs4 import BeautifulSoup
import os
import re
import time
from tqdm import tqdm
from urllib.parse import urljoin
import zipfile
import io
import warnings
warnings.filterwarnings('ignore')

try:
    import pdfplumber
except ImportError:
    # Install through the interpreter that is running this code. A bare `pip`
    # found on PATH can belong to a different Python environment, in which
    # case the retry import below would still fail.
    print("⚠ pdfplumber not installed. Installing...")
    import subprocess
    import sys
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'pdfplumber', '-q'])
    import pdfplumber


class BRFSSExtractor:
    """Extract BRFSS data with complete documentation."""

    def __init__(self, start_year=2019, end_year=2019, output_dir='extracted_brfss'):
        self.base_url = "https://www.cdc.gov/brfss/"
        self.start_year = start_year
        self.end_year = end_year
        self.output_dir = output_dir

        os.makedirs(output_dir, exist_ok=True)

        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        }

        print("=" * 80)
        print("🔍 BRFSS DATA EXTRACTOR - FINAL COMPLETE VERSION")
        print("=" * 80)
        print(f"Years: {start_year}-{end_year}")
        print(f"Output: {output_dir}")
        print()

    def clean_text(self, text):
        """
        Repair common mis-decoded character sequences and trim whitespace.

        The replacements target text whose UTF-8 bytes were decoded with a
        single-byte Western encoding (for example an em dash showing up as a
        three-character sequence starting with 'â€').

        Args:
            text: Value to clean.

        Returns:
            The cleaned, stripped string; non-string input is returned unchanged.
        """
        if not isinstance(text, str):
            return text

        # Ordered (bad, good) pairs. Order matters: every three-character
        # sequence that starts with 'â€' has to be handled before the bare
        # two-character 'â€' rule, which would otherwise consume its prefix.
        replacements = (
            # Dashes
            ('Â—', '—'),
            ('\u00e2\u20ac\u201d', '—'),  # UTF-8 em dash decoded as Windows-1252
            ('\u00e2\u20ac\u201c', '-'),  # UTF-8 en dash decoded as Windows-1252
            ('â€"', '-'),                 # same prefix followed by a plain ASCII quote

            # Quotes
            ('â€™', "'"),
            ('â€˜', "'"),
            ('â€œ', '"'),
            ('â€', '"'),

            # Spaces
            ('\u00c2\u00a0', ' '),        # UTF-8 no-break space decoded as Latin-1
            ('Â ', ' '),
            ('\xa0', ' '),

            # Accented letters
            ('Ã©', 'é'),
            ('Ã¨', 'è'),
            ('Ã¡', 'á'),
            ('Ã³', 'ó'),
            ('Ã±', 'ñ'),
        )

        for bad, good in replacements:
            text = text.replace(bad, good)

        return text.strip()

    def discover_all_datasets(self):
        """
        Discover BRFSS datasets by year.

        For every year from ``self.start_year`` through ``self.end_year`` the
        annual-data page is requested (the ``.html`` URL first, then ``.htm``)
        and its links are scanned for the SAS Transport ZIP and the codebook.

        Returns:
            list[dict]: One entry per year that exposes a data ZIP, with keys
            ``name``, ``year``, ``data_url``, ``doc_url`` (``None`` when no
            codebook link was found) and ``year_page``. Years whose page could
            not be fetched, or that have no data link, are left out.
        """
        print("STEP 1: Discovering datasets...")
        print("-" * 80)

        catalog = []

        for year in tqdm(range(self.start_year, self.end_year + 1), desc="Years"):
            # Try both .html (2012+) and .htm (1999-2011)
            year_urls = [
                f"{self.base_url}annual_data/annual_{year}.html",
                f"{self.base_url}annual_data/annual_{year}.htm"
            ]

            time.sleep(1)  # Rate limiting

            response = None
            year_url = None

            # Use the first URL pattern that answers with HTTP 200.
            for url in year_urls:
                try:
                    resp = requests.get(url, headers=self.headers, timeout=30)
                except requests.RequestException:
                    # Network-level failure for this candidate only; a bare
                    # `except` here would also swallow Ctrl-C.
                    continue
                if resp.status_code == 200:
                    response = resp
                    year_url = url
                    break

            if response is None:
                continue

            try:
                soup = BeautifulSoup(response.text, 'html.parser')

                data_url = None
                doc_url = None

                # When several links qualify, the last one on the page wins.
                for link in soup.find_all('a', href=True):
                    href = link['href']
                    href_lower = href.lower()
                    text = link.get_text(strip=True).lower()

                    if 'sas transport' in text and '.zip' in href_lower:
                        data_url = urljoin(year_url, href)

                    if 'codebook' in text and any(
                        ext in href_lower for ext in ('.html', '.pdf', '.zip')
                    ):
                        doc_url = urljoin(year_url, href)

                if data_url:
                    catalog.append({
                        'name': f"BRFSS {year}",
                        'year': year,
                        'data_url': data_url,
                        'doc_url': doc_url,
                        'year_page': year_url
                    })

            except Exception as e:
                # Best effort: a page that cannot be parsed only costs that year.
                print(f"  ⚠ Error for {year}: {str(e)}")

        print(f"✓ Found {len(catalog)} datasets")
        return catalog

    def download_dataset(self, dataset):
        """
        Download one year's SAS Transport archive and load it into a DataFrame.

        Args:
            dataset: Catalog entry providing at least ``year``, ``name`` and
                ``data_url``.

        Returns:
            dict | None: ``{'name', 'year', 'data', 'metadata'}`` on success.
            ``None`` when the request fails, the server answers with an HTTP
            error status, the payload is not a readable ZIP, or the archive
            contains no ``.xpt`` member. Failures are printed, not raised.
        """
        print(f"\nSTEP 2: Downloading {dataset['year']} data...")
        print("-" * 80)

        try:
            year = dataset['year']
            url = dataset['data_url']

            print(f"  Downloading from: {url}")
            print("  (This may take a while, ~100-200MB)")

            response = requests.get(url, headers=self.headers, timeout=300)
            # Report HTTP failures as such; without this an error page would be
            # handed to zipfile and show up as a misleading "not a zip file".
            response.raise_for_status()

            with zipfile.ZipFile(io.BytesIO(response.content)) as z:
                # strip() because member names may carry stray whitespace.
                xpt_files = [f for f in z.namelist() if f.strip().lower().endswith('.xpt')]

                if not xpt_files:
                    print("  ⚠ No .XPT file in archive")
                    return None

                # Only the first transport file in the archive is read.
                xpt_filename = xpt_files[0]
                with z.open(xpt_filename) as xpt_file:
                    df = pd.read_sas(xpt_file, format='xport')

            print(f"  ✓ Loaded: {len(df):,} records × {len(df.columns)} variables")

            return {
                'name': dataset['name'],
                'year': year,
                'data': df,
                'metadata': dataset
            }

        except Exception as e:
            print(f"  ⚠ Download error: {str(e)}")
            return None

    def parse_html_codebook(self, html_content, year):
        """
        Parse HTML BRFSS codebook (2019, 2022-2024).

        Every ``<table>`` whose text mentions "SAS Variable" is treated as one
        variable: the first cell of its first row supplies the variable name,
        label and section name, and rows from the third onward supply the
        value codes.

        Args:
            html_content: Codebook HTML as a string.
            year: Survey year, recorded in the ``source`` column.

        Returns:
            tuple[pandas.DataFrame, pandas.DataFrame]: ``(dictionary, codes)``.
            ``dictionary`` has one row per variable; ``codes`` has one row per
            (variable, code) pair. Both keep their column layout when nothing
            was parsed. When a variable appears more than once, the last
            occurrence wins in both frames.
        """
        print("  Parsing HTML codebook...")

        soup = BeautifulSoup(html_content, 'html.parser')

        variables_dict = {}
        # Codes are keyed by variable so that a repeated variable replaces its
        # earlier codes, mirroring how its dictionary row is replaced.
        codes_by_var = {}

        for table in soup.find_all('table'):
            table_text = table.get_text().replace('\xa0', ' ')
            if 'SAS Variable' not in table_text:
                continue

            rows = table.find_all('tr')
            if len(rows) < 2:
                continue

            metadata_cell = rows[0].find(['td', 'th'])
            if not metadata_cell:
                continue

            metadata_text = metadata_cell.get_text().replace('\xa0', ' ')

            # get_text() joins the fields without separators, so the name runs
            # straight into the next caption; the lazy match plus lookahead
            # stops right before "Question".
            var_match = re.search(r'SAS Variable Name:\s*([A-Z_][A-Z0-9_]*?)(?=Question)', metadata_text)
            if not var_match:
                continue

            var_name = var_match.group(1)

            # Label (description)
            label_match = re.search(r'Label:\s*(.+?)Section Name', metadata_text)
            if label_match:
                description = label_match.group(1).strip()
                description = re.sub(r'\s+', ' ', description)
                description = self.clean_text(description)
            else:
                description = f"Variable {var_name}"

            # Section name
            section_match = re.search(r'Section Name:\s*(.+?)(?=Section Number)', metadata_text)
            if section_match:
                section_name = section_match.group(1).strip()
                section_name = re.sub(r'\s+', ' ', section_name)
                section_name = self.clean_text(section_name)
            else:
                section_name = "Unknown"

            # Value codes: the first two rows (metadata, column header) are skipped.
            value_codes = {}
            for row in rows[2:]:
                cells = row.find_all(['td', 'th'])
                if len(cells) < 2:
                    continue

                code_value = cells[0].get_text(strip=True).replace('\xa0', ' ')
                code_label = cells[1].get_text(strip=True).replace('\xa0', ' ')

                # Drop skip-pattern instructions and notes appended to the label.
                code_label = re.sub(r'Go to Section.*$', '', code_label).strip()
                code_label = re.sub(r'Notes:.*$', '', code_label).strip()
                code_label = self.clean_text(code_label)

                if code_value and code_label and len(code_label) < 200:
                    value_codes[code_value] = code_label

            has_codes = len(value_codes) > 0
            value_codes_str = ';'.join([f"{code}={label}" for code, label in value_codes.items()]) if has_codes else ""

            variables_dict[var_name] = {
                'column': var_name,
                'base_variable': var_name,
                'description': description,
                'section_name': section_name,
                'has_codes': has_codes,
                'value_codes': value_codes_str,
                'source': f'BRFSS {year} - HTML Codebook'
            }
            codes_by_var[var_name] = value_codes

        codes_data = [
            {'variable': variable, 'code': code, 'label': label}
            for variable, codes in codes_by_var.items()
            for code, label in codes.items()
        ]

        # Explicit columns keep the layout stable even when nothing was parsed.
        dict_df = pd.DataFrame(
            list(variables_dict.values()),
            columns=['column', 'base_variable', 'description', 'section_name',
                     'has_codes', 'value_codes', 'source'],
        )
        codes_df = pd.DataFrame(codes_data, columns=['variable', 'code', 'label'])

        print(f"    ✓ HTML: {len(dict_df)} variables, {len(codes_df)} value codes")

        return dict_df, codes_df

    def parse_pdf_codebook(self, pdf_content, year):
        """
        Extract the text of a PDF codebook and hand it to the matching parser.

        Args:
            pdf_content: Raw bytes of the PDF file.
            year: Survey year; used as a tie-breaker when choosing the layout.

        Returns:
            Whatever the selected layout parser returns (dictionary frame,
            value-code frame).
        """
        print("  Parsing PDF codebook...")

        # Collect per-page text, then join once; pages without text are skipped.
        with pdfplumber.open(io.BytesIO(pdf_content)) as pdf:
            page_texts = [page.extract_text() for page in pdf.pages]
        full_text = "".join(f"{page_text}\n" for page_text in page_texts if page_text)

        print(f"    PDF: Extracted {len(full_text):,} characters")

        # New layout (2015-2021) carries both "Label:" and "Section Name:".
        if 'Label:' in full_text and 'Section Name:' in full_text:
            print("    Detected: New PDF format (2015+)")
            return self._parse_new_pdf_format(full_text, year)

        # Neither an old-layout marker nor an old year: default to the new parser.
        looks_old = 'Description:' in full_text or year < 2015
        if not looks_old:
            return self._parse_new_pdf_format(full_text, year)

        # Old layout (1999-2014) uses "Description:" instead of "Label:".
        print("    Detected: Old PDF format (1999-2014)")
        return self._parse_old_pdf_format(full_text, year)

    def _parse_new_pdf_format(self, full_text, year):
        """Parse new PDF format (2015-2021) - has 'Label:' and 'Section Name:'"""
        variables_dict = {}
        codes_data = []

        # Split by "Label:" (Label comes BEFORE Variable Name)
        sections = re.split(r'(?=Label:)', full_text)

        for section in sections:
            if 'Label:' not in section or 'SAS Variable Name:' not in section:
                continue

            # Extract variable name
            var_match = re.search(r'SAS Variable Name:\s*([A-Z_][A-Z0-9_]+)', section)
            if not var_match:
                continue

            var_name = var_match.group(1)

            # Check for Q/S/C/N/P suffix
            if len(var_name) > 3 and var_name[-1] in ['Q', 'S', 'C', 'N', 'P']:
                next_chars = section[var_match.end():var_match.end()+15]
                if any(word in next_chars for word in ['Question', 'Section', 'Column', 'Notes', 'Prologue']):
                    var_name = var_name[:-1]

            # Extract Label
            label_match = re.search(r'Label:\s*([^\n]+)', section)
            if label_match:
                description = label_match.group(1).strip()
                description = re.sub(r'Section Name:.*', '', description).strip()
                description = re.sub(r'\s+', ' ', description)
                description = self.clean_text(description)
            else:
                description = f"Variable {var_name}"

            # Extract Section Name
            section_match = re.search(r'Section Name:\s*([^\n]+)', section)
            if section_match:
                section_name = section_match.group(1).strip()
                section_name = re.sub(r'Section Number:.*', '', section_name).strip()
                section_name = re.sub(r'Core Section.*', '', section_name).strip()
                section_name = re.sub(r'Module Number:.*', '', section_name).strip()
                section_name = re.sub(r'\s+', ' ', section_name)
                section_name = self.clean_text(section_name)
            else:
                section_name = "Unknown"

            # Extract value codes
            value_codes = self._extract_value_codes_from_section(section)

            # Store
            has_codes = len(value_codes) > 0
            value_codes_str = ';'.join([f"{code}={label}" for code, label in value_codes.items()]) if has_codes else ""

            variables_dict[var_name] = {
                'column': var_name,
                'base_variable': var_name,
                'description': description,
                'section_name': section_name,
                'has_codes': has_codes,
                'value_codes': value_codes_str,
                'source': f'BRFSS {year} - PDF Codebook'
            }

            for code, label in value_codes.items():
                codes_data.append({
                    'variable': var_name,
                    'code': code,
                    'label': label
                })

        dict_df = pd.DataFrame(list(variables_dict.values()))
        codes_df = pd.DataFrame(codes_data)

        print(f"    ✓ PDF (new format): {len(dict_df)} variables, {len(codes_df)} value codes")

        return dict_df, codes_df

    def _parse_old_pdf_format(self, full_text, year):
        """Parse old PDF format (1999-2014) - description comes BEFORE 'Section:' marker"""
        variables_dict = {}
        codes_data = []

        # For old format, we need to find: [Description]\nSection:...\nSAS Variable Name:
        # Split by "Section:" to get each variable block
        sections = re.split(r'(?=Section:)', full_text)

        for section in sections:
            if 'SAS Variable Name:' not in section or 'Section:' not in section:
                continue

            # Extract variable name
            var_match = re.search(r'SAS Variable Name:\s*([A-Z_][A-Z0-9_]+)', section)
            if not var_match:
                continue

            var_name = var_match.group(1)

            # Extract description - it's BEFORE "Section:" in this section
            # Get text before "Section:"
            before_section = section.split('Section:')[0]

            # Description is typically the last non-empty line before "Section:"
            lines = [line.strip() for line in before_section.split('\n') if line.strip()]

            description = None
            if lines:
                # Get the last line before Section: that looks like a description
                for line in reversed(lines):
                    # Skip lines that are just numbers, page markers, or headers
                    if (len(line) > 3 and
                        not re.match(r'^\d+$', line) and
                        not re.match(r'^Page \d+', line) and
                        not re.match(r'^\d{4} BRFSS', line) and
                        not re.match(r'^Weighted$', line, re.IGNORECASE) and
                        not re.match(r'^Value\s+Value Label', line, re.IGNORECASE)):
                        description = line
                        break

            # Try alternate method: "Description:" field (for 2005+ old PDFs)
            if not description:
                desc_match = re.search(r'Description:\s*([^\n]+)', section)
                if desc_match:
                    description = desc_match.group(1).strip()

            # Fallback
            if not description or len(description) < 3:
                description = f"Variable {var_name}"
            else:
                description = re.sub(r'\s+', ' ', description)
                description = self.clean_text(description)

            # Extract Section name (format: "Section: 0.1 Record Identification")
            section_match = re.search(r'Section:\s*([^\n]+?)(?=Type:|$)', section)
            if section_match:
                section_text = section_match.group(1).strip()
                # Remove section number (e.g., "0.1")
                section_name = re.sub(r'^\d+(\.\d+)?\s+', '', section_text).strip()
                section_name = self.clean_text(section_name)
            else:
                section_name = "Unknown"

            # Extract value codes
            value_codes = self._extract_value_codes_from_section(section)

            # Store
            has_codes = len(value_codes) > 0
            value_codes_str = ';'.join([f"{code}={label}" for code, label in value_codes.items()]) if has_codes else ""

            variables_dict[var_name] = {
                'column': var_name,
                'base_variable': var_name,
                'description': description,
                'section_name': section_name,
                'has_codes': has_codes,
                'value_codes': value_codes_str,
                'source': f'BRFSS {year} - PDF Codebook (Old Format)'
            }

            for code, label in value_codes.items():
                codes_data.append({
                    'variable': var_name,
                    'code': code,
                    'label': label
                })

        dict_df = pd.DataFrame(list(variables_dict.values()))
        codes_df = pd.DataFrame(codes_data)

        print(f"    ✓ PDF (old format): {len(dict_df)} variables, {len(codes_df)} value codes")

        return dict_df, codes_df

    def _extract_value_codes_from_section(self, section):
        """Extract value codes from a PDF section (works for both old and new formats)"""
        value_codes = {}

        # Look for value table
        value_match = re.search(r'Value\s+Value Label', section, re.IGNORECASE)

        if value_match:
            value_section = section[value_match.end():]
            lines = value_section.split('\n')

            for line in lines[:50]:
                # Match pattern: number/BLANK followed by text
                code_match = re.match(r'^\s*(\d+|BLANK)\s+(.+)', line)
                if code_match:
                    code = code_match.group(1)
                    label = code_match.group(2).strip()

                    # Clean label - remove frequency/percentage columns
                    label = re.sub(r'Go to Section.*$', '', label).strip()
                    label = re.sub(r'Notes:.*$', '', label).strip()
                    label = re.sub(r'\s+', ' ', label)
                    # Remove trailing numbers (frequencies and percentages)
                    label = re.sub(r'\s+\d+\s+\d+\.\d+\s+\d+\.\d+\s*$', '', label).strip()
                    label = re.sub(r'\s+\d+\.\d+\s*$', '', label).strip()
                    label = re.sub(r'\d{3,}$', '', label).strip()  # Remove large numbers at end
                    label = self.clean_text(label)

                    if label and len(label) < 200 and not label.startswith(('Label:', 'Description:', 'Section:')):
                        value_codes[code] = label

        return value_codes

    def create_dictionary_with_codebook(self, data, year, codebook_file=None, codebook_url=None):
        """Create dictionary using codebook (file or URL)."""
        print(f"\nSTEP 3: Creating dictionary for {year}...")
        print("-" * 80)

        # Try manual file first
        if codebook_file and os.path.exists(codebook_file):
            print(f"  Using manual codebook: {codebook_file}")

            # Detect format
            if codebook_file.lower().endswith('.html'):
                # HTML
                for encoding in ['latin-1', 'utf-8', 'windows-1252']:
                    try:
                        with open(codebook_file, 'r', encoding=encoding) as f:
                            content = f.read()
                        return self.parse_html_codebook(content, year)
                    except:
                        continue

            elif codebook_file.lower().endswith('.pdf'):
                # PDF
                with open(codebook_file, 'rb') as f:
                    content = f.read()
                return self.parse_pdf_codebook(content, year)

            elif codebook_file.lower().endswith('.zip'):
                # ZIP - extract HTML or PDF from inside
                print(f"    Extracting codebook from ZIP...")
                try:
                    with zipfile.ZipFile(codebook_file, 'r') as z:
                        # Look for HTML or PDF inside
                        for filename in z.namelist():
                            if filename.lower().endswith('.html') or filename.lower().endswith('.htm'):
                                with z.open(filename) as f:
                                    content = f.read().decode('latin-1')
                                return self.parse_html_codebook(content, year)
                            elif filename.lower().endswith('.pdf'):
                                with z.open(filename) as f:
                                    content = f.read()
                                return self.parse_pdf_codebook(content, year)
                except Exception as e:
                    print(f"    ⚠ ZIP extraction error: {e}")

        # Try URL download
        elif codebook_url:
            print(f"  Trying to download codebook...")

            try:
                response = requests.get(codebook_url, headers=self.headers, timeout=60)

                if response.status_code == 200:
                    if '.html' in codebook_url.lower():
                        return self.parse_html_codebook(response.text, year)
                    elif '.pdf' in codebook_url.lower():
                        return self.parse_pdf_codebook(response.content, year)
                    elif '.zip' in codebook_url.lower():
                        # ZIP - extract HTML or PDF from inside
                        print(f"    Extracting codebook from ZIP...")
                        try:
                            with zipfile.ZipFile(io.BytesIO(response.content)) as z:
                                # Look for HTML or PDF inside
                                for filename in z.namelist():
                                    if filename.lower().endswith('.html') or filename.lower().endswith('.htm'):
                                        with z.open(filename) as f:
                                            content = f.read().decode('latin-1')
                                        return self.parse_html_codebook(content, year)
                                    elif filename.lower().endswith('.pdf'):
                                        with z.open(filename) as f:
                                            content = f.read()
                                        return self.parse_pdf_codebook(content, year)
                        except Exception as e:
                            print(f"    ⚠ ZIP extraction error: {e}")
                else:
                    print(f"    ⚠ Download failed: Status {response.status_code}")

            except Exception as e:
                print(f"    ⚠ Download error: {str(e)}")

        # Fallback: Basic dictionary
        print("  ⚠ Creating basic dictionary (no codebook)")
        return self.create_basic_dictionary(data, year)

    def create_basic_dictionary(self, data, year):
        """Create basic dictionary when codebook unavailable."""
        dict_data = []

        for col in data.columns:
            dict_data.append({
                'column': col,
                'base_variable': col,
                'description': f'Variable {col} (see codebook)',
                'section_name': 'Unknown',
                'has_codes': False,
                'value_codes': '',
                'source': f'BRFSS {year} - Basic dictionary'
            })

        dict_df = pd.DataFrame(dict_data)
        codes_df = pd.DataFrame(columns=['variable', 'code', 'label'])

        print(f"    ✓ Basic: {len(dict_df)} variables")

        return dict_df, codes_df

    def save_outputs(self, data, dictionary, value_codes, year=None):
        """Save all outputs."""
        year_str = f"_{year}" if year else ""

        data_file = os.path.join(self.output_dir, f'brfss_data{year_str}.csv')
        dict_file = os.path.join(self.output_dir, f'brfss_dictionary{year_str}.csv')
        codes_file = os.path.join(self.output_dir, f'brfss_value_codes{year_str}.csv')

        data.to_csv(data_file, index=False)
        dictionary.to_csv(dict_file, index=False)

        if len(value_codes) > 0:
            value_codes.to_csv(codes_file, index=False)

        print(f"\n💾 Saved outputs:")
        print(f"  ✓ {data_file}")
        print(f"  ✓ {dict_file}")
        if len(value_codes) > 0:
            print(f"  ✓ {codes_file}")

    def extract_all(self, codebook_file=None):
        """
        Run complete extraction workflow.

        Discovers the datasets for the configured year range, then for each
        year downloads the data, builds its dictionary, prints the share of
        data columns covered by the dictionary and saves the outputs.

        Args:
            codebook_file: Optional manual codebook file path

        Returns:
            tuple: ``(data, dictionary, value_codes)`` when exactly one dataset
            was discovered and processed successfully. ``(None, None, None)``
            when nothing was discovered, when that single dataset could not be
            downloaded, or when several datasets were processed (their results
            are only saved to disk).
        """
        print("🚀 Starting BRFSS extraction...\n")

        start_time = time.time()

        # Discover datasets
        catalog = self.discover_all_datasets()

        if not catalog:
            print("❌ No datasets found")
            return None, None, None

        # Stays at the "nothing" value when a download fails, so the final
        # return never touches variables that were not assigned.
        last_result = (None, None, None)

        # Process each year
        for dataset_info in catalog:
            year = dataset_info['year']

            # Download data
            dataset = self.download_dataset(dataset_info)

            if not dataset:
                continue

            data = dataset['data']

            # Create dictionary with codebook
            dict_df, codes_df = self.create_dictionary_with_codebook(
                data,
                year,
                codebook_file=codebook_file,
                codebook_url=dataset_info.get('doc_url')
            )

            # Check coverage (skipped for frames without columns to avoid a
            # division by zero).
            if len(dict_df) > 0 and len(data.columns) > 0:
                data_vars = set(data.columns)
                dict_vars = set(dict_df['column'])
                coverage = len(data_vars & dict_vars) / len(data_vars) * 100

                print(f"\n📊 Coverage: {coverage:.1f}%")

                if coverage < 90:
                    missing = data_vars - dict_vars
                    print(f"  ⚠ Missing {len(missing)} variables from dictionary")

            # Save
            self.save_outputs(data, dict_df, codes_df, year)

            last_result = (data, dict_df, codes_df)

        elapsed = time.time() - start_time

        print("\n" + "=" * 80)
        print("✅ EXTRACTION COMPLETE")
        print("=" * 80)
        print(f"⏱️  Time: {elapsed/60:.1f} minutes")
        print("=" * 80)

        # Return the data only for single-year extraction
        if len(catalog) == 1:
            return last_result

        return None, None, None


# ============================================================================
# USAGE EXAMPLES
# ============================================================================

if __name__ == "__main__":
    # Show the module docstring first, then a banner followed by
    # copy-and-paste usage examples.
    print(__doc__)

    divider = "=" * 80
    print("\n" + divider)
    print("EXAMPLE USAGE")
    print(divider)

    usage_examples = """
# Example 1: Extract with manual codebook (recommended)
extractor = BRFSSExtractor(start_year=2019, end_year=2019,
                           output_dir='brfss_2019')
data, dictionary, value_codes = extractor.extract_all(
    codebook_file='codeBook2019.html'  # Your downloaded codebook
)

# Example 2: Extract multiple years (one at a time to avoid RAM issues)
for year in [2019, 2020, 2021]:
    extractor = BRFSSExtractor(start_year=year, end_year=year,
                               output_dir=f'brfss_{year}')
    extractor.extract_all(codebook_file=f'codebook{year}.html')

# Example 3: Try automated download (may be blocked by CDC)
extractor = BRFSSExtractor(start_year=2019, end_year=2019)
extractor.extract_all()  # Will try to download codebook automatically
    """
    print(usage_examples)


BRFSS DATA EXTRACTOR - FINAL COMPLETE VERSION
Extracts BRFSS data (2011-2024) with complete documentation parsing.

Features:
- Downloads BRFSS data for any year
- Parses HTML codebooks (2019, 2022-2024)
- Parses PDF codebooks (2011-2018, 2020-2021)
- Outputs NHANES-format dictionary with real descriptions
- Extracts value codes (1=Yes, 2=No, etc.)
- Handles duplicates automatically
- Year-at-a-time processing to avoid RAM issues

Usage:
    # Extract 2019 data
    extractor = BRFSSExtractor(start_year=2019, end_year=2019, 
                               output_dir='brfss_2019')
    data, dictionary, value_codes = extractor.extract_all()
    
    # With manual codebook (if CDC blocks automated download)
    extractor.extract_with_manual_codebook('codeBook2019.html')


EXAMPLE USAGE

# Example 1: Extract with manual codebook (recommended)
extractor = BRFSSExtractor(start_year=2019, end_year=2019, 
                           output_dir='brfss_2019')
data, dictionary, value_codes = extra

In [None]:
"""
EXTRACT ALL YEARS 1999-2024 (AS-IS)
====================================
Don't worry about perfect descriptions - we'll use AI to fill gaps
"""

import gc
import time

# ---------------------------------------------------------------------------
# Batch driver: run BRFSSExtractor once per survey year (START_YEAR..END_YEAR),
# record a per-year result row in `results`, and print a summary at the end.
#
# Each year is processed in its own extractor instance and its objects are
# released before the next year starts, so only one year's data is held in
# memory at a time.
# ---------------------------------------------------------------------------
print("="*80)
print("🚀 EXTRACTING ALL YEARS 1999-2024")
print("="*80)
print()

START_YEAR = 1999
END_YEAR = 2024
BASE_OUTPUT_DIR = 'brfss_complete_1999_2024'

total_years = END_YEAR - START_YEAR + 1

print(f"Years: {START_YEAR}-{END_YEAR} ({total_years} years)")
print(f"Output: {BASE_OUTPUT_DIR}/")
print()
print("⏱️  Estimated time: ~50 minutes")
print()
print("Note: Some old years may have partial descriptions.")
print("That's OK - AI will fill gaps using reference from complete years!")
print()
print("Starting in 5 seconds...")
time.sleep(5)

# One dict per year with keys:
#   year, status ('SUCCESS' | 'FAILED' | 'ERROR'), records, variables,
#   dict_quality (percent), time (seconds).
results = []
start_time = time.time()

for i, year in enumerate(range(START_YEAR, END_YEAR + 1), 1):
    print()
    print("="*80)
    print(f"📅 YEAR {year} ({i}/{total_years})")
    print("="*80)

    year_start = time.time()
    output_dir = f'{BASE_OUTPUT_DIR}/brfss_{year}'

    # Bind the names up front so the `finally` clause can always release
    # them, even when the extractor raises before they are assigned.
    extractor = data = dictionary = value_codes = None

    try:
        extractor = BRFSSExtractor(
            start_year=year,
            end_year=year,
            output_dir=output_dir
        )

        data, dictionary, value_codes = extractor.extract_all()

        year_time = time.time() - year_start

        if data is not None and dictionary is not None and len(dictionary) > 0:
            # Rows whose description is not the "Variable <name>" placeholder
            # count as really documented.
            is_placeholder = dictionary['description'].str.startswith('Variable ', na=False)
            described = int((~is_placeholder).sum())

            # Score against the larger of (dictionary rows, data columns).
            # Dividing by the dictionary length alone reports 100% for a year
            # where only a third of the data columns made it into the
            # dictionary, which hides exactly the years that need help.
            # NOTE(review): this is an approximation - it does not match
            # dictionary rows to data columns by name; refine once the
            # dictionary's variable-name column is confirmed.
            n_columns = len(data.columns)
            quality = described / max(len(dictionary), n_columns) * 100

            results.append({
                'year': year,
                'status': 'SUCCESS',
                'records': len(data),
                'variables': n_columns,
                'dict_quality': quality,
                'time': year_time
            })

            # "vars" is the number of data columns (same value stored in
            # results['variables']); the dictionary size is shown separately.
            print(f"✅ {year}: {len(data):,} records, {n_columns} vars "
                  f"({len(dictionary)} in dictionary), {quality:.0f}% quality, {year_time:.1f}s")
        else:
            results.append({
                'year': year,
                'status': 'FAILED',
                'records': 0,
                'variables': 0,
                'dict_quality': 0,
                'time': year_time
            })
            print(f"❌ {year} failed")

    except Exception as e:
        # Top-level boundary of the batch run: record the failure and keep
        # going so one bad year does not abort the remaining years.
        year_time = time.time() - year_start
        results.append({
            'year': year,
            'status': 'ERROR',
            'records': 0,
            'variables': 0,
            'dict_quality': 0,
            'time': year_time
        })
        print(f"❌ {year} error: {e}")

    finally:
        # Release this year's objects on every path (success, failure, or
        # exception) before the next year is loaded.
        del extractor, data, dictionary, value_codes
        gc.collect()

    # Progress
    elapsed = time.time() - start_time
    avg_time = elapsed / i
    remaining = (total_years - i) * avg_time
    print(f"⏱️  Progress: {i}/{total_years} | Elapsed: {elapsed/60:.1f}m | Remaining: ~{remaining/60:.1f}m")

# Summary
total_time = time.time() - start_time

print()
print("="*80)
print("🎉 EXTRACTION COMPLETE!")
print("="*80)
print()

successful = [r for r in results if r['status'] == 'SUCCESS']
unsuccessful = [r for r in results if r['status'] != 'SUCCESS']
high_quality = [r for r in successful if r['dict_quality'] >= 90]
partial_quality = [r for r in successful if 50 <= r['dict_quality'] < 90]
low_quality = [r for r in successful if r['dict_quality'] < 50]

print(f"✅ Successful: {len(successful)}/{total_years} years")
print(f"   High quality (90%+): {len(high_quality)} years")
print(f"   Partial (50-90%): {len(partial_quality)} years")
print(f"   Needs AI help (<50%): {len(low_quality)} years")
if unsuccessful:
    print(f"❌ Failed or errored: {len(unsuccessful)} years")
print(f"⏱️  Total time: {total_time/60:.1f} minutes")
print()

print("Years needing AI assistance:")
for r in low_quality:
    print(f"  {r['year']}: {r['dict_quality']:.0f}% quality")
# Years that produced no usable dictionary at all are listed too, so they are
# not silently dropped from the follow-up work.
for r in unsuccessful:
    print(f"  {r['year']}: {r['status']}")

print()
print(f"📁 All data saved in: {BASE_OUTPUT_DIR}/")
print()
print("🤖 Next: AI Instrument will use complete years (2015-2024)")
print("   as reference to fill in missing descriptions!")
print()
print("="*80)

🚀 EXTRACTING ALL YEARS 1999-2024

Years: 1999-2024 (26 years)
Output: brfss_complete_1999_2024/

⏱️  Estimated time: ~50 minutes

Note: Some old years may have partial descriptions.
That's OK - AI will fill gaps using reference from complete years!

Starting in 5 seconds...

📅 YEAR 1999 (1/26)
🔍 BRFSS DATA EXTRACTOR - FINAL COMPLETE VERSION
Years: 1999-1999
Output: brfss_complete_1999_2024/brfss_1999

🚀 Starting BRFSS extraction...

STEP 1: Discovering datasets...
--------------------------------------------------------------------------------


Years: 100%|██████████| 1/1 [00:02<00:00,  2.29s/it]


✓ Found 1 datasets

STEP 2: Downloading 1999 data...
--------------------------------------------------------------------------------
  Downloading from: https://www.cdc.gov/brfss/annual_data/1999/files/CDBRFS99XPT.zip
  (This may take a while, ~100-200MB)
  ✓ Loaded: 159,989 records × 281 variables

STEP 3: Creating dictionary for 1999...
--------------------------------------------------------------------------------
  Trying to download codebook...
  Parsing PDF codebook...
    PDF: Extracted 139,833 characters
    Detected: Old PDF format (1999-2014)
    ✓ PDF (old format): 281 variables, 1561 value codes

📊 Coverage: 99.6%

💾 Saved outputs:
  ✓ brfss_complete_1999_2024/brfss_1999/brfss_data_1999.csv
  ✓ brfss_complete_1999_2024/brfss_1999/brfss_dictionary_1999.csv
  ✓ brfss_complete_1999_2024/brfss_1999/brfss_value_codes_1999.csv

✅ EXTRACTION COMPLETE
⏱️  Time: 0.8 minutes
✅ 1999: 159,989 records, 281 vars, 78% quality, 50.5s
⏱️  Progress: 1/26 | Elapsed: 0.9m | Remaining: ~21.3m

Years: 100%|██████████| 1/1 [00:01<00:00,  1.93s/it]


✓ Found 1 datasets

STEP 2: Downloading 2000 data...
--------------------------------------------------------------------------------
  Downloading from: https://www.cdc.gov/brfss/annual_data/2000/files/CDBRFS00XPT.ZIP
  (This may take a while, ~100-200MB)
  ✓ Loaded: 184,450 records × 289 variables

STEP 3: Creating dictionary for 2000...
--------------------------------------------------------------------------------
  Trying to download codebook...
  Parsing PDF codebook...
    PDF: Extracted 150,600 characters
    Detected: Old PDF format (1999-2014)
    ✓ PDF (old format): 289 variables, 1596 value codes

📊 Coverage: 99.7%

💾 Saved outputs:
  ✓ brfss_complete_1999_2024/brfss_2000/brfss_data_2000.csv
  ✓ brfss_complete_1999_2024/brfss_2000/brfss_dictionary_2000.csv
  ✓ brfss_complete_1999_2024/brfss_2000/brfss_value_codes_2000.csv

✅ EXTRACTION COMPLETE
⏱️  Time: 1.0 minutes
✅ 2000: 184,450 records, 289 vars, 77% quality, 58.3s
⏱️  Progress: 2/26 | Elapsed: 1.8m | Remaining: ~22.0m

Years: 100%|██████████| 1/1 [00:02<00:00,  2.47s/it]


✓ Found 1 datasets

STEP 2: Downloading 2001 data...
--------------------------------------------------------------------------------
  Downloading from: https://www.cdc.gov/brfss/annual_data/2001/files/CDBRFS01XPT.zip
  (This may take a while, ~100-200MB)
  ✓ Loaded: 212,510 records × 291 variables

STEP 3: Creating dictionary for 2001...
--------------------------------------------------------------------------------
  Trying to download codebook...
  Parsing PDF codebook...
    PDF: Extracted 192,766 characters
    Detected: Old PDF format (1999-2014)
    ✓ PDF (old format): 0 variables, 0 value codes

💾 Saved outputs:
  ✓ brfss_complete_1999_2024/brfss_2001/brfss_data_2001.csv
  ✓ brfss_complete_1999_2024/brfss_2001/brfss_dictionary_2001.csv

✅ EXTRACTION COMPLETE
⏱️  Time: 1.2 minutes
❌ 2001 failed
⏱️  Progress: 3/26 | Elapsed: 3.0m | Remaining: ~23.4m

📅 YEAR 2002 (4/26)
🔍 BRFSS DATA EXTRACTOR - FINAL COMPLETE VERSION
Years: 2002-2002
Output: brfss_complete_1999_2024/brfss_2002



Years: 100%|██████████| 1/1 [00:02<00:00,  2.30s/it]


✓ Found 1 datasets

STEP 2: Downloading 2002 data...
--------------------------------------------------------------------------------
  Downloading from: https://www.cdc.gov/brfss/annual_data/2002/files/CDBRFS02XPT.ZIP
  (This may take a while, ~100-200MB)
  ✓ Loaded: 247,964 records × 310 variables

STEP 3: Creating dictionary for 2002...
--------------------------------------------------------------------------------
  Trying to download codebook...
  Parsing PDF codebook...
    PDF: Extracted 181,192 characters
    Detected: Old PDF format (1999-2014)
    ✓ PDF (old format): 101 variables, 706 value codes

📊 Coverage: 32.6%
  ⚠ Missing 209 variables from dictionary

💾 Saved outputs:
  ✓ brfss_complete_1999_2024/brfss_2002/brfss_data_2002.csv
  ✓ brfss_complete_1999_2024/brfss_2002/brfss_dictionary_2002.csv
  ✓ brfss_complete_1999_2024/brfss_2002/brfss_value_codes_2002.csv

✅ EXTRACTION COMPLETE
⏱️  Time: 1.8 minutes
✅ 2002: 247,964 records, 101 vars, 100% quality, 108.2s
⏱️  Progres

Years: 100%|██████████| 1/1 [00:02<00:00,  2.35s/it]


✓ Found 1 datasets

STEP 2: Downloading 2003 data...
--------------------------------------------------------------------------------
  Downloading from: https://www.cdc.gov/brfss/annual_data/2003/files/CDBRFS03XPT.ZIP
  (This may take a while, ~100-200MB)
  ✓ Loaded: 264,684 records × 294 variables

STEP 3: Creating dictionary for 2003...
--------------------------------------------------------------------------------
  Trying to download codebook...
  Parsing PDF codebook...
    PDF: Extracted 183,616 characters
    Detected: Old PDF format (1999-2014)
    ✓ PDF (old format): 114 variables, 726 value codes

📊 Coverage: 37.1%
  ⚠ Missing 185 variables from dictionary

💾 Saved outputs:
  ✓ brfss_complete_1999_2024/brfss_2003/brfss_data_2003.csv
  ✓ brfss_complete_1999_2024/brfss_2003/brfss_dictionary_2003.csv
  ✓ brfss_complete_1999_2024/brfss_2003/brfss_value_codes_2003.csv

✅ EXTRACTION COMPLETE
⏱️  Time: 1.8 minutes
✅ 2003: 264,684 records, 114 vars, 100% quality, 107.1s
⏱️  Progres

Years: 100%|██████████| 1/1 [00:02<00:00,  2.58s/it]


✓ Found 1 datasets

STEP 2: Downloading 2004 data...
--------------------------------------------------------------------------------
  Downloading from: https://www.cdc.gov/brfss/annual_data/2004/files/CDBRFS04XPT.zip
  (This may take a while, ~100-200MB)
  ✓ Loaded: 303,822 records × 293 variables

STEP 3: Creating dictionary for 2004...
--------------------------------------------------------------------------------
  Trying to download codebook...
  Parsing PDF codebook...
    PDF: Extracted 186,714 characters
    Detected: Old PDF format (1999-2014)
    ✓ PDF (old format): 112 variables, 741 value codes

📊 Coverage: 36.9%
  ⚠ Missing 185 variables from dictionary

💾 Saved outputs:
  ✓ brfss_complete_1999_2024/brfss_2004/brfss_data_2004.csv
  ✓ brfss_complete_1999_2024/brfss_2004/brfss_dictionary_2004.csv
  ✓ brfss_complete_1999_2024/brfss_2004/brfss_value_codes_2004.csv

✅ EXTRACTION COMPLETE
⏱️  Time: 1.9 minutes
✅ 2004: 303,822 records, 112 vars, 100% quality, 113.3s
⏱️  Progres

Years: 100%|██████████| 1/1 [00:02<00:00,  2.06s/it]


✓ Found 1 datasets

STEP 2: Downloading 2005 data...
--------------------------------------------------------------------------------
  Downloading from: https://www.cdc.gov/brfss/annual_data/2005/files/CDBRFS05XPT.zip
  (This may take a while, ~100-200MB)
  ✓ Loaded: 356,112 records × 325 variables

STEP 3: Creating dictionary for 2005...
--------------------------------------------------------------------------------
  Trying to download codebook...
  Parsing PDF codebook...
    PDF: Extracted 197,088 characters
    Detected: Old PDF format (1999-2014)
    ✓ PDF (old format): 101 variables, 653 value codes

📊 Coverage: 30.8%
  ⚠ Missing 225 variables from dictionary

💾 Saved outputs:
  ✓ brfss_complete_1999_2024/brfss_2005/brfss_data_2005.csv
  ✓ brfss_complete_1999_2024/brfss_2005/brfss_dictionary_2005.csv
  ✓ brfss_complete_1999_2024/brfss_2005/brfss_value_codes_2005.csv

✅ EXTRACTION COMPLETE
⏱️  Time: 2.3 minutes
✅ 2005: 356,112 records, 101 vars, 100% quality, 139.9s
⏱️  Progres

Years: 100%|██████████| 1/1 [00:01<00:00,  1.85s/it]


✓ Found 1 datasets

STEP 2: Downloading 2006 data...
--------------------------------------------------------------------------------
  Downloading from: https://www.cdc.gov/brfss/annual_data/2006/files/CDBRFS06XPT.ZIP
  (This may take a while, ~100-200MB)
  ✓ Loaded: 355,710 records × 302 variables

STEP 3: Creating dictionary for 2006...
--------------------------------------------------------------------------------
  Trying to download codebook...
  Parsing PDF codebook...
    PDF: Extracted 188,539 characters
    Detected: Old PDF format (1999-2014)
    ✓ PDF (old format): 102 variables, 655 value codes

📊 Coverage: 33.4%
  ⚠ Missing 201 variables from dictionary

💾 Saved outputs:
  ✓ brfss_complete_1999_2024/brfss_2006/brfss_data_2006.csv
  ✓ brfss_complete_1999_2024/brfss_2006/brfss_dictionary_2006.csv
  ✓ brfss_complete_1999_2024/brfss_2006/brfss_value_codes_2006.csv

✅ EXTRACTION COMPLETE
⏱️  Time: 2.3 minutes
✅ 2006: 355,710 records, 102 vars, 100% quality, 135.9s
⏱️  Progres

Years: 100%|██████████| 1/1 [00:01<00:00,  1.51s/it]


✓ Found 1 datasets

STEP 2: Downloading 2007 data...
--------------------------------------------------------------------------------
  Downloading from: https://www.cdc.gov/brfss/annual_data/2007/files/CDBRFS07XPT.ZIP
  (This may take a while, ~100-200MB)
  ✓ Loaded: 430,912 records × 362 variables

STEP 3: Creating dictionary for 2007...
--------------------------------------------------------------------------------
  Trying to download codebook...
  Parsing PDF codebook...
    PDF: Extracted 224,098 characters
    Detected: Old PDF format (1999-2014)
    ✓ PDF (old format): 107 variables, 634 value codes

📊 Coverage: 29.6%
  ⚠ Missing 255 variables from dictionary

💾 Saved outputs:
  ✓ brfss_complete_1999_2024/brfss_2007/brfss_data_2007.csv
  ✓ brfss_complete_1999_2024/brfss_2007/brfss_dictionary_2007.csv
  ✓ brfss_complete_1999_2024/brfss_2007/brfss_value_codes_2007.csv

✅ EXTRACTION COMPLETE
⏱️  Time: 3.5 minutes
✅ 2007: 430,912 records, 107 vars, 100% quality, 207.0s
⏱️  Progres

Years: 100%|██████████| 1/1 [00:02<00:00,  2.01s/it]


✓ Found 1 datasets

STEP 2: Downloading 2008 data...
--------------------------------------------------------------------------------
  Downloading from: https://www.cdc.gov/brfss/annual_data/2008/files/CDBRFS08XPT.ZIP
  (This may take a while, ~100-200MB)
  ✓ Loaded: 414,509 records × 291 variables

STEP 3: Creating dictionary for 2008...
--------------------------------------------------------------------------------
  Trying to download codebook...
  Parsing PDF codebook...
    PDF: Extracted 182,313 characters
    Detected: Old PDF format (1999-2014)
    ✓ PDF (old format): 106 variables, 649 value codes

📊 Coverage: 36.4%
  ⚠ Missing 185 variables from dictionary

💾 Saved outputs:
  ✓ brfss_complete_1999_2024/brfss_2008/brfss_data_2008.csv
  ✓ brfss_complete_1999_2024/brfss_2008/brfss_dictionary_2008.csv
  ✓ brfss_complete_1999_2024/brfss_2008/brfss_value_codes_2008.csv

✅ EXTRACTION COMPLETE
⏱️  Time: 2.6 minutes
✅ 2008: 414,509 records, 106 vars, 100% quality, 157.5s
⏱️  Progres

Years: 100%|██████████| 1/1 [00:03<00:00,  3.26s/it]


✓ Found 1 datasets

STEP 2: Downloading 2009 data...
--------------------------------------------------------------------------------
  Downloading from: https://www.cdc.gov/brfss/annual_data/2009/files/CDBRFS09XPT.ZIP
  (This may take a while, ~100-200MB)
  ✓ Loaded: 432,607 records × 405 variables

STEP 3: Creating dictionary for 2009...
--------------------------------------------------------------------------------
  Trying to download codebook...
  Parsing PDF codebook...
    PDF: Extracted 251,799 characters
    Detected: Old PDF format (1999-2014)
    ✓ PDF (old format): 116 variables, 725 value codes

📊 Coverage: 28.6%
  ⚠ Missing 289 variables from dictionary

💾 Saved outputs:
  ✓ brfss_complete_1999_2024/brfss_2009/brfss_data_2009.csv
  ✓ brfss_complete_1999_2024/brfss_2009/brfss_dictionary_2009.csv
  ✓ brfss_complete_1999_2024/brfss_2009/brfss_value_codes_2009.csv

✅ EXTRACTION COMPLETE
⏱️  Time: 3.6 minutes
✅ 2009: 432,607 records, 116 vars, 100% quality, 217.7s
⏱️  Progres

Years: 100%|██████████| 1/1 [00:02<00:00,  2.20s/it]


✓ Found 1 datasets

STEP 2: Downloading 2010 data...
--------------------------------------------------------------------------------
  Downloading from: https://www.cdc.gov/brfss/annual_data/2010/files/CDBRFS10XPT.zip
  (This may take a while, ~100-200MB)
  ✓ Loaded: 451,075 records × 397 variables

STEP 3: Creating dictionary for 2010...
--------------------------------------------------------------------------------
  Trying to download codebook...
  Parsing PDF codebook...
    PDF: Extracted 259,102 characters
    Detected: Old PDF format (1999-2014)
    ✓ PDF (old format): 113 variables, 688 value codes

📊 Coverage: 28.2%
  ⚠ Missing 285 variables from dictionary

💾 Saved outputs:
  ✓ brfss_complete_1999_2024/brfss_2010/brfss_data_2010.csv
  ✓ brfss_complete_1999_2024/brfss_2010/brfss_dictionary_2010.csv
  ✓ brfss_complete_1999_2024/brfss_2010/brfss_value_codes_2010.csv

✅ EXTRACTION COMPLETE
⏱️  Time: 3.7 minutes
✅ 2010: 451,075 records, 113 vars, 100% quality, 220.4s
⏱️  Progres

Years: 100%|██████████| 1/1 [00:02<00:00,  2.10s/it]


✓ Found 1 datasets

STEP 2: Downloading 2011 data...
--------------------------------------------------------------------------------
  Downloading from: https://www.cdc.gov/brfss/annual_data/2011/files/LLCP2011XPT.ZIP
  (This may take a while, ~100-200MB)
  ✓ Loaded: 506,467 records × 454 variables

STEP 3: Creating dictionary for 2011...
--------------------------------------------------------------------------------
  Trying to download codebook...
  Parsing PDF codebook...
    PDF: Extracted 285,915 characters
    Detected: Old PDF format (1999-2014)
    ✓ PDF (old format): 108 variables, 815 value codes

📊 Coverage: 23.8%
  ⚠ Missing 346 variables from dictionary

💾 Saved outputs:
  ✓ brfss_complete_1999_2024/brfss_2011/brfss_data_2011.csv
  ✓ brfss_complete_1999_2024/brfss_2011/brfss_dictionary_2011.csv
  ✓ brfss_complete_1999_2024/brfss_2011/brfss_value_codes_2011.csv

✅ EXTRACTION COMPLETE
⏱️  Time: 4.5 minutes
✅ 2011: 506,467 records, 108 vars, 100% quality, 270.4s
⏱️  Progres

Years: 100%|██████████| 1/1 [00:01<00:00,  1.69s/it]


✓ Found 1 datasets

STEP 2: Downloading 2012 data...
--------------------------------------------------------------------------------
  Downloading from: https://www.cdc.gov/brfss/annual_data/2012/files/LLCP2012XPT.ZIP
  (This may take a while, ~100-200MB)
  ✓ Loaded: 475,687 records × 359 variables

STEP 3: Creating dictionary for 2012...
--------------------------------------------------------------------------------
  Trying to download codebook...
  Parsing PDF codebook...
    PDF: Extracted 238,513 characters
    Detected: Old PDF format (1999-2014)
    ✓ PDF (old format): 110 variables, 689 value codes

📊 Coverage: 30.6%
  ⚠ Missing 249 variables from dictionary

💾 Saved outputs:
  ✓ brfss_complete_1999_2024/brfss_2012/brfss_data_2012.csv
  ✓ brfss_complete_1999_2024/brfss_2012/brfss_dictionary_2012.csv
  ✓ brfss_complete_1999_2024/brfss_2012/brfss_value_codes_2012.csv

✅ EXTRACTION COMPLETE
⏱️  Time: 3.3 minutes
✅ 2012: 475,687 records, 110 vars, 100% quality, 199.7s
⏱️  Progres

Years: 100%|██████████| 1/1 [00:01<00:00,  1.72s/it]


✓ Found 1 datasets

STEP 2: Downloading 2013 data...
--------------------------------------------------------------------------------
  Downloading from: https://www.cdc.gov/brfss/annual_data/2013/files/LLCP2013XPT.ZIP
  (This may take a while, ~100-200MB)
  ✓ Loaded: 491,773 records × 336 variables

STEP 3: Creating dictionary for 2013...
--------------------------------------------------------------------------------
  Trying to download codebook...
  Parsing PDF codebook...
    PDF: Extracted 218,702 characters
    Detected: Old PDF format (1999-2014)
    ✓ PDF (old format): 102 variables, 812 value codes

📊 Coverage: 30.4%
  ⚠ Missing 234 variables from dictionary

💾 Saved outputs:
  ✓ brfss_complete_1999_2024/brfss_2013/brfss_data_2013.csv
  ✓ brfss_complete_1999_2024/brfss_2013/brfss_dictionary_2013.csv
  ✓ brfss_complete_1999_2024/brfss_2013/brfss_value_codes_2013.csv

✅ EXTRACTION COMPLETE
⏱️  Time: 3.4 minutes
✅ 2013: 491,773 records, 102 vars, 100% quality, 202.9s
⏱️  Progres

Years: 100%|██████████| 1/1 [00:02<00:00,  2.25s/it]


✓ Found 1 datasets

STEP 2: Downloading 2014 data...
--------------------------------------------------------------------------------
  Downloading from: https://www.cdc.gov/brfss/annual_data/2014/files/LLCP2014XPT.ZIP
  (This may take a while, ~100-200MB)
  ✓ Loaded: 464,664 records × 279 variables

STEP 3: Creating dictionary for 2014...
--------------------------------------------------------------------------------
  Trying to download codebook...
  Parsing PDF codebook...
    PDF: Extracted 187,381 characters
    Detected: Old PDF format (1999-2014)
    ✓ PDF (old format): 102 variables, 677 value codes

📊 Coverage: 36.6%
  ⚠ Missing 177 variables from dictionary

💾 Saved outputs:
  ✓ brfss_complete_1999_2024/brfss_2014/brfss_data_2014.csv
  ✓ brfss_complete_1999_2024/brfss_2014/brfss_dictionary_2014.csv
  ✓ brfss_complete_1999_2024/brfss_2014/brfss_value_codes_2014.csv

✅ EXTRACTION COMPLETE
⏱️  Time: 2.6 minutes
✅ 2014: 464,664 records, 102 vars, 100% quality, 156.2s
⏱️  Progres

Years: 100%|██████████| 1/1 [00:02<00:00,  2.55s/it]


✓ Found 1 datasets

STEP 2: Downloading 2015 data...
--------------------------------------------------------------------------------
  Downloading from: https://www.cdc.gov/brfss/annual_data/2015/files/LLCP2015XPT.zip
  (This may take a while, ~100-200MB)
  ✓ Loaded: 441,456 records × 330 variables

STEP 3: Creating dictionary for 2015...
--------------------------------------------------------------------------------
  Trying to download codebook...
  Parsing PDF codebook...
    PDF: Extracted 213,733 characters
    Detected: Old PDF format (1999-2014)
    ✓ PDF (old format): 94 variables, 780 value codes

📊 Coverage: 28.5%
  ⚠ Missing 236 variables from dictionary

💾 Saved outputs:
  ✓ brfss_complete_1999_2024/brfss_2015/brfss_data_2015.csv
  ✓ brfss_complete_1999_2024/brfss_2015/brfss_dictionary_2015.csv
  ✓ brfss_complete_1999_2024/brfss_2015/brfss_value_codes_2015.csv

✅ EXTRACTION COMPLETE
⏱️  Time: 3.6 minutes
✅ 2015: 441,456 records, 94 vars, 100% quality, 215.3s
⏱️  Progress:

Years: 100%|██████████| 1/1 [00:02<00:00,  2.15s/it]


✓ Found 1 datasets

STEP 2: Downloading 2016 data...
--------------------------------------------------------------------------------
  Downloading from: https://www.cdc.gov/brfss/annual_data/2016/files/LLCP2016XPT.zip
  (This may take a while, ~100-200MB)
  ✓ Loaded: 486,303 records × 275 variables

STEP 3: Creating dictionary for 2016...
--------------------------------------------------------------------------------
  Trying to download codebook...
  Parsing PDF codebook...
    PDF: Extracted 276,015 characters
    Detected: New PDF format (2015+)
    ✓ PDF (new format): 416 variables, 1906 value codes

📊 Coverage: 87.6%
  ⚠ Missing 34 variables from dictionary

💾 Saved outputs:
  ✓ brfss_complete_1999_2024/brfss_2016/brfss_data_2016.csv
  ✓ brfss_complete_1999_2024/brfss_2016/brfss_dictionary_2016.csv
  ✓ brfss_complete_1999_2024/brfss_2016/brfss_value_codes_2016.csv

✅ EXTRACTION COMPLETE
⏱️  Time: 3.6 minutes
✅ 2016: 486,303 records, 416 vars, 100% quality, 218.4s
⏱️  Progress: 1

Years: 100%|██████████| 1/1 [00:01<00:00,  1.97s/it]


✓ Found 1 datasets

STEP 2: Downloading 2017 data...
--------------------------------------------------------------------------------
  Downloading from: https://www.cdc.gov/brfss/annual_data/2017/files/LLCP2017XPT.zip
  (This may take a while, ~100-200MB)
  ✓ Loaded: 450,016 records × 358 variables

STEP 3: Creating dictionary for 2017...
--------------------------------------------------------------------------------
  Trying to download codebook...
  Parsing PDF codebook...
    PDF: Extracted 267,192 characters
    Detected: New PDF format (2015+)
    ✓ PDF (new format): 358 variables, 2262 value codes

📊 Coverage: 89.9%
  ⚠ Missing 36 variables from dictionary

💾 Saved outputs:
  ✓ brfss_complete_1999_2024/brfss_2017/brfss_data_2017.csv
  ✓ brfss_complete_1999_2024/brfss_2017/brfss_dictionary_2017.csv
  ✓ brfss_complete_1999_2024/brfss_2017/brfss_value_codes_2017.csv

✅ EXTRACTION COMPLETE
⏱️  Time: 4.4 minutes
✅ 2017: 450,016 records, 358 vars, 100% quality, 263.1s
⏱️  Progress: 1

Years: 100%|██████████| 1/1 [00:02<00:00,  2.11s/it]


✓ Found 1 datasets

STEP 2: Downloading 2018 data...
--------------------------------------------------------------------------------
  Downloading from: https://www.cdc.gov/brfss/annual_data/2018/files/LLCP2018XPT.zip
  (This may take a while, ~100-200MB)
  ✓ Loaded: 437,436 records × 275 variables

STEP 3: Creating dictionary for 2018...
--------------------------------------------------------------------------------
  Trying to download codebook...
  Parsing PDF codebook...
    PDF: Extracted 207,448 characters
    Detected: New PDF format (2015+)
    ✓ PDF (new format): 273 variables, 1752 value codes

📊 Coverage: 89.1%
  ⚠ Missing 30 variables from dictionary

💾 Saved outputs:
  ✓ brfss_complete_1999_2024/brfss_2018/brfss_data_2018.csv
  ✓ brfss_complete_1999_2024/brfss_2018/brfss_dictionary_2018.csv
  ✓ brfss_complete_1999_2024/brfss_2018/brfss_value_codes_2018.csv

✅ EXTRACTION COMPLETE
⏱️  Time: 3.1 minutes
✅ 2018: 437,436 records, 273 vars, 100% quality, 183.4s
⏱️  Progress: 2

Years: 100%|██████████| 1/1 [00:01<00:00,  1.79s/it]


✓ Found 1 datasets

STEP 2: Downloading 2019 data...
--------------------------------------------------------------------------------
  Downloading from: https://www.cdc.gov/brfss/annual_data/2019/files/LLCP2019XPT.zip
  (This may take a while, ~100-200MB)
  ✓ Loaded: 418,268 records × 342 variables

STEP 3: Creating dictionary for 2019...
--------------------------------------------------------------------------------
  Trying to download codebook...
  Parsing HTML codebook...
    ✓ HTML: 342 variables, 1900 value codes

📊 Coverage: 100.0%

💾 Saved outputs:
  ✓ brfss_complete_1999_2024/brfss_2019/brfss_data_2019.csv
  ✓ brfss_complete_1999_2024/brfss_2019/brfss_dictionary_2019.csv
  ✓ brfss_complete_1999_2024/brfss_2019/brfss_value_codes_2019.csv

✅ EXTRACTION COMPLETE
⏱️  Time: 3.3 minutes
✅ 2019: 418,268 records, 342 vars, 100% quality, 196.3s
⏱️  Progress: 21/26 | Elapsed: 58.8m | Remaining: ~14.0m

📅 YEAR 2020 (22/26)
🔍 BRFSS DATA EXTRACTOR - FINAL COMPLETE VERSION
Years: 2020-202

Years: 100%|██████████| 1/1 [00:02<00:00,  2.21s/it]


✓ Found 1 datasets

STEP 2: Downloading 2020 data...
--------------------------------------------------------------------------------
  Downloading from: https://www.cdc.gov/brfss/annual_data/2020/files/LLCP2020XPT.zip
  (This may take a while, ~100-200MB)
  ✓ Loaded: 401,958 records × 279 variables

STEP 3: Creating dictionary for 2020...
--------------------------------------------------------------------------------
  Trying to download codebook...
  Parsing PDF codebook...
    PDF: Extracted 210,869 characters
    Detected: New PDF format (2015+)
    ✓ PDF (new format): 279 variables, 1708 value codes

📊 Coverage: 88.5%
  ⚠ Missing 32 variables from dictionary

💾 Saved outputs:
  ✓ brfss_complete_1999_2024/brfss_2020/brfss_data_2020.csv
  ✓ brfss_complete_1999_2024/brfss_2020/brfss_dictionary_2020.csv
  ✓ brfss_complete_1999_2024/brfss_2020/brfss_value_codes_2020.csv

✅ EXTRACTION COMPLETE
⏱️  Time: 2.6 minutes
✅ 2020: 401,958 records, 279 vars, 100% quality, 158.4s
⏱️  Progress: 2

Years: 100%|██████████| 1/1 [00:01<00:00,  1.80s/it]


✓ Found 1 datasets

STEP 2: Downloading 2021 data...
--------------------------------------------------------------------------------
  Downloading from: https://www.cdc.gov/brfss/annual_data/2021/files/LLCP2021XPT.zip
  (This may take a while, ~100-200MB)
  ✓ Loaded: 438,693 records × 303 variables

STEP 3: Creating dictionary for 2021...
--------------------------------------------------------------------------------
  Trying to download codebook...
  Parsing PDF codebook...
    PDF: Extracted 206,307 characters
    Detected: New PDF format (2015+)
    ✓ PDF (new format): 302 variables, 1822 value codes

📊 Coverage: 87.8%
  ⚠ Missing 37 variables from dictionary

💾 Saved outputs:
  ✓ brfss_complete_1999_2024/brfss_2021/brfss_data_2021.csv
  ✓ brfss_complete_1999_2024/brfss_2021/brfss_dictionary_2021.csv
  ✓ brfss_complete_1999_2024/brfss_2021/brfss_value_codes_2021.csv

✅ EXTRACTION COMPLETE
⏱️  Time: 3.2 minutes
✅ 2021: 438,693 records, 302 vars, 100% quality, 190.3s
⏱️  Progress: 2

Years: 100%|██████████| 1/1 [00:02<00:00,  2.48s/it]


✓ Found 1 datasets

STEP 2: Downloading 2022 data...
--------------------------------------------------------------------------------
  Downloading from: https://www.cdc.gov/brfss/annual_data/2022/files/LLCP2022XPT.zip
  (This may take a while, ~100-200MB)
  ✓ Loaded: 445,132 records × 328 variables

STEP 3: Creating dictionary for 2022...
--------------------------------------------------------------------------------
  Trying to download codebook...
    Extracting codebook from ZIP...
  Parsing HTML codebook...
    ✓ HTML: 324 variables, 1861 value codes

📊 Coverage: 98.8%

💾 Saved outputs:
  ✓ brfss_complete_1999_2024/brfss_2022/brfss_data_2022.csv
  ✓ brfss_complete_1999_2024/brfss_2022/brfss_dictionary_2022.csv
  ✓ brfss_complete_1999_2024/brfss_2022/brfss_value_codes_2022.csv

✅ EXTRACTION COMPLETE
⏱️  Time: 3.2 minutes
✅ 2022: 445,132 records, 324 vars, 100% quality, 189.2s
⏱️  Progress: 24/26 | Elapsed: 67.8m | Remaining: ~5.7m

📅 YEAR 2023 (25/26)
🔍 BRFSS DATA EXTRACTOR - FINA

Years: 100%|██████████| 1/1 [00:01<00:00,  1.62s/it]


✓ Found 1 datasets

STEP 2: Downloading 2023 data...
--------------------------------------------------------------------------------
  Downloading from: https://www.cdc.gov/brfss/annual_data/2023/files/LLCP2023XPT.zip
  (This may take a while, ~100-200MB)
  ✓ Loaded: 433,323 records × 350 variables

STEP 3: Creating dictionary for 2023...
--------------------------------------------------------------------------------
  Trying to download codebook...
    Extracting codebook from ZIP...
  Parsing HTML codebook...
    ✓ HTML: 344 variables, 1849 value codes

📊 Coverage: 98.0%

💾 Saved outputs:
  ✓ brfss_complete_1999_2024/brfss_2023/brfss_data_2023.csv
  ✓ brfss_complete_1999_2024/brfss_2023/brfss_dictionary_2023.csv
  ✓ brfss_complete_1999_2024/brfss_2023/brfss_value_codes_2023.csv

✅ EXTRACTION COMPLETE
⏱️  Time: 3.3 minutes
✅ 2023: 433,323 records, 344 vars, 100% quality, 200.8s
⏱️  Progress: 25/26 | Elapsed: 71.2m | Remaining: ~2.8m

📅 YEAR 2024 (26/26)
🔍 BRFSS DATA EXTRACTOR - FINA

Years: 100%|██████████| 1/1 [00:01<00:00,  1.96s/it]


✓ Found 1 datasets

STEP 2: Downloading 2024 data...
--------------------------------------------------------------------------------
  Downloading from: https://www.cdc.gov/brfss/annual_data/2024/files/LLCP2024XPT.zip
  (This may take a while, ~100-200MB)
  ✓ Loaded: 457,670 records × 301 variables

STEP 3: Creating dictionary for 2024...
--------------------------------------------------------------------------------
  Trying to download codebook...
    Extracting codebook from ZIP...
  Parsing HTML codebook...
    ✓ HTML: 297 variables, 1615 value codes

📊 Coverage: 98.3%

💾 Saved outputs:
  ✓ brfss_complete_1999_2024/brfss_2024/brfss_data_2024.csv
  ✓ brfss_complete_1999_2024/brfss_2024/brfss_dictionary_2024.csv
  ✓ brfss_complete_1999_2024/brfss_2024/brfss_value_codes_2024.csv

✅ EXTRACTION COMPLETE
⏱️  Time: 2.8 minutes
✅ 2024: 457,670 records, 297 vars, 100% quality, 166.9s
⏱️  Progress: 26/26 | Elapsed: 74.0m | Remaining: ~0.0m

🎉 EXTRACTION COMPLETE!

✅ Successful: 25/26 years

In [None]:
"""
BRFSS CODEBOOK PARSER - COMPLETE VERSION
=========================================
Parses both HTML and PDF BRFSS codebooks to extract:
- Variable names
- Descriptions (from Labels)
- Section names (like NHANES components)
- Value codes (1=Yes, 2=No, etc.)

Handles:
- 2019: HTML format
- 2011-2018, 2020-2021: PDF format
- 2022-2024: ZIP files (containing PDFs or other formats)
"""
!pip install pdfplumber
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import pdfplumber
import io


def parse_html_codebook(html_content):
    """
    Parse HTML BRFSS codebook (2019 format).

    Every <table> whose text mentions 'SAS Variable' is read as one variable:
    the first cell of the first row supplies the metadata (variable name,
    label, section name) and the rows from the third onward supply the value
    codes. Tables without a recognizable variable name are skipped. When a
    variable name occurs in several tables, the last one wins in the
    dictionary, while its codes are appended every time.

    Args:
        html_content: HTML text content

    Returns:
        dict_df: DataFrame with variable dictionary
        codes_df: DataFrame with value codes

        Both frames always carry their full column set, even when nothing
        could be parsed, so callers can select columns without checking.
    """
    print("   Parsing HTML codebook...")

    dict_columns = ['column', 'base_variable', 'description', 'section_name',
                    'has_codes', 'value_codes', 'source']
    code_columns = ['variable', 'code', 'label']

    # The name is matched lazily up to the "Question" field that directly
    # follows it (presumably because get_text() glues adjacent fields together).
    var_pattern = re.compile(r'SAS Variable Name:\s*([A-Z_][A-Z0-9_]*?)(?=Question)')
    # DOTALL: a label or section name that wraps over several lines must
    # still match; the whitespace is collapsed afterwards.
    label_pattern = re.compile(r'Label:\s*(.+?)Section Name', re.DOTALL)
    # Stop at whichever numbering field follows the section name. These are
    # the same markers parse_pdf_codebook strips from its section names.
    section_pattern = re.compile(
        r'Section Name:\s*(.+?)(?=(?:Core\s*)?Section Number|Module Number)',
        re.DOTALL,
    )

    soup = BeautifulSoup(html_content, 'html.parser')

    variables_dict = {}
    codes_data = []

    for table in soup.find_all('table'):
        table_text = table.get_text().replace('\xa0', ' ')
        if 'SAS Variable' not in table_text:
            continue

        rows = table.find_all('tr')
        if len(rows) < 2:
            continue

        metadata_cell = rows[0].find(['td', 'th'])
        if not metadata_cell:
            continue

        metadata_text = metadata_cell.get_text().replace('\xa0', ' ')

        # Extract variable name
        var_match = var_pattern.search(metadata_text)
        if not var_match:
            continue
        var_name = var_match.group(1)

        # Extract description from Label
        label_match = label_pattern.search(metadata_text)
        if label_match:
            description = re.sub(r'\s+', ' ', label_match.group(1).strip())
        else:
            description = f"Variable {var_name}"

        # Extract section name
        section_match = section_pattern.search(metadata_text)
        if section_match:
            section_name = re.sub(r'\s+', ' ', section_match.group(1).strip())
        else:
            section_name = "Unknown"

        # Extract value codes from the table rows; the first two rows are
        # skipped (row 0 is the metadata row, row 1 presumably the header).
        value_codes = {}
        for row in rows[2:]:
            cells = row.find_all(['td', 'th'])
            if len(cells) < 2:
                continue

            code_value = cells[0].get_text(strip=True).replace('\xa0', ' ')
            code_label = cells[1].get_text(strip=True).replace('\xa0', ' ')

            # Clean label: drop skip instructions and trailing notes
            code_label = re.sub(r'Go to Section.*$', '', code_label).strip()
            code_label = re.sub(r'Notes:.*$', '', code_label).strip()

            # Very long "labels" are treated as stray text, not as a code label
            if code_value and code_label and len(code_label) < 200:
                value_codes[code_value] = code_label

        # Store
        has_codes = len(value_codes) > 0
        value_codes_str = ';'.join(f"{code}={label}" for code, label in value_codes.items())

        variables_dict[var_name] = {
            'column': var_name,
            'base_variable': var_name,
            'description': description,
            'section_name': section_name,
            'has_codes': has_codes,
            'value_codes': value_codes_str,
            'source': 'BRFSS Survey - HTML Codebook'
        }

        codes_data.extend(
            {'variable': var_name, 'code': code, 'label': label}
            for code, label in value_codes.items()
        )

    # Explicit columns keep the schema stable when nothing was parsed
    dict_df = pd.DataFrame(list(variables_dict.values()), columns=dict_columns)
    codes_df = pd.DataFrame(codes_data, columns=code_columns)

    print(f"      ✓ HTML: {len(dict_df)} variables, {len(codes_df)} value codes")

    return dict_df, codes_df

def parse_pdf_codebook(pdf_content):
    """
    Parse PDF BRFSS codebook into a variable dictionary and value codes.

    The text of every page is extracted with pdfplumber and split into one
    chunk per "Label:" occurrence, because the label precedes the
    "SAS Variable Name:" field of the same variable. Chunks without a
    variable name are skipped. When a variable name occurs more than once,
    the last chunk wins in the dictionary, while its codes are appended
    every time.

    Args:
        pdf_content: Raw bytes of the PDF file

    Returns:
        dict_df: DataFrame with variable dictionary
        codes_df: DataFrame with value codes

        Both frames always carry their full column set, even when nothing
        could be parsed.
    """
    print("   Parsing PDF codebook...")

    import pdfplumber
    import io

    dict_columns = ['column', 'base_variable', 'description', 'section_name',
                    'has_codes', 'value_codes', 'source']
    code_columns = ['variable', 'code', 'label']

    # Field names that can end up glued directly onto the variable name
    glued_fields = ('Question', 'Section', 'Column', 'Notes', 'Prologue')

    variables_dict = {}
    codes_data = []

    # Extract text (pages without extractable text are skipped)
    with pdfplumber.open(io.BytesIO(pdf_content)) as pdf:
        page_texts = [page.extract_text() for page in pdf.pages]
    full_text = "".join(f"{page_text}\n" for page_text in page_texts if page_text)

    print(f"      PDF: Extracted {len(full_text):,} characters")

    # Split by "Label:" not "SAS Variable Name:", because Label comes
    # BEFORE Variable Name in each section
    sections = re.split(r'(?=Label:)', full_text)

    for section in sections:
        # Skip if missing key components
        if 'Label:' not in section or 'SAS Variable Name:' not in section:
            continue

        # Extract variable name
        var_match = re.search(r'SAS Variable Name:\s*([A-Z_][A-Z0-9_]+)', section)
        if not var_match:
            continue

        var_name = var_match.group(1)

        # Concatenation repair: when the next field is glued onto the name
        # ("GENHLTHQuestion"), the uppercase-only pattern swallows that
        # field's first letter. The check therefore starts AT the last
        # matched character; a name that merely ends in Q/S/C/N/P and is
        # followed by a separate field is left intact.
        if len(var_name) > 3 and section.startswith(glued_fields, var_match.end() - 1):
            var_name = var_name[:-1]

        # Extract Label (description) - same section, comes before Variable Name
        label_match = re.search(r'Label:\s*([^\n]+)', section)
        if label_match:
            description = label_match.group(1).strip()
            # Clean up
            description = re.sub(r'Section Name:.*', '', description).strip()
            description = re.sub(r'\s+', ' ', description)
        else:
            description = f"Variable {var_name}"

        # Extract Section Name
        section_match = re.search(r'Section Name:\s*([^\n]+)', section)
        if section_match:
            section_name = section_match.group(1).strip()
            # Clean up: drop numbering fields that share the line
            section_name = re.sub(r'Section Number:.*', '', section_name).strip()
            section_name = re.sub(r'Core Section.*', '', section_name).strip()
            section_name = re.sub(r'Module Number:.*', '', section_name).strip()
            section_name = re.sub(r'\s+', ' ', section_name)
        else:
            section_name = "Unknown"

        # Extract value codes
        value_codes = {}

        # Find value table section
        value_match = re.search(r'Value\s+Value Label', section, re.IGNORECASE)
        if value_match:
            # Get everything after the Value/Value Label header
            value_section = section[value_match.end():]

            # Match lines: "1    Yes" or "BLANK    Not asked".
            # NOTE: the scan does not stop at the first non-data line; every
            # matching line among the first 50 is taken.
            lines = value_section.split('\n')

            for line in lines[:50]:  # Max 50 lines scanned per variable
                # Match pattern: number/BLANK followed by text
                code_match = re.match(r'^\s*(\d+|BLANK)\s+(.+)', line)
                if not code_match:
                    continue

                code = code_match.group(1)
                label = code_match.group(2).strip()

                # Clean label
                label = re.sub(r'Go to Section.*$', '', label).strip()
                label = re.sub(r'Notes:.*$', '', label).strip()
                label = re.sub(r'\s+', ' ', label)

                # Remove frequency/percentage numbers at end. The frequency
                # may carry thousands separators ("75,315"), which a plain
                # \d+ would not match.
                label = re.sub(r'\s+[\d,]+\s+\d+\.\d+\s+\d+\.\d+\s*$', '', label).strip()
                label = re.sub(r'\s+\d+\.\d+\s*$', '', label).strip()

                if label and len(label) < 200 and not label.startswith('Label:'):
                    value_codes[code] = label

        # Store
        has_codes = len(value_codes) > 0
        value_codes_str = ';'.join(f"{code}={label}" for code, label in value_codes.items())

        variables_dict[var_name] = {
            'column': var_name,
            'base_variable': var_name,
            'description': description,
            'section_name': section_name,
            'has_codes': has_codes,
            'value_codes': value_codes_str,
            'source': 'BRFSS Survey - PDF Codebook'
        }

        codes_data.extend(
            {'variable': var_name, 'code': code, 'label': label}
            for code, label in value_codes.items()
        )

    # Explicit columns keep the schema stable when nothing was parsed
    dict_df = pd.DataFrame(list(variables_dict.values()), columns=dict_columns)
    codes_df = pd.DataFrame(codes_data, columns=code_columns)

    print(f"      ✓ PDF: {len(dict_df)} variables, {len(codes_df)} value codes")

    return dict_df, codes_df


def parse_codebook(codebook_url, format_type='auto'):
    """
    Download and parse a BRFSS codebook (auto-detects HTML vs PDF).

    This is a best-effort helper: every failure (missing URL, unknown or
    unsupported format, non-200 response, download or parse error) is
    reported on stdout and answered with two empty DataFrames instead of an
    exception, so callers can always unpack the result.

    Args:
        codebook_url: URL to codebook
        format_type: 'auto', 'html', or 'pdf' (case-insensitive). With
            'auto' the format is guessed from '.html' / '.pdf' in the URL.

    Returns:
        dict_df: DataFrame with variable dictionary
        codes_df: DataFrame with value codes
    """
    if not codebook_url:
        print("   ⚠ No codebook URL provided")
        return pd.DataFrame(), pd.DataFrame()

    format_type = str(format_type).lower()

    # Auto-detect format
    if format_type == 'auto':
        lowered_url = codebook_url.lower()
        if '.html' in lowered_url:
            format_type = 'html'
        elif '.pdf' in lowered_url:
            format_type = 'pdf'
        else:
            print(f"   ⚠ Unknown format: {codebook_url}")
            return pd.DataFrame(), pd.DataFrame()

    # Reject anything the parsers cannot handle before downloading; without
    # this guard the function would fall through and implicitly return None.
    if format_type not in ('html', 'pdf'):
        print(f"   ⚠ Unsupported format type: {format_type}")
        return pd.DataFrame(), pd.DataFrame()

    # Download
    print(f"   Downloading {format_type.upper()} codebook...")
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        response = requests.get(codebook_url, headers=headers, timeout=60)

        if response.status_code != 200:
            print(f"   ⚠ Download failed: Status {response.status_code}")
            return pd.DataFrame(), pd.DataFrame()

        print(f"      ✓ Downloaded {len(response.content):,} bytes")

        # Parse based on format: the HTML parser takes decoded text, the
        # PDF parser takes the raw bytes
        if format_type == 'html':
            return parse_html_codebook(response.text)
        return parse_pdf_codebook(response.content)

    except Exception as e:
        # Deliberate catch-all: download and parsing are both best-effort
        print(f"   ⚠ Error: {str(e)}")
        return pd.DataFrame(), pd.DataFrame()


def create_basic_dictionary(data_columns, year):
    """
    Create basic dictionary when codebook parsing fails.

    Each data column gets a placeholder entry: the column name doubles as
    the base variable, the description points to the PDF codebook, and no
    value codes are recorded.

    Args:
        data_columns: List of column names from data
        year: Year for reference (only used in the 'source' text)

    Returns:
        dict_df: Basic DataFrame with variable names only; it keeps its full
            column set even when data_columns is empty
        codes_df: Empty DataFrame with columns variable/code/label
    """
    dict_columns = ['column', 'base_variable', 'description', 'section_name',
                    'has_codes', 'value_codes', 'source']

    dict_data = [
        {
            'column': col,
            'base_variable': col,
            'description': f'Variable {col} (see PDF codebook)',
            'section_name': 'Unknown',
            'has_codes': False,
            'value_codes': '',
            'source': f'BRFSS {year} - Basic dictionary'
        }
        for col in data_columns
    ]

    # Explicit columns keep the schema stable for an empty column list,
    # matching the always-present columns of codes_df
    dict_df = pd.DataFrame(dict_data, columns=dict_columns)
    codes_df = pd.DataFrame(columns=['variable', 'code', 'label'])

    return dict_df, codes_df


# Smoke test: download the 2019 HTML codebook and show what was parsed
if __name__ == "__main__":
    print("="*80, "🧪 TESTING CODEBOOK PARSERS", "="*80, "", sep="\n")

    # Test HTML (2019)
    print("TEST 1: HTML Codebook (2019)", "-"*80, sep="\n")
    html_url = "https://www.cdc.gov/brfss/annual_data/2019/pdf/codebook19_llcp-v2-508.HTML"
    dict_df, codes_df = parse_codebook(html_url, format_type='html')
    print(f"Results: {len(dict_df)} vars, {len(codes_df)} codes\n")

    # Only preview the dictionary when at least one variable was parsed
    if len(dict_df) > 0:
        preview = dict_df[['column', 'description', 'section_name']].head(5)
        print("Sample variables:")
        print(preview.to_string(index=False))

    print("\n" + "="*80, "Ready to test PDF parsing with other years!", "="*80, sep="\n")

🧪 TESTING CODEBOOK PARSERS

TEST 1: HTML Codebook (2019)
--------------------------------------------------------------------------------
   Downloading HTML codebook...
      ✓ Downloaded 1,863,648 bytes
   Parsing HTML codebook...
      ✓ HTML: 342 variables, 1900 value codes
Results: 342 vars, 1900 codes

Sample variables:
column     description          section_name
_STATE State FIPS Code Record Identification
FMONTH      File Month Record Identification
 IDATE  Interview Date Record Identification
IMONTH Interview Month Record Identification
  IDAY   Interview Day Record Identification

Ready to test PDF parsing with other years!


In [None]:
"""
COMPRESS AND SAVE OUTPUT (RECURSIVE + TIMESTAMPED + SAFE DOWNLOAD)
==================================================================
Zips the entire extraction folder (including all subfolders),
adds a timestamp to the filename, and ensures Colab's download works
even if 'files' was accidentally reassigned.
"""

import zipfile
import os
from datetime import datetime
from pathlib import Path

print("="*80)
print("📦 COMPRESSING BRFSS DATA (RECURSIVE + TIMESTAMPED + SAFE DOWNLOAD)")
print("="*80)
print()

# --- Set up paths ---
source_dir = '/content/brfss_complete_1999_2024'   # Folder to compress
output_dir = '/content/outputs'                    # Where to save the zip
os.makedirs(output_dir, exist_ok=True)

# --- Timestamped ZIP name ---
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
zip_name = f"brfss_complete_1999_2024_{timestamp}.zip"
output_zip = os.path.join(output_dir, zip_name)

# --- Verify source folder exists ---
if not os.path.exists(source_dir):
    raise FileNotFoundError(f"❌ Source folder not found: {source_dir}")

print(f"Source folder: {source_dir}")
print(f"Output file:   {output_zip}")
print()
print("Compressing... (this may take a few minutes)")
print()

# --- Collect every file once (recursively) ---
# The walk lives inside a comprehension so its loop variables stay local:
# a top-level `for root, dirs, files in os.walk(...)` would rebind the
# notebook-wide names `files` and `file`, clobbering an earlier
# `from google.colab import files`.
paths_to_zip = [
    os.path.join(dirpath, filename)
    for dirpath, _, filenames in os.walk(source_dir)
    for filename in filenames
]
total_files = len(paths_to_zip)
print(f"Files to compress: {total_files}")
print()

# --- Create the ZIP file ---
# Archive names are relative to the parent of source_dir, so the archive
# contains the top-level folder itself.
archive_base = os.path.dirname(source_dir)
files_done = 0
with zipfile.ZipFile(output_zip, 'w', zipfile.ZIP_DEFLATED) as zipf:
    for files_done, file_path in enumerate(paths_to_zip, start=1):
        arcname = os.path.relpath(file_path, archive_base)
        zipf.write(file_path, arcname)

        # Update progress periodically ('\r' rewrites the same console line)
        if files_done % 10 == 0 or files_done == total_files:
            print(f"  Progress: {files_done}/{total_files} files...", end='\r')

print(f"\n✅ Compression complete! ({files_done} files total)\n")

# --- Check resulting file size ---
zip_size_mb = os.path.getsize(output_zip) / (1024 * 1024)
print(f"Compressed file size: {zip_size_mb:.2f} MB")

print("="*80)
print("📥 DOWNLOAD YOUR DATA BELOW:")
print("="*80)

# --- Trigger the Colab download ---
# Imported under an alias so the download works regardless of what the
# name `files` currently refers to. Outside Colab the import fails and the
# path is printed for manual download instead.
try:
    from google.colab import files as colab_files
    colab_files.download(output_zip)
    print(f"\n💾 Saved file name: {zip_name}")
    print(f"📂 Output directory: {output_dir}")
except Exception as e:
    print("⚠️ Could not automatically download the file.")
    print("You can manually download it from:")
    print(output_zip)
    print(f"\nError: {e}")

print("="*80)



📦 COMPRESSING BRFSS DATA (RECURSIVE + TIMESTAMPED + SAFE DOWNLOAD)

Source folder: /content/brfss_complete_1999_2024
Output file:   /content/outputs/brfss_complete_1999_2024_2025-10-31_09-23-03.zip

Compressing... (this may take a few minutes)

Files to compress: 77

  Progress: 77/77 files...
✅ Compression complete! (77 files total)

Compressed file size: 1423.96 MB
📥 DOWNLOAD YOUR DATA BELOW:


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


💾 Saved file name: brfss_complete_1999_2024_2025-10-31_09-23-03.zip
📂 Output directory: /content/outputs
