In [None]:
# %% [markdown]
# # NY BCS Building Age Extraction (2024 Building Condition Survey)
#
# This notebook:
# 1. Extracts PDFs from the official **2024 Building Condition Survey (BCS)** ZIP file.
# 2. Parses each PDF for "Building Age" information using `pdfplumber`.
# 3. Saves all extracted results to an Excel file.
#
# You can choose between:
# - **TEST MODE:** Runs quickly on a limited number of PDFs.
# - **FULL MODE:** Processes the entire ZIP (may take a while).

# %% [code]
import pdfplumber
from pathlib import Path
import pandas as pd
import re
import zipfile
import shutil
import time

# %% [markdown]
# ## 1. Set up file paths and mode selection

# %% [code]
# Base paths for the input ZIP and the folder it is unpacked into.
# NOTE(review): this is a machine-specific absolute path; adjust it when
# running the notebook on another computer.
base_dir = Path(r"C:\Users\yegen\Downloads")
zip_path = base_dir / "2024_BUILDING_CONDITION_SURVEY_2024.zip"
extract_path = base_dir / "BCS_2024_Extracted"

# Choose mode:
# - "TEST": process a few PDFs (faster for debugging)
# - "FULL": process all PDFs (takes longer)
RUN_MODE = "TEST"  # Change to "FULL" when ready to run all files

# Number of files to process in test mode
test_limit = 10

# Remove the previously extracted folder so the extraction step starts from a
# clean directory. This is only done when the ZIP is actually present: if the
# ZIP is missing, the extraction step raises FileNotFoundError, and deleting
# the earlier extraction beforehand would discard it with nothing to replace it.
if zip_path.exists() and extract_path.exists():
    shutil.rmtree(extract_path)

# %% [markdown]
# ## 2. Extract the ZIP file

# %% [code]
# Stop immediately when the archive is missing; otherwise unpack every member
# of the ZIP into the extraction folder.
if not zip_path.exists():
    raise FileNotFoundError(f"ZIP file not found: {zip_path}")

with zipfile.ZipFile(zip_path, "r") as archive:
    archive.extractall(extract_path)
print(f"Extracted files to: {extract_path}")

# %% [markdown]
# ## 3. Parse PDFs and extract "Building Age" information

# %% [code]
def _split_folder_name(folder_name):
    """Split a folder name into ``(bcs_id, school_name)``.

    For a name such as "800000033916-AVON-CSD-Bus-Garage" the leading digits
    before the hyphen become the BCS ID and the remainder becomes the school
    name, with hyphens replaced by spaces. When the name does not start with
    digits followed by a hyphen, the ID is "" and the whole folder name is
    used as the school name.
    """
    match = re.match(r"^(\d+)-(.+)$", folder_name)
    bcs_id, school_name = match.groups() if match else ("", folder_name)
    return bcs_id, school_name.replace("-", " ").strip()


def _extract_building_age(lines, window=30):
    """Collect label -> value pairs from the first "Building Age" section.

    Finds the first line containing "Building Age" and inspects that line and
    the ones after it (``window`` lines in total). A stripped line is taken as
    a label when it contains "Original Construction" or "Addition" and the
    stripped line directly after it is all digits or contains "(No Response)".

    Returns a dict of the matching pairs; it is empty when there is no
    "Building Age" line or no label/value pair matched.
    """
    for start, line in enumerate(lines):
        if "Building Age" not in line:
            continue
        section = [entry.strip() for entry in lines[start:start + window]]
        found = {}
        for label, value in zip(section, section[1:]):
            is_age_label = "Original Construction" in label or "Addition" in label
            is_age_value = value.isdigit() or "(No Response)" in value
            if is_age_label and is_age_value:
                found[label] = value
        # Only the first "Building Age" heading is considered.
        return found
    return {}


# %% [code]
start_time = time.time()

records = []
# Directory traversal does not guarantee any particular order, so the list is
# sorted: the TEST-mode subset and the output row order then stay the same
# from run to run. The suffix is compared case-insensitively so that ".PDF"
# files are not skipped on case-sensitive filesystems.
pdf_files = sorted(
    path
    for path in extract_path.rglob("*")
    if path.is_file() and path.suffix.lower() == ".pdf"
)

# Apply test limit if in TEST mode (surrounding whitespace and case are ignored
# so that a value like "test " does not silently trigger a full run).
if RUN_MODE.strip().upper() == "TEST":
    pdf_files = pdf_files[:test_limit]
    print(f"Running in TEST MODE: processing only the first {len(pdf_files)} PDF(s)")
else:
    print(f"Running in FULL MODE: processing all {len(pdf_files)} PDF(s)")

for pdf in pdf_files:
    folder_name = pdf.parent.name  # Example: "800000033916-AVON-CSD-Bus-Garage"
    print(f"Reading {folder_name}")
    bcs_id, school_name = _split_folder_name(folder_name)

    # Best effort: a PDF that cannot be opened or read is reported and skipped
    # so that one bad file does not abort the whole run.
    try:
        with pdfplumber.open(pdf) as doc:
            # Extract text from all pages; pages without text contribute "".
            text = "\n".join(page.extract_text() or "" for page in doc.pages)
    except Exception as e:
        print(f"Error in {pdf.name}: {e}")
        continue

    building_age = _extract_building_age(text.splitlines())
    if not building_age:
        print(f"No Building Age data found in {pdf.name}")
        continue

    records.append(
        {
            "File": pdf.name,
            "Folder": folder_name,
            "BCS_ID": bcs_id,
            "School_Name": school_name,
            **building_age,
        }
    )

end_time = time.time()

# %% [markdown]
# ## 4. Save extracted results to Excel

# %% [code]
# Write one row per PDF that yielded Building Age data. Nothing is written
# when no record was extracted.
if records:
    df = pd.DataFrame(records)
    output_path = base_dir / "NY_BCS_BuildingAge_2024.xlsx"
    df.to_excel(output_path, index=False)
    print(f"Saved extracted data to: {output_path}")
    # `records` only holds PDFs in which Building Age data was found, so this
    # is a count of extracted records, not of all PDFs that were processed.
    print(f"Total PDFs with Building Age data: {len(records)}")
else:
    print("No data extracted from any PDF.")

# %% [markdown]
# ## 5. Summary Statistics

# %% [code]
# The summary is printed even when nothing was extracted: that is the case in
# which the parsed/missing counts and the runtime are most useful. Counts come
# from `pdf_files` and `records` directly, so this cell does not depend on the
# DataFrame that is only created when at least one record exists.
total = len(pdf_files)
extracted = len(records)
missing = total - extracted
duration = end_time - start_time
minutes = duration / 60

print("Summary:")
print(f"  PDFs parsed: {total}")
print(f"  Records with Building Age data: {extracted}")
print(f"  Missing or unreadable files: {missing}")
print(f"  Runtime: {minutes:.2f} minutes")

# %% [markdown]
# ---
# ### Notebook Complete
# **Usage instructions:**
# 1. Confirm the ZIP file path is correct:
#    `C:\Users\yegen\Downloads\2024_BUILDING_CONDITION_SURVEY_2024.zip`
# 2. Set `RUN_MODE = "TEST"` to try on a few PDFs.
# 3. Once confirmed, set `RUN_MODE = "FULL"` to process all PDFs.
#
# The output file will be saved as:
# **C:\Users\yegen\Downloads\NY_BCS_BuildingAge_2024.xlsx**
