# Cell 1 — Setup & imports

In [1]:
# %pip install openpyxl pandas

import re, json
from dataclasses import dataclass, asdict
from pathlib import Path
from typing import Optional
import pandas as pd
import openpyxl

# openpyxl compat for coordinate helpers
try:
    from openpyxl.utils.cell import get_column_letter, column_index_from_string
except ImportError:
    from openpyxl.utils import get_column_letter, column_index_from_string


# Cell 2 — Point to your file & load (values + styles)

In [None]:
WB_PATH = Path("../data/LIAS_Senegal.xlsx")

# data_only=True gives cached displayed values; False gives formulas + styles
wb_vals = openpyxl.load_workbook(WB_PATH, data_only=True, read_only=True)
wb = openpyxl.load_workbook(WB_PATH, data_only=False, read_only=False)

print("Sheets:", wb.sheetnames)

Sheets: ['B', 'P', 'C', 'L', 'M', 'O', 'R', 'G', 'S', 'CH']


# Cell 3 — Utilities (sizes → pixels, merged lookup, helpers)

In [3]:
def default_col_width_chars():  # Excel default width in "characters"
    return 8.43

def col_width_px(ws, col_idx: int) -> int:
    """Approximate column width in pixels (Excel's character units → px)."""
    letter = get_column_letter(col_idx)
    dim = ws.column_dimensions.get(letter)
    width_chars = dim.width if (dim and dim.width is not None) else default_col_width_chars()
    # MS docs approximation: px ≈ trunc(7 * width + 5)
    return int(7 * width_chars + 5)

def row_height_px(ws, row_idx: int) -> int:
    """Row height points → pixels (default ~15 pt)."""
    dim = ws.row_dimensions.get(row_idx)
    pts = dim.height if (dim and dim.height is not None) else 15.0
    return int(round(pts * 96.0 / 72.0))  # 96 dpi

def cell_top_left_px(ws, row_idx: int, col_idx: int) -> tuple[int,int]:
    x = sum(col_width_px(ws, c) for c in range(1, col_idx))
    y = sum(row_height_px(ws, r) for r in range(1, row_idx))
    return x, y

def merged_master_map(ws):
    """Map each cell address to its merged range master (top-left) & full range."""
    out = {}
    for cr in ws.merged_cells.ranges:
        min_col, min_row, max_col, max_row = cr.bounds  # (c1, r1, c2, r2)
        master = f"{get_column_letter(min_col)}{min_row}"
        rng = f"{get_column_letter(min_col)}{min_row}:{get_column_letter(max_col)}{max_row}"
        for r in range(min_row, max_row+1):
            for c in range(min_col, max_col+1):
                out[f"{get_column_letter(c)}{r}"] = (master, rng, (min_row, min_col, max_row, max_col))
    return out

def all_caps(s: str) -> bool:
    letters = "".join(ch for ch in s if ch.isalpha())
    return bool(letters) and letters.isupper()

def looks_like_title(ws, addr, text: str) -> bool:
    """Heuristic: merged, bold, bigger font, or ALL CAPS & lengthier."""
    cell = ws[addr]
    font = cell.font
    is_bold = bool(font and font.bold)
    big = bool(font and font.sz and font.sz >= 12)
    return is_bold or big or all_caps(text)


# Cell 4 — Data structures

In [4]:
@dataclass
class TextItem:
    sheet: str
    addr: str
    row: int
    col: int
    text: str
    is_formula_cell: bool
    font_name: Optional[str]
    font_size: Optional[float]
    bold: Optional[bool]
    italic: Optional[bool]
    h_align: Optional[str]
    v_align: Optional[str]
    merged: bool
    merged_master: Optional[str]
    merged_range: Optional[str]
    top_px: int
    left_px: int
    width_px: int
    height_px: int
    is_title_like: bool


# Cell 5 — Extract all cell text with position & style

In [None]:
# === Cell 5 — FAST text extraction with early stop, style toggle, and O(1) geometry ===
from functools import lru_cache
from openpyxl.utils import range_boundaries

# -------- Tunables (adjust as needed) --------
MAX_CELLS_PER_SHEET = 750_000        # if used range estimate exceeds this, switch to sparse scan
STOP_AFTER_EMPTY_TEXT_ROWS = 100     # early-stop after this many consecutive text-empty rows
MAX_NEARBY_TEXT_LEN = 300            # cap any single cell's text length
CAPTURE_STYLES = False               # set True if you need fonts/alignment now (slower)
LIMIT_TEXT_ITEMS_PER_SHEET = 0       # 0 = no cap; else truncate per sheet (e.g., 8000)

# -------- Helpers --------
def used_bounds(ws_like):
    """Robust bounds for read-only sheets; returns (min_c, min_r, max_c, max_r)."""
    try:
        dim = ws_like.calculate_dimension()  # e.g., 'A1:EO2607'
        return range_boundaries(dim)
    except ValueError:
        try:
            dim = ws_like.calculate_dimension(force=True)
            return range_boundaries(dim)
        except TypeError:
            # Fallback streaming scan
            min_r = min_c = 10**9
            max_r = max_c = 0
            for r_idx, row in enumerate(ws_like.iter_rows(values_only=True), start=1):
                for c_idx, v in enumerate(row, start=1):
                    if v is None or (isinstance(v, str) and not v.strip()):
                        continue
                    if r_idx < min_r: min_r = r_idx
                    if r_idx > max_r: max_r = r_idx
                    if c_idx < min_c: min_c = c_idx
                    if c_idx > max_c: max_c = c_idx
            if max_r == 0:
                return (1, 1, 1, 1)
            return (min_c, min_r, max_c, max_r)

def _row_has_text(vals_row):
    return any(isinstance(v, str) and v.strip() for v in vals_row)

@lru_cache(None)
def _col_width_px(ws_name, col_idx):
    return col_width_px(wb[ws_name], col_idx)

@lru_cache(None)
def _row_height_px(ws_name, row_idx):
    return row_height_px(wb[ws_name], row_idx)

def build_cums(sheet_name, max_c, max_r):
    """Prefix sums for column widths & row heights → O(1) cell geometry."""
    col_cum = [0]*(max_c+1)
    for c in range(1, max_c+1):
        col_cum[c] = col_cum[c-1] + _col_width_px(sheet_name, c)
    row_cum = [0]*(max_r+1)
    for r in range(1, max_r+1):
        row_cum[r] = row_cum[r-1] + _row_height_px(sheet_name, r)
    return col_cum, row_cum

def _merged_master_map(ws):
    return merged_master_map(ws)  # from earlier cell

def _append_item_from_rec(ws, rec, col_cum, row_cum):
    # Geometry from prefix sums
    left_px   = col_cum[rec["c1"] - 1]
    top_px    = row_cum[rec["r1"] - 1]
    width_px  = col_cum[rec["c2"]] - col_cum[rec["c1"] - 1]
    height_px = row_cum[rec["r2"]] - row_cum[rec["r1"] - 1]

    if CAPTURE_STYLES:
        cell_fmt = ws[rec["addr"]]
        font  = cell_fmt.font
        align = cell_fmt.alignment
        is_formula = isinstance(cell_fmt.value, str) and cell_fmt.value.startswith("=")
        font_name  = font.name if font else None
        font_size  = float(font.sz) if (font and font.sz) else None
        bold       = bool(font.bold) if font else None
        italic     = bool(font.italic) if font else None
        h_align    = align.horizontal if align else None
        v_align    = align.vertical if align else None
        title_like = looks_like_title(ws, rec["addr"], rec["text"])
    else:
        is_formula = font_name = font_size = bold = italic = h_align = v_align = None
        title_like = all_caps(rec["text"])

    items.append(TextItem(
        sheet=rec["sheet"],
        addr=f'{get_column_letter(rec["c1"])}{rec["r1"]}',
        row=rec["r1"], col=rec["c1"],
        text=rec["text"],
        is_formula_cell=is_formula,
        font_name=font_name, font_size=font_size,
        bold=bold, italic=italic,
        h_align=h_align, v_align=v_align,
        merged=rec["merged"],
        merged_master=rec["addr"] if rec["merged"] else None,
        merged_range=rec["merged_range"],
        top_px=top_px, left_px=left_px,
        width_px=width_px, height_px=height_px,
        is_title_like=title_like,
    ))

# -------- Main extraction --------
items = []
named_ranges = []

for sname in wb.sheetnames:
    ws      = wb[sname]       # styles + merges
    ws_vals = wb_vals[sname]  # displayed values

    min_c, min_r, max_c, max_r = used_bounds(ws_vals)
    est_cells = (max_c - min_c + 1) * (max_r - min_r + 1)
    print(f"[{sname}] used ~ {get_column_letter(min_c)}{min_r}:{get_column_letter(max_c)}{max_r} "
          f"({est_cells:,} cells)")

    merge_map = _merged_master_map(ws)

    kept_records = []
    seen_addr = set()
    empty_streak = 0

    if est_cells <= MAX_CELLS_PER_SHEET:
        # Dense scan within used bounds, with early stop on trailing text-empty rows
        row_iter = ws_vals.iter_rows(min_row=min_r, max_row=max_r,
                                     min_col=min_c, max_col=max_c, values_only=True)
        rr = min_r
        for vals_row in row_iter:
            if not _row_has_text(vals_row):
                empty_streak += 1
                if empty_streak >= STOP_AFTER_EMPTY_TEXT_ROWS:
                    print(f"  ↳ early stop after {STOP_AFTER_EMPTY_TEXT_ROWS} text-empty rows at row {rr}")
                    break
            else:
                empty_streak = 0
                for cc, val in enumerate(vals_row, start=min_c):
                    if isinstance(val, str):
                        t = val.strip()
                        if not t:
                            continue
                        if len(t) > MAX_NEARBY_TEXT_LEN:
                            t = t[:MAX_NEARBY_TEXT_LEN]
                        addr = f"{get_column_letter(cc)}{rr}"
                        if addr in merge_map:
                            master, rng, (r1, c1, r2, c2) = merge_map[addr]
                            if addr != master or master in seen_addr:
                                continue
                            kept_records.append({
                                "sheet": sname, "addr": master,
                                "r1": r1, "c1": c1, "r2": r2, "c2": c2,
                                "text": t, "merged": True, "merged_range": rng
                            })
                            seen_addr.add(master)
                        else:
                            if addr in seen_addr:
                                continue
                            kept_records.append({
                                "sheet": sname, "addr": addr,
                                "r1": rr, "c1": cc, "r2": rr, "c2": cc,
                                "text": t, "merged": False, "merged_range": None
                            })
                            seen_addr.add(addr)
            rr += 1
    else:
        # Sparse scan over existing formatted cells only
        print("  ↳ large range: sparse scan over existing cells")
        for addr, cell in ws._cells.items():  # internal but fast
            try:
                val = ws_vals[addr].value
            except Exception:
                val = None
            if isinstance(val, str):
                t = val.strip()
                if not t:
                    continue
                if len(t) > MAX_NEARBY_TEXT_LEN:
                    t = t[:MAX_NEARBY_TEXT_LEN]
                if addr in merge_map:
                    master, rng, (r1, c1, r2, c2) = merge_map[addr]
                    if addr != master or master in seen_addr:
                        continue
                    kept_records.append({
                        "sheet": sname, "addr": master,
                        "r1": r1, "c1": c1, "r2": r2, "c2": c2,
                        "text": t, "merged": True, "merged_range": rng
                    })
                    seen_addr.add(master)
                else:
                    if addr in seen_addr:
                        continue
                    r = cell.row; c = cell.column
                    kept_records.append({
                        "sheet": sname, "addr": addr,
                        "r1": r, "c1": c, "r2": r, "c2": c,
                        "text": t, "merged": False, "merged_range": None
                    })
                    seen_addr.add(addr)

    # Optional per-sheet cap (quick preview mode)
    if LIMIT_TEXT_ITEMS_PER_SHEET and len(kept_records) > LIMIT_TEXT_ITEMS_PER_SHEET:
        print(f"  ↳ truncating to first {LIMIT_TEXT_ITEMS_PER_SHEET} text cells for speed")
        kept_records = kept_records[:LIMIT_TEXT_ITEMS_PER_SHEET]

    # Build geometry prefix sums once, then emit items
    col_cum, row_cum = build_cums(sname, max_c, max_r)
    print(f"  ↳ emitting {len(kept_records)} text cell(s) (styles={'on' if CAPTURE_STYLES else 'off'})")
    for rec in kept_records:
        _append_item_from_rec(ws, rec, col_cum, row_cum)

# --- Named ranges ---
def iter_defined_names_compat(wb):
    dn_container = getattr(wb, "defined_names", None)
    if dn_container is None:
        return []
    # Older openpyxl: a .definedName attribute holds the list
    if hasattr(dn_container, "definedName"):
        return list(dn_container.definedName)
    # Newer: the container itself is iterable
    try:
        return list(dn_container)
    except TypeError:
        pass
    # Fallback: mapping-like API
    names = getattr(dn_container, "names", None)
    if isinstance(names, dict):
        return list(names.values())
    return []

named_ranges = []
for dn in iter_defined_names_compat(wb):
    name = getattr(dn, "name", "") or ""
    # Skip Excel built-ins like print areas
    if name.startswith("_xlnm."):
        continue
    dtype = getattr(dn, "type", None)
    text  = (getattr(dn, "value", None)
             or getattr(dn, "attr_text", None)
             or getattr(dn, "attr", None)
             or "")
    named_ranges.append({"name": name, "type": dtype, "text": str(text)})

print(f"Captured {len(items)} text items total across {len(wb.sheetnames)} sheet(s).")
print("Named ranges:", [nr["name"] for nr in named_ranges][:12])



[B] used ~ A1:IL2811 (691,506 cells)
  ↳ early stop after 100 text-empty rows at row 2711
  ↳ emitting 11691 text cell(s) (styles=off)
[P] used ~ A1:Q93 (1,581 cells)
  ↳ emitting 213 text cell(s) (styles=off)
[C] used ~ A1:CZ938 (97,552 cells)
  ↳ emitting 5497 text cell(s) (styles=off)
[L] used ~ A1:BO241 (16,147 cells)
  ↳ emitting 1086 text cell(s) (styles=off)
[M] used ~ A1:DA1871 (196,455 cells)
  ↳ emitting 10488 text cell(s) (styles=off)
[O] used ~ A1:BY188 (14,476 cells)
  ↳ emitting 384 text cell(s) (styles=off)
[R] used ~ A1:BK850 (53,550 cells)
  ↳ emitting 8119 text cell(s) (styles=off)
[G] used ~ A1:BE184 (10,488 cells)
  ↳ emitting 833 text cell(s) (styles=off)
[S] used ~ A1:BE168 (9,576 cells)
  ↳ emitting 1854 text cell(s) (styles=off)
[CH] used ~ A1:E7 (35 cells)
  ↳ emitting 19 text cell(s) (styles=off)
Captured 40184 text items total across 10 sheet(s).
Named ranges: ['', '']


# Cell 6 — Save to CSV/NDJSON for inspection & later use

In [None]:
df = pd.DataFrame([asdict(i) for i in items])
df.sort_values(["sheet","top_px","left_px"], inplace=True)
out_csv = WB_PATH.with_suffix(".sheet_content_map.csv")
df.to_csv(out_csv, index=False)
print("Wrote:", out_csv)

Wrote: LIAS_Senegal.sheet_content_map.csv
