# Imports

In [2]:
from __future__ import annotations

import json, re, statistics
from dataclasses import dataclass, field
from pathlib import Path
from typing import Dict, List, Optional, Tuple

import fitz  # PyMuPDF
import pandas as pd
from unidecode import unidecode

from __future__ import annotations
from pathlib import Path
from typing import List, Tuple
import statistics
import re

print("test")


test


# Paths

In [3]:

# ============== Paths ==============
# Every location is derived from the directory the notebook is executed in.
BASE_DIR     = Path.cwd()

RAW_DIR      = BASE_DIR / "Raw"                 # PDFs live here
OUTPUT_DIR   = BASE_DIR / "Refined"             # Root of the final outputs
TEMP_DIR     = BASE_DIR / "temp"                # Root of the intermediate text files

# TEMP_DIR is absolute, so joining it after OUTPUT_DIR (as this cell used to do)
# silently discarded OUTPUT_DIR. The intermediate folders are spelled directly
# under TEMP_DIR, which is where they always ended up.
RAW_TXT_DIR  = TEMP_DIR / "raw_txt"             # Step 1 outputs
CLEAN_DIR    = TEMP_DIR / "clean"               # Step 2 outputs


LEY_DIR      = OUTPUT_DIR / "leyes"             # Step 3 outputs (ley)
DECR_DIR     = OUTPUT_DIR / "decretos"          # Step 3 outputs (decreto)
TRANS_DIR    = OUTPUT_DIR / "transitorios"      # Step 3 outputs (transitorios)

JSON_DIR     = OUTPUT_DIR / "json"              # Step 4 outputs (json)
ERRORES_DIR  = OUTPUT_DIR / "errores"           # error logs

# Create every output folder up front so later steps can write without checks.
for d in [OUTPUT_DIR, RAW_TXT_DIR, TEMP_DIR, CLEAN_DIR, LEY_DIR, DECR_DIR, TRANS_DIR, JSON_DIR, ERRORES_DIR]:
    d.mkdir(parents=True, exist_ok=True)

CATALOG_CSV = RAW_DIR / "index.csv"             # catalog consumed by load_catalog


# Utils

In [4]:
# ============== Utils ==============
def slugify(s: str) -> str:
    """Turn ``s`` into a lowercase ASCII slug made of ``[a-z0-9_]``.

    The text is transliterated with ``unidecode``, every run of other
    characters becomes a single underscore, and leading/trailing underscores
    are dropped. ``"x"`` is returned when nothing is left.
    """
    ascii_lower = unidecode(s).lower()
    underscored = re.sub(r"[^a-z0-9]+", "_", ascii_lower)
    slug = re.sub(r"_+", "_", underscored).strip("_")
    return slug if slug else "x"

def norm_lower(s: str) -> str:
    """Return ``s`` transliterated to ASCII, lowercased, trimmed, with whitespace runs collapsed."""
    trimmed = unidecode(s).lower().strip()
    return re.sub(r"\s+", " ", trimmed)

def caps_line(s: str) -> str:
    """Return ``s`` transliterated to ASCII, uppercased and stripped of surrounding whitespace."""
    transliterated = unidecode(s)
    return transliterated.upper().strip()

def write_text(path: Path, content: str) -> None:
    """Write ``content`` to ``path`` as UTF-8 text, replacing any existing file."""
    path.write_text(data=content, encoding="utf-8")

def write_error(base: str, kind: str, message: str, extra: Dict | None = None) -> None:
    """Persist one error record as pretty-printed JSON under ``ERRORES_DIR``.

    The file is named ``<slug(base)>_<slug(kind)>.json``, so a later error of
    the same kind for the same base replaces the earlier file.

    Args:
        base: Identifier of the document the error refers to (stored as ``file``).
        kind: Short error category (stored as ``kind``).
        message: Human-readable description.
        extra: Optional additional key/value pairs merged into the record.
    """
    record = dict(file=base, kind=kind, message=message)
    if extra:
        record.update(extra)
    target = ERRORES_DIR / f"{slugify(base)}_{slugify(kind)}.json"
    payload = json.dumps(record, ensure_ascii=False, indent=2)
    target.write_text(payload, encoding="utf-8")

def count_stats(text: str) -> Dict[str, int]:
    """Return line, word and character counts for ``text``.

    Lines are the number of newline characters plus one (zero for empty text),
    words are runs of non-whitespace, and chars is ``len(text)``.
    """
    line_total = text.count("\n")
    if text:
        line_total += 1
    word_total = sum(1 for _ in re.finditer(r"\S+", text))
    return {"lines": line_total, "words": word_total, "chars": len(text)}

def _norm_caps(s: str) -> str:
    """Return ``s`` as uppercase ASCII tokens of ``[A-Z0-9]`` separated by single spaces."""
    upper_ascii = unidecode(s).upper()
    tokens_only = re.sub(r"[^A-Z0-9]+", " ", upper_ascii)
    return re.sub(r"\s+", " ", tokens_only).strip()


In [5]:
# ============== Catalog ==============
@dataclass(frozen=True)
class LawMeta:
    """Immutable catalog entry for one law.

    ``first_two_caps`` is not passed to the constructor; it is derived from
    ``law_name`` after initialisation.
    """
    # Catalog columns, stored as the strings supplied by the caller.
    num_est: str
    file_num: str
    law_name: str
    link: str
    # First two words of the normalized law name, uppercased ("" if the name is empty).
    first_two_caps: str = field(init=False)

    def __post_init__(self):
        """Compute ``first_two_caps`` from the normalized ``law_name``."""
        words = norm_lower(self.law_name).split()
        leading = " ".join(words[:2])
        # The dataclass is frozen, so the derived field must bypass __setattr__.
        object.__setattr__(self, "first_two_caps", leading.upper())

def load_catalog(path: Path) -> Dict[str, LawMeta]:
    """Read the catalog CSV and index its rows by zero-padded ``file_num``.

    Column names are matched case-insensitively; every cell is read as a
    string and stripped. When two rows share a ``file_num`` the later row wins.

    Raises:
        ValueError: If any of ``num_est``, ``file_num``, ``law_name`` or
            ``link`` is missing from the CSV header.
    """
    frame = pd.read_csv(path, dtype=str, keep_default_na=False)
    frame.columns = [name.lower() for name in frame.columns]

    miss = {"num_est", "file_num", "law_name", "link"} - set(frame.columns)
    if miss:
        raise ValueError(f"CSV missing columns: {miss}")

    catalog: Dict[str, LawMeta] = {}
    for _, row in frame.iterrows():

        def cell(column: str) -> str:
            # Empty cells arrive as "" (keep_default_na=False); guard anyway.
            return (row[column] or "").strip()

        entry = LawMeta(
            num_est=cell("num_est"),
            file_num=cell("file_num").zfill(4),
            law_name=cell("law_name"),
            link=cell("link"),
        )
        catalog[entry.file_num] = entry
    return catalog

# Step 1 - PDF Reading

In [5]:
from pathlib import Path
from typing import List, Tuple
import fitz  # PyMuPDF

def read_pdf(pdf_path: Path) -> str:
    """Return the text of every page of ``pdf_path``, in page order.

    Each page's text is followed by a ``"\\n\\n"`` separator, including the
    last page.
    """
    with fitz.open(pdf_path) as document:
        page_texts = [
            document[index].get_text() + "\n\n"
            for index in range(len(document))
        ]
    return "".join(page_texts)


In [9]:
def step1_extract_raw(catalog_csv: Path = CATALOG_CSV) -> pd.DataFrame:
    """Step 1: extract the raw text of every catalogued PDF in ``RAW_DIR``.

    For each ``*.pdf`` whose zero-padded stem is a key of the catalog, the
    extracted text is written to ``RAW_TXT_DIR / raw_<base>.txt``. PDFs that
    are not in the catalog are logged through ``write_error`` and skipped.

    Args:
        catalog_csv: Path of the catalog CSV passed to ``load_catalog``.

    Returns:
        One row per extracted PDF with ``stage``, ``source_pdf``, ``base`` and
        the ``count_stats`` columns. When non-empty, the same table is saved
        as ``OUTPUT_DIR / manifest_raw.csv``.
    """
    catalog = load_catalog(catalog_csv)

    all_recs: List[Dict] = []
    for pdf_path in sorted(RAW_DIR.glob("*.pdf")):
        base = pdf_path.stem.zfill(4)
        if base not in catalog:
            write_error(base, "catalog_missing", "file_num not found in catalog", {"pdf": pdf_path.name})
            continue

        raw_layout = read_pdf(pdf_path)
        write_text(RAW_TXT_DIR / f"raw_{base}.txt", raw_layout)

        rec_raw = {"stage": "raw", "source_pdf": pdf_path.name, "base": base}
        rec_raw.update(count_stats(raw_layout))
        all_recs.append(rec_raw)

    df = pd.DataFrame(all_recs)
    if not df.empty:
        # Let pandas open the file itself. Rendering to a string and writing it
        # in text mode translates the line terminator twice on Windows
        # ("\r\r\n"), which shows up as blank rows in the manifest.
        df.to_csv(OUTPUT_DIR / "manifest_raw.csv", index=False, encoding="utf-8")
    print(f"[Step 1] Wrote {len(df)} raw txt files to {RAW_TXT_DIR.resolve()}")
    return df

# Run Step 1: extract raw text for every catalogued PDF and preview the manifest rows.
df_raw = step1_extract_raw(CATALOG_CSV)
df_raw.head()


[Step 1] Wrote 179 raw txt files to C:\Users\Edu\OneDrive\1.EmbeddingExploration\1.Pipelines\2.Processing\Leyes\19\temp\raw_txt


Unnamed: 0,stage,source_pdf,base,lines,words,chars
0,raw,0001.pdf,1,8180,63994,410635
1,raw,0002.pdf,2,20136,155322,953501
2,raw,0003.pdf,3,809,5242,35149
3,raw,0004.pdf,4,12262,96822,606118
4,raw,0005.pdf,5,5726,47363,303755


# Step 2 - Cleaning Raw TXT

In [48]:
# ============== Cleaning ==============
def clean_raw_text(raw: str, title_candidate: Optional[str] = None) -> str:
    """Clean extracted PDF text into one paragraph per line.

    Steps, in order:
      1. Drop carriage returns and squeeze runs of 3+ newlines down to two.
      2. Remove lines that contain only digits (page numbers) and strip
         trailing whitespace from the rest.
      3. Insert a blank line before a line starting with an article marker
         when the previous line is all uppercase, so the article starts its
         own paragraph.
      4. Join the lines of each paragraph with a space, keep one newline
         between paragraphs, and collapse repeated whitespace.

    Args:
        raw: Text as extracted from the PDF.
        title_candidate: Accepted for interface compatibility; not used.

    Returns:
        The cleaned text, stripped of leading and trailing whitespace.
    """
    text = re.sub(r"\n{3,}", "\n\n", raw.replace("\r", ""))

    page_number = re.compile(r'^\s*\d+\s*$')  # a line holding nothing but digits
    kept = [
        row.rstrip()
        for row in text.split("\n")
        if page_number.match(row) is None
    ]

    article_markers = ("Artículo", "Articulo", "Art.", "ART.", "ARTICULO", "ARTÍCULO")
    spaced: List[str] = []
    previous: Optional[str] = None
    for row in kept:
        starts_article = row.strip().startswith(article_markers)
        if previous is not None and previous.strip().isupper() and starts_article:
            # Force a paragraph break between an ALL-CAPS heading and the article.
            spaced.append("")
        spaced.append(row)
        previous = row

    joined = "\n".join(spaced)
    # Protect paragraph breaks, flatten the remaining newlines, then restore them.
    joined = re.sub(r"(\n\s*){2,}", "__PARAGRAPH_BREAK__", joined)
    joined = joined.replace("\n", " ").replace("__PARAGRAPH_BREAK__", "\n")
    joined = re.sub(r"\s{2,}", " ", joined)
    return joined.strip()

Transitorios split? ask harry
def split_transitorios_inline(line):
    """Split ``line`` around the first 'Transitorios' keyword it contains.

    The keyword may be written plainly, letter-spaced ("T R A N S I T O R I O S"),
    in singular, or as "Artículos transitorios"; matching is case-insensitive
    and accepts "articulos" with or without the accent.

    The search runs on ``line`` itself. Searching a normalized copy and then
    slicing the original (as this function used to do) puts the cut in the
    wrong place whenever normalization changes the string length, e.g. when it
    collapses repeated spaces.

    Returns:
        ``[before, keyword, after]`` with empty parts omitted, or ``[line]``
        when no keyword is found.
    """
    pattern = (
        r"(t\s*r\s*a\s*n\s*s\s*i\s*t\s*o\s*r\s*i\s*o\s*s"
        r"|transitorios"
        r"|art[ií]culos?\s+transitorios"
        r"|transitorio"
        r"|t\s*r\s*a\s*n\s*s\s*i\s*t\s*o\s*r\s*i\s*o\s*)"
    )
    match = re.search(pattern, line, flags=re.IGNORECASE)
    if match is None:
        return [line]

    before = line[:match.start()].rstrip()
    keyword = match.group(0).strip()
    after = line[match.end():].lstrip()
    return [part for part in (before, keyword, after) if part]



Mejor lo hare manual, son 2 txt


In [49]:
def step2_clean_raw(catalog_csv: Path = CATALOG_CSV) -> pd.DataFrame:
    """Step 2: clean the raw text files produced by Step 1.

    Reads every ``raw_*.txt`` in ``RAW_TXT_DIR``, cleans it with
    ``clean_raw_text`` and writes ``clean_<base>.txt`` to ``CLEAN_DIR``.
    Files whose base is not in the catalog are logged through ``write_error``
    and skipped.

    Args:
        catalog_csv: Path of the catalog CSV passed to ``load_catalog``.

    Returns:
        One row per cleaned file with ``stage``, ``source_raw``, ``base`` and
        the ``count_stats`` columns. When non-empty, the same table is saved
        as ``OUTPUT_DIR / manifest_clean.csv``.
    """
    catalog = load_catalog(catalog_csv)

    all_recs: List[Dict] = []
    for raw_file in sorted(RAW_TXT_DIR.glob("raw_*.txt")):
        # "raw_0001.txt" -> "0001"
        base = raw_file.stem.replace("raw_", "")

        if base not in catalog:
            write_error(base, "catalog_missing", "file_num not found in catalog (clean stage)",
                        {"raw_file": raw_file.name})
            continue

        raw_text = raw_file.read_text(encoding="utf-8")
        meta = catalog[base]
        cleaned_text = clean_raw_text(raw_text, meta.law_name)
        write_text(CLEAN_DIR / f"clean_{base}.txt", cleaned_text)

        rec_clean = {"stage": "clean", "source_raw": raw_file.name, "base": base}
        rec_clean.update(count_stats(cleaned_text))
        all_recs.append(rec_clean)

    df = pd.DataFrame(all_recs)
    if not df.empty:
        # Let pandas open the file itself. Rendering to a string and writing it
        # in text mode translates the line terminator twice on Windows
        # ("\r\r\n"), which shows up as blank rows in the manifest.
        df.to_csv(OUTPUT_DIR / "manifest_clean.csv", index=False, encoding="utf-8")
    print(f"[Step 2] Wrote {len(df)} cleaned txt files to {CLEAN_DIR.resolve()}")
    return df

# Run Step 2: clean every raw text file and preview the manifest rows.
df_clean = step2_clean_raw(CATALOG_CSV)
df_clean.head()


[Step 2] Wrote 179 cleaned txt files to C:\Users\Edu\OneDrive\1.EmbeddingExploration\1.Pipelines\2.Processing\Leyes\19\temp\clean


Unnamed: 0,stage,source_raw,base,lines,words,chars
0,clean,raw_0001.txt,1,2039,63844,399616
1,clean,raw_0002.txt,2,5977,155321,927505
2,clean,raw_0003.txt,3,190,5242,33688
3,clean,raw_0004.txt,4,3473,96822,590629
4,clean,raw_0005.txt,5,1356,47363,296286


# Step 3 - Splitting Decreto-Ley-Transitorios

In [None]:
# Structural header vocabulary, written without accents because it is matched
# against text that has already been passed through norm_lower.
HIER = [
    "disposiciones preliminares",
    "libro",
    "titulo",
    "capitulo",
    "seccion",
    "articulo",
    "disposiciones generales",
]

# A line opens a structural unit when, after optional leading whitespace,
# it starts with one of the HIER entries as a whole word.
HIER_RE = re.compile(
    r"^\s*(?:" + "|".join(map(re.escape, HIER)) + r")\b",
    re.IGNORECASE,
)

# 'Transitorios' heading at the start of a line: plain or letter-spaced,
# singular or plural, optionally preceded by "articulo(s)".
TRANS_RE = re.compile(
    r"^\s*(?:"
    r"t\s*r\s*a\s*n\s*s\s*i\s*t\s*o\s*r\s*i\s*o\s*s"
    r"|transitorios"
    r"|articulos?\s+transitorios"
    r"|transitorio"
    r"|t\s*r\s*a\s*n\s*s\s*i\s*t\s*o\s*r\s*i\s*o\s*"
    r")\b",
    re.IGNORECASE,
)

In [37]:

def _contains_allcaps_prefix(line: str, prefix_caps: str) -> bool:
    """Tell whether ``line`` contains ``prefix_caps`` as a run of whole tokens.

    Both strings are compared in their ``_norm_caps`` form (uppercase ASCII,
    punctuation turned into spaces), so the check ignores accents, case and
    punctuation. An empty normalized prefix never matches.
    """
    needle = _norm_caps(prefix_caps)
    if needle == "":
        return False
    haystack = _norm_caps(line)
    # The prefix must not be glued to other letters/digits on either side.
    token_span = re.compile(rf"(?<![A-Z0-9]){re.escape(needle)}(?![A-Z0-9])")
    return token_span.search(haystack) is not None

def _next_nonempty_index(lines: List[str], j: int) -> Optional[int]:
    n = len(lines)
    while j < n and not lines[j].strip():
        j += 1
    return j if j < n else None

def find_decreto_ley_start_two_words(
    lines: List[str],
    first_two_caps: str,
    max_search_lines: int = 40,
) -> Optional[Tuple[int, int]]:
    """Locate where the law body starts, anchored on the law's title line.

    Finds the first pair ``(title_idx, ley_start_idx)`` such that:
      - ``lines[title_idx]`` contains the two-word title prefix
        (accent-insensitive, see ``_contains_allcaps_prefix``), and
      - ``lines[ley_start_idx]`` is the first HIER header among the next
        ``max_search_lines`` non-empty lines after the title.

    If a title candidate has no header within the window, the scan continues
    with the next title candidate.

    Args:
        lines: Text lines to scan.
        first_two_caps: Two-word prefix of the law title to search for.
        max_search_lines: How many non-empty lines after a title candidate are
            inspected for a HIER header. Blank lines do not count.

    Returns:
        ``(title_idx, ley_start_idx)``, or ``None`` when no pair exists.
    """
    total = len(lines)
    # The last line cannot be a title: nothing follows it.
    for i in range(total - 1):
        if not _contains_allcaps_prefix(lines[i], first_two_caps):
            continue

        lines_searched = 0
        j = i + 1
        while j < total and lines_searched < max_search_lines:
            if not lines[j].strip():
                # Blank lines are skipped without consuming the search budget.
                j += 1
                continue
            if HIER_RE.search(norm_lower(lines[j].lstrip())):
                return i, j
            lines_searched += 1
            j += 1
    return None

def first_transitorios_after(lines: List[str], start: int) -> Optional[int]:
    """Return the index of the first 'Transitorios' heading strictly after ``start``.

    Scanning begins at ``max(start + 1, 0)``. Each line is normalized with
    ``norm_lower`` and tested against ``TRANS_RE``, which accepts leading
    spaces and both plain and letter-spaced spellings. Returns ``None`` when
    no line matches.
    """
    first = max(start + 1, 0)
    candidates = (
        idx
        for idx in range(first, len(lines))
        if TRANS_RE.search(norm_lower(lines[idx].lstrip()))
    )
    return next(candidates, None)

def split_blocks_two_word_strict(cleaned_text: str, first_two_caps: str, base: str) -> Dict[str, object]:
    """Split a cleaned law text into decreto, ley and transitorios.

    - decreto: every line before the first HIER header that follows the title
      line (the title line itself is part of the decreto).
    - ley: from that HIER header up to the 'Transitorios' heading, or to the
      end of the text when there is no such heading.
    - transitorios: from the 'Transitorios' heading to the end (``""`` if absent).

    If the (title -> header) pair is not found, an error record is written,
    ``ley`` is empty and everything before 'Transitorios' goes to ``decreto``.

    Args:
        cleaned_text: Output of the cleaning step, one paragraph per line.
        first_two_caps: Two-word title prefix used to find the title line.
        base: Document identifier, used for error reporting.

    Returns:
        A dict with the texts ``decreto``, ``ley``, ``transitorios`` and the
        line indices ``decreto_start``, ``ley_start``, ``transitorios_start``
        (``None`` for a part that was not located). Both the normal and the
        fallback path return the same keys.
    """
    lines = cleaned_text.splitlines()
    pair = find_decreto_ley_start_two_words(lines, first_two_caps)

    # Fallback: couldn't find the (title -> header) pair.
    if pair is None:
        # start=-1 makes the scan begin at line 0; first_transitorios_after
        # looks strictly after ``start``, so passing 0 would skip the first line.
        t_idx = first_transitorios_after(lines, -1)
        if t_idx is not None:
            decreto = "\n".join(lines[:t_idx]).strip()
            tran = "\n".join(lines[t_idx:]).strip()
        else:
            decreto = cleaned_text
            tran = ""
        write_error(base, "two_word_pair_not_found",
                    "No (ALL-CAPS two-word title line → HIER) pair; ley not split.",
                    {"clean": f"clean_{base}.txt"})
        return {
            "decreto": decreto,
            "ley": "",
            "transitorios": tran,
            "decreto_start": 0,
            "ley_start": None,
            "transitorios_start": t_idx,
        }

    _title_idx, ley_start = pair
    decreto = "\n".join(lines[:ley_start]).strip()

    # The search starts after ley_start, so a hit always lies past the header.
    t_idx = first_transitorios_after(lines, ley_start)
    if t_idx is not None:
        ley = "\n".join(lines[ley_start:t_idx]).strip()
        tran = "\n".join(lines[t_idx:]).strip()
    else:
        ley = "\n".join(lines[ley_start:]).strip()
        tran = ""
    return {
        "decreto": decreto,
        "ley": ley,
        "transitorios": tran,
        "decreto_start": 0,
        "ley_start": ley_start,
        "transitorios_start": t_idx,
    }


In [50]:
def step3_split_cleaned(catalog_csv: Path = CATALOG_CSV) -> pd.DataFrame:
    """Step 3: split every cleaned text into decreto, ley and transitorios files.

    Reads every ``clean_*.txt`` in ``CLEAN_DIR``, splits it with
    ``split_blocks_two_word_strict`` and writes each non-empty part:
    ``DECR_DIR/decr_<base>.txt``, ``LEY_DIR/<base>.txt`` and
    ``TRANS_DIR/tran_<base>.txt``. Files whose base is not in the catalog are
    logged through ``write_error`` and skipped.

    Args:
        catalog_csv: Path of the catalog CSV passed to ``load_catalog``.

    Returns:
        One row per written part with ``stage``, ``part``, ``base``,
        ``start_line`` and the ``count_stats`` columns. When non-empty, the
        same table is saved as ``OUTPUT_DIR / manifest_parts.csv``.
    """
    catalog = load_catalog(catalog_csv)

    # (part key, output folder, file-name prefix, key holding the start line)
    part_specs = (
        ("decreto", DECR_DIR, "decr_", "decreto_start"),
        ("ley", LEY_DIR, "", "ley_start"),
        ("transitorios", TRANS_DIR, "tran_", "transitorios_start"),
    )
    created = {part: 0 for part, *_ in part_specs}
    missing_transitorios = []
    all_recs: List[Dict] = []

    for clean_file in sorted(CLEAN_DIR.glob("clean_*.txt")):
        base = clean_file.stem.replace("clean_", "")
        meta = catalog.get(base)
        if meta is None:
            write_error(base, "catalog_missing", "file_num not found in catalog (split stage)",
                        {"clean": clean_file.name})
            continue

        cleaned_text = clean_file.read_text(encoding="utf-8")
        parts = split_blocks_two_word_strict(cleaned_text, meta.first_two_caps, base)

        for part, out_dir, prefix, start_key in part_specs:
            text = parts.get(part, "")
            if not text.strip():
                # Only a missing transitorios section is reported at the end.
                if part == "transitorios":
                    missing_transitorios.append(base)
                continue
            write_text(out_dir / f"{prefix}{base}.txt", text)
            rec = {"stage": "split", "part": part, "base": base, "start_line": parts.get(start_key, None)}
            rec.update(count_stats(text))
            all_recs.append(rec)
            created[part] += 1

    df = pd.DataFrame(all_recs)
    if not df.empty:
        # Let pandas open the file itself. Rendering to a string and writing it
        # in text mode translates the line terminator twice on Windows
        # ("\r\r\n"), which shows up as blank rows in the manifest.
        df.to_csv(OUTPUT_DIR / "manifest_parts.csv", index=False, encoding="utf-8")
    print(f"[Step 3] Split outputs written: ley→{LEY_DIR.resolve()}, decreto→{DECR_DIR.resolve()}, transitorios→{TRANS_DIR.resolve()}")
    print(f"Decretos created: {created['decreto']}")
    print(f"Ley created: {created['ley']}")
    print(f"Transitorios created: {created['transitorios']}")

    # Report laws for which no transitorios file was written.
    if missing_transitorios:
        print("Laws with no transitorios section:", missing_transitorios)
    else:
        print("All laws have transitorios section.")
    return df
# Run Step 3: split every cleaned text into its parts and preview the manifest rows.
df_parts = step3_split_cleaned(CATALOG_CSV)
df_parts.head()

[Step 3] Split outputs written: ley→C:\Users\Edu\OneDrive\1.EmbeddingExploration\1.Pipelines\2.Processing\Leyes\19\Refined\leyes, decreto→C:\Users\Edu\OneDrive\1.EmbeddingExploration\1.Pipelines\2.Processing\Leyes\19\Refined\decretos, transitorios→C:\Users\Edu\OneDrive\1.EmbeddingExploration\1.Pipelines\2.Processing\Leyes\19\Refined\transitorios
Decretos created: 179
Ley created: 179
Transitorios created: 179
All laws have transitorios section.


Unnamed: 0,stage,part,base,start_line,lines,words,chars
0,split,decreto,1,0,3,38,214
1,split,ley,1,3,998,37866,245979
2,split,transitorios,1,1001,1038,25940,153421
3,split,decreto,2,0,7,79,479
4,split,ley,2,7,5645,148623,887805


## **Step 4: Hierarchical JSON Structure Generation**

### **Purpose**
Transform cleaned law text into structured JSON format that preserves the hierarchical organization of legal documents. This enables programmatic analysis, search, and processing of legal content.

### **Structural Parsing**
- **Hierarchy Detection** - Identifies libros, títulos, capítulos, secciones, artículos
- **Content Organization** - Builds nested tree structure reflecting legal document hierarchy  
- **Article Analysis** - Parses individual articles with content and annotations
- **Metadata Integration** - Includes source information and validation metadata

### **Advanced Features**
- **Automatic Repair** - Detects and fixes embedded article headers within content
- **Sequence Validation** - Identifies gaps or jumps in article numbering
- **Error Reporting** - Comprehensive logging of parsing issues and structural problems
- **Quality Metrics** - Statistical analysis of parsed content for validation

### **Output Format**
Structured JSON with nested hierarchy, article content, annotations, and comprehensive metadata for downstream processing and analysis.

### **Article Sequence Validation**

The parser includes intelligent detection of article numbering gaps (e.g., jumping from "Artículo 12" to "Artículo 15" without 13-14). This helps identify:

- **Missing Content** - Articles that may have been lost during PDF extraction
- **Structural Issues** - Formatting problems that affect article detection  
- **Document Quality** - Overall completeness of the legal document

The system automatically attempts repairs for embedded article headers and provides detailed reporting of any remaining gaps for manual review.

In [9]:
# -*- coding: utf-8 -*-
from __future__ import annotations

import re
import csv
import json
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union, Set

from unidecode import unidecode


### **Configuration: Allowed Suffixes**

The `ALLOWED_SUFFIX` dictionary contains valid suffixes for each hierarchical level, derived from exploratory analysis of the legal corpus. 

**Future Enhancement**: This configuration should be generated dynamically for each legislature to account for:
- Regional variations in legal terminology
- Historical changes in numbering conventions  
- Document-specific structural patterns

**Current Implementation**: Hard-coded based on analysis of existing documents. See exploration notebooks for suffix derivation methodology.

### **Enhancement Opportunity: Typo-Resistant Suffix Matching**

**Current Limitation**: Exact string matching for legal suffixes may miss valid entries due to:
- OCR scanning errors in PDF extraction
- Typographical variations in source documents
- Accent mark inconsistencies

**Proposed Improvements**:
1. **Fuzzy String Matching** - Use edit distance algorithms for approximate matching
2. **Phonetic Matching** - Handle accent mark variations and similar sounds
3. **Machine Learning** - Train classifiers on known good/bad suffix patterns
4. **Manual Review Interface** - Flag uncertain matches for human validation

This would significantly improve parsing accuracy for lower-quality source documents.

In [None]:
# --------------------------------------------------------------------------------------
# Allowed suffixes per header type
# --------------------------------------------------------------------------------------

# Maps a container type ("libro", "titulo", "capitulo", "seccion") to the suffix
# spellings accepted after the header word, e.g. "CAPITULO iii bis".
# Entries are lowercase and unaccented; allowed_suffix_for compares them against
# the norm()-normalized candidate, with and without internal spaces.
ALLOWED_SUFFIX = {
    "libro": ["cuarto","decimo","noveno","octavo","primero","quinto","segundo","septimo","sexto","tercero"],
    "titulo": ["catorce","cuarto","decimo","decimo bis","decimoctavo","decimocuarto","decimonoveno","decimoprimero","decimoquinto","decimosegundo","decimoseptimo","decimosexto","decimotercero","dieciseis","doce","duodecimo","i","ii","iii","iv","ix","noveno","octavo","octavo bis","once","preliminar","primero","quince","quinto","quinto bis","segundo","segundo bis","septimo","septimo bis","sexto","tercero","tercero bis","trece","trece bis","undecimo","unico","v","vi","vigesimo","vigesimocuarto","vigesimoprimero","vigesimosegundo","vigesimotercero","vii","viii","x","xi","xii","xiii","xiv","xv","xvi","especial", "decimo-primero","decimo-segundo","decimo-tercero","decimo-cuarto","decimo-quinto","decimo-sexto","decimo-septimo","decimo-octavo","decimo-noveno","decimo-quinto bis","decimo-segundo bis","decimo-tercero bis","decimo-cuarto bis","decimo-sexto bis","decimo-septimo bis","decimo-octavo bis","decimo-noveno bis"],
    "capitulo": ["1","cuarto","cuarto bis","decimo","duodecimo","especial","i","i bis","ii","ii bis","iii","iii bis","iii ter","iv","iv bis","iv ter","ix","ix bis","ix ter","noveno","octavo","primero","quinto","segundo","septimo","sexto","tercero","undecimo","unico","v","v bis","v ter","vi","vi bis","vigesimo","vii","vii bis","viii","viii bis","x","x bis","xi","xii","xii bis","xiii","xiii bis","xiv","xix","xv","xv bis","xv quater","xv ter","xvi","xvi bis","xvii","xviii","xx","xxi","xxii","xxiii","xxiv","xxix","xxv","xxvi","xxvii","xxviii", "decimocuarto","decimoquinto","decimosexto","decimoseptimo","decimooctavo","decimonoveno","decimoprimero","decimosegundo","decimotercero","vigesima"],
    "seccion": ["1a","2a","3a","4a","5a","6a","7a","8a","a","b","cuarta","decima","decima bis","decimo","i","ii","iii","iv","ix","novena","octava","primera","primera bis","quinta","segunda","segunda bis","septima","sexta","tercera","unica","v","vi","vii","vii bis","viii","x","xi","xii","xiii","xiv","xix","xv","xvi","xvii","xviii","xx","xxi","xxii","uno","dos","tres","cuatro","cinco","seis","siete","ocho","nueve","diez","once","decimoprimera","decimosegunda","decimotercera","decimocuarta","decimoquinta","decimosexta","decimoseptima","decimoctava","decimonovena","vigesima","onceava"],
}

# Whole-string, case-insensitive match for the Roman numerals i through xxiv.
ROMAN_RE = re.compile(r'^(?i:xxiv|xxiii|xxii|xxi|xx|xix|xviii|xvii|xvi|xv|xiv|xiii|xii|xi|x|ix|viii|vii|vi|v|iv|iii|ii|i)$')

# Container header words at the start of a line (leading whitespace allowed,
# accent optional, case-insensitive). Group 1 is the header word itself, so
# its span marks where the suffix/name part of the line begins.
HDR_WORDS = {
    "libro":    re.compile(r'^\s*(LIBRO)\b', re.IGNORECASE),
    "titulo":   re.compile(r'^\s*(T[ÍI]TULO)\b', re.IGNORECASE),
    "capitulo": re.compile(r'^\s*(CAP[ÍI]TULO)\b', re.IGNORECASE),
    "seccion":  re.compile(r'^\s*(SECCI[ÓO]N)\b', re.IGNORECASE),
}

# Numeric depth of each unit type; smaller numbers are outer levels.
# "libro" and "preliminar" share the outermost level.
LEVEL = {"libro": 1, "preliminar": 1, "titulo": 2, "capitulo": 3, "seccion": 4, "articulo": 5}

# Inline "Artículo" header finder used for repair (accepts "Artículo" or "Art.").
# Matches the header word, a numeric suffix with an optional variant word, and
# the terminating "." or ".-" plus any whitespace that follows it.
ARTICULO_INLINE_RE = re.compile(
    r'(?i)(?<!\w)(?:art[íi]culo|art\.)\s*'
    r'(?P<sufijo>'               # full suffix capture as text "7", "7 bis", "7-A"
    r'\d+(?:\s*(?:bis|ter|quater|quinquies|sexies|septies|octies|nonies|decies|undecies|duodecies|terdecies|[A-Za-z\-]+)?)?'
    r')\s*\.(?:-)?\s*'
)

# --------------------------------------------------------------------------------------
# Helpers
# --------------------------------------------------------------------------------------

def collapse_ws(s: str) -> str:
    """Replace every whitespace run in ``s`` with one space and trim both ends."""
    squeezed = re.sub(r'\s+', ' ', s)
    return squeezed.strip()

def norm(s: str) -> str:
    """Return ``s`` transliterated to ASCII, lowercased, with whitespace collapsed and trimmed."""
    lowered = unidecode(s).lower()
    return collapse_ws(lowered)

def edit_distance(a: str, b: str) -> int:
    """Return the Levenshtein distance between ``a`` and ``b``.

    Insertions, deletions and substitutions each cost 1. Only two rows of the
    dynamic-programming table are kept at a time.
    """
    width = len(b)
    previous_row = list(range(width + 1))
    for row_no, ch_a in enumerate(a, 1):
        current_row = [row_no]
        for col_no, ch_b in enumerate(b, 1):
            deletion = previous_row[col_no] + 1
            insertion = current_row[col_no - 1] + 1
            substitution = previous_row[col_no - 1] + (0 if ch_a == ch_b else 1)
            current_row.append(min(deletion, insertion, substitution))
        previous_row = current_row
    return previous_row[width]

def is_articulo_token(token: str) -> bool:
    """Tell whether ``token`` reads as the word "artículo" or its abbreviation.

    After ``norm`` normalization, the token is accepted when it is "art" or
    "art.", or when it is within edit distance 2 of "articulo" (to tolerate
    typos and extraction noise).
    """
    normalized = norm(token)
    if normalized in {"art", "art."}:
        return True
    return edit_distance(normalized, "articulo") <= 2

def tokenize(s: str) -> List[str]:
    """Split ``s`` on whitespace runs and return the non-empty pieces."""
    pieces = re.split(r'\s+', s.strip())
    return list(filter(None, pieces))

def allowed_suffix_for(tipo: str, candidate_tokens: List[str]) -> Tuple[Optional[str], int]:
    """Find the longest valid header suffix at the start of ``candidate_tokens``.

    Up to the first three tokens are considered, with trailing ``. - : —``
    removed from each. Spans are tried from longest to shortest; a span is
    accepted when, after ``norm`` normalization, it is:
      - for "seccion": digits optionally followed by "a"/"ª";
      - for libro/titulo/capitulo/seccion: a Roman numeral matched by
        ``ROMAN_RE`` or plain digits;
      - for any type: listed in ``ALLOWED_SUFFIX[tipo]`` (compared with and
        without internal spaces).

    Returns:
        ``(suffix_text, tokens_consumed)``, or ``(None, 0)`` if nothing matches.
    """
    if not candidate_tokens:
        return None, 0

    head = [re.sub(r'[.\-:—]+$', '', tok).strip() for tok in candidate_tokens[:3]]

    accepted = set()
    for entry in set(ALLOWED_SUFFIX.get(tipo, [])):
        accepted.add(norm(entry))
        accepted.add(norm(entry.replace(' ', '')))

    takes_numbering = tipo in ("libro", "titulo", "capitulo", "seccion")
    for k in range(min(3, len(head)), 0, -1):
        span = head[:k]
        shown = collapse_ws(' '.join(span))
        key = norm(shown)
        glued = norm(''.join(span))

        section_ordinal = tipo == "seccion" and bool(
            re.fullmatch(r'\d+[aª]$', key) or re.fullmatch(r'\d+$', key)
        )
        numbering = takes_numbering and bool(
            ROMAN_RE.match(key) or re.fullmatch(r'\d+$', key)
        )
        listed = key in accepted or glued in accepted
        if section_ordinal or numbering or listed:
            return shown, k
    return None, 0

def split_header_rest(line: str, header_word_span: Tuple[int,int]) -> str:
    """Return the text of ``line`` after the header word, stripped.

    ``header_word_span`` is the ``(start, end)`` span of the header word; only
    its end offset is used.
    """
    word_end = header_word_span[1]
    tail = line[word_end:]
    return tail.strip()

def safe_filename(name: str) -> str:
    """Return ``name`` made safe to use as a file name.

    Runs of ``< > : " / \\ | ? *`` and of CR/LF/tab become a single space,
    whitespace is collapsed and trimmed, and ``"ley"`` is returned when the
    result is empty, ``"."`` or ``".."``.
    """
    without_reserved = re.sub(r'[<>:"/\\|?*\r\n\t]+', ' ', name)
    candidate = re.sub(r'\s+', ' ', without_reserved).strip()
    if candidate in {"", ".", ".."}:
        return "ley"
    return candidate

# -------------------------------- Inline notes logic ----------------------------------

NOTE_PARENS_RE = re.compile(r'\(([^()]*)\)')  # a parenthesised group with no nested parentheses

def strip_inline_notes(line: str) -> Tuple[str, List[str]]:
    """Pull parenthesised notes out of ``line``.

    A ``(...)`` group counts as a note only when its content contains ``no.``
    (case-insensitive); such groups are removed from the line and collected.
    Any other parenthesised text is left in place.

    Returns:
        ``(cleaned_line, notes)`` where ``cleaned_line`` is stripped and each
        note is the group's content without the parentheses, with whitespace
        collapsed. An empty ``line`` is returned unchanged with no notes.
    """
    if not line:
        return line, []

    notes: List[str] = []

    def _drop_if_note(found) -> str:
        inner = found.group(1) or ""
        if re.search(r'(?i)\bno\.', inner, flags=re.IGNORECASE):
            notes.append(collapse_ws(inner.strip()))
            return ''
        return found.group(0)

    cleaned = NOTE_PARENS_RE.sub(_drop_if_note, line).strip()
    return cleaned, notes

# ----------------------------------- Data model ---------------------------------------

@dataclass
class Node:
    """One unit of the parsed law tree: a container (libro, titulo, ...) or an article."""
    tipo: str
    sufijo: str
    nombre: Optional[str]
    nota: List[str] = field(default_factory=list)
    # Article nodes hold their text here; container nodes hold child nodes.
    contenido: Union[str, List['Node']] = field(default_factory=list)
    start: int = 0
    end: int = 0
    line: int = 0
    level: int = 0
    header_line_text: Optional[str] = None

    def to_json_obj(self) -> Dict[str, Any]:
        """Return the JSON-serialisable form of this node.

        Articles yield ``tipo``, ``sufijo``, ``nota`` and ``contenido`` (their
        text, or ``""`` if the content is not a string). Every other node
        yields ``tipo``, ``sufijo``, ``nombre``, ``nota`` and ``contenido``
        (its children, converted recursively).
        """
        notes = self.nota[:] if self.nota else []
        if self.tipo == "articulo":
            body = self.contenido if isinstance(self.contenido, str) else ""
            return {
                "tipo": self.tipo,
                "sufijo": self.sufijo,
                "nota": notes,
                "contenido": body,
            }
        children = [child.to_json_obj() for child in (self.contenido or [])]
        return {
            "tipo": self.tipo,
            "sufijo": self.sufijo,
            "nombre": self.nombre,
            "nota": notes,
            "contenido": children,
        }

# -------------------------------- Counting utility ------------------------------------

def count_unidades(nodes: List[Node]) -> Dict[str, int]:
    """Count the libros, titulos, capitulos, secciones and articulos in a node forest.

    Every node reachable through list-valued ``contenido`` is visited; nodes
    of any other ``tipo`` are traversed but not counted.
    """
    plural_of = {
        "libro": "libros",
        "titulo": "titulos",
        "capitulo": "capitulos",
        "seccion": "secciones",
        "articulo": "articulos",
    }
    counts = {"libros":0,"titulos":0,"capitulos":0,"secciones":0,"articulos":0}

    pending = list(nodes)
    while pending:
        current = pending.pop()
        bucket = plural_of.get(current.tipo)
        if bucket is not None:
            counts[bucket] += 1
        if isinstance(current.contenido, list):
            pending.extend(current.contenido)
    return counts

# -------------------- Header detection & Artículo parsing -----------------------------

def detect_container_header(line: str) -> Optional[Tuple[str, Tuple[int,int]]]:
    """Identify which container header, if any, ``line`` starts with.

    The patterns in ``HDR_WORDS`` are tried in their dict order.

    Returns:
        ``(tipo, span_of_header_word)`` for the first pattern that matches,
        or ``None`` when the line does not start with a header word.
    """
    attempts = ((tipo, pat.match(line)) for tipo, pat in HDR_WORDS.items())
    return next(((tipo, hit.span(1)) for tipo, hit in attempts if hit), None)

def parse_container_header(clean_line: str, tipo: str, header_span: Tuple[int,int]) -> Tuple[Optional[str], Optional[str]]:
    """Extract the suffix and optional inline name from a container header line.

    Args:
        clean_line: The header line.
        tipo: Container type detected for the line.
        header_span: Span of the header word inside ``clean_line``.

    Returns:
        ``(suffix, inline_name)``. ``inline_name`` is ``None`` when nothing
        follows the suffix. ``(None, None)`` is returned when no text follows
        the header word or no valid suffix is recognised.
    """
    tokens = tokenize(split_header_rest(clean_line, header_span))
    if not tokens:
        return None, None

    suffix, consumed = allowed_suffix_for(tipo, tokens)
    if not suffix:
        return None, None

    trailing = ' '.join(tokens[consumed:]).strip()
    return suffix, (trailing or None)

def parse_articulo_header_and_body(lines: List[str], i: int, line_starts: List[int], full_text: str) -> Optional[Tuple[Node, int]]:
    """Parse the article whose header is on ``lines[i]``, together with its body.

    The line is an article header when, after inline notes are removed, it
    starts with an "artículo"-like token, followed by text beginning with
    digits, followed by a terminator: "." or ".-" that is itself followed by
    whitespace or the end of the line.

    The body is whatever follows the terminator on the header line plus every
    later line, up to (not including) the next container header or the next
    line that starts with an "artículo"-like token followed by digits.

    Args:
        lines: All lines of the text being parsed.
        i: Index of the candidate header line.
        line_starts: Character offset at which each line starts; indexed like ``lines``.
        full_text: The complete text; only its length is used, as the end
            offset of an article that runs to the last line.

    Returns:
        ``(node, j)`` where ``node`` is the article and ``j`` is the index of
        the first line not consumed by it, or ``None`` if ``lines[i]`` is not
        an article header.
    """
    raw_line = lines[i]
    line, header_notes = strip_inline_notes(raw_line)

    # The first whitespace-delimited token must look like "Artículo"/"Art.".
    m = re.match(r'^\s*(\S+)', line)
    if not m:
        return None
    first = m.group(1)
    if not is_articulo_token(first):
        return None

    # The header word must be followed by a number.
    rest = line[m.end():]
    if not re.match(r'^\s*\d+', rest):
        return None

    # Find the terminator: the first "." (optionally followed by "-") that is
    # followed by whitespace or ends the line. Dots inside the suffix that are
    # followed by other characters are skipped.
    term_idx = None
    k = 0
    while k < len(rest):
        if rest[k] == '.':
            k2 = k + 1
            if k2 < len(rest) and rest[k2] == '-':
                k2 += 1
            if k2 >= len(rest) or rest[k2].isspace():
                term_idx = k
                break
        k += 1
    if term_idx is None:
        return None

    # Everything before the terminator is the suffix ("7", "7 bis", "7-A", ...).
    candidate_suffix = rest[:term_idx].strip()
    if not re.search(r'\d', candidate_suffix):
        return None

    # Skip the terminator ("." or ".-") and the whitespace after it.
    jstart = term_idx + 1
    if jstart < len(rest) and rest[jstart] == '-':
        jstart += 1
    while jstart < len(rest) and rest[jstart].isspace():
        jstart += 1
    after = rest[jstart:]

    # Text on the header line after the terminator is the first body line.
    body_lines: List[str] = []
    if after.strip():
        body_clean, body_notes = strip_inline_notes(after.rstrip())
        body_lines.append(body_clean)
        header_notes.extend(body_notes)

    # Consume following lines until the next structural boundary.
    j = i + 1
    while j < len(lines):
        candidate_raw = lines[j]
        cand_clean, cand_notes = strip_inline_notes(candidate_raw.rstrip())

        if detect_container_header(cand_clean) or (
            cand_clean.strip() and is_articulo_token(cand_clean.strip().split(' ', 1)[0])
        ):
            if cand_clean.strip() and is_articulo_token(cand_clean.strip().split(' ', 1)[0]):
                # A line starting with an "artículo"-like word that is NOT
                # followed by a number is ordinary body text, not a new header.
                lrest = cand_clean.strip()[len(cand_clean.strip().split(' ', 1)[0]):]
                if not re.match(r'^\s*\d+', lrest):
                    header_notes.extend(cand_notes)
                    body_lines.append(cand_clean)
                    j += 1
                    continue
            break
        header_notes.extend(cand_notes)
        body_lines.append(cand_clean)
        j += 1

    # Character span of the article: from its header line to the start of the
    # first unconsumed line (or the end of the text).
    start_char = line_starts[i]
    end_char = line_starts[j] if j < len(lines) else len(full_text)
    # Blank body lines are dropped; the remaining ones are joined with newlines.
    content_text = "\n".join([ln.rstrip() for ln in body_lines if ln.strip()]).strip()

    node = Node(
        tipo="articulo",
        sufijo=candidate_suffix,
        nombre=None,
        nota=[n for n in header_notes if n],
        contenido=content_text,
        start=start_char,
        end=end_char,
        line=i+1,
        level=LEVEL["articulo"],
        header_line_text=raw_line.strip()
    )
    return node, j

# ----------------------------- Article repair utilities --------------------------------

def parse_article_base_int(sufijo: str) -> Optional[int]:
    """
    Return the first run of digits found in ``sufijo`` as an int.

    Args:
        sufijo: Article suffix text, e.g. "12", "12 Bis" or "5-A".
            A falsy value (None or "") yields None instead of raising.

    Returns:
        The integer value of the first digit run, or None when the suffix
        contains no digits.
    """
    if not sufijo:
        return None
    m = re.search(r'(\d+)', sufijo)
    if m is None:
        return None
    try:
        return int(m.group(1))
    except ValueError:
        # Only conversion failures are treated as "no base number";
        # a bare except would also hide KeyboardInterrupt/SystemExit.
        return None

def parse_article_base_and_variant(sufijo: str) -> Tuple[Optional[int], str]:
    """
    Split an article suffix into (base number, variant text).

    The variant is whatever follows the digits on the stripped suffix; it is
    stripped, passed through ``norm`` and has its spaces removed, so a suffix
    with nothing after the number gives ''.  When the suffix does not match
    the digits-then-rest pattern the result is (None, "").
    """
    found = re.search(r'(\d+)\s*(.*)$', sufijo.strip())
    if found is None:
        return None, ""
    # The base is taken from the first digit run of the untouched suffix.
    base_number = parse_article_base_int(sufijo)
    trailing = (found.group(2) or "").strip()
    variant = norm(trailing).replace(" ", "")
    return base_number, variant

def find_embedded_article_headers(text: str) -> List[Tuple[int, int, str]]:
    """
    Locate inline article headers inside ``text``.

    Each hit of ARTICULO_INLINE_RE becomes a tuple
    (match start, match end, whitespace-collapsed "sufijo" group),
    listed in the order the matches occur.
    """
    return [
        (hit.start(), hit.end(), collapse_ws(hit.group("sufijo")))
        for hit in ARTICULO_INLINE_RE.finditer(text)
    ]

def _split_embedded_from_article(article: Node) -> List[Node]:
    """
    Detach inline 'Artículo ...' sections from a single article node.

    When the split is accepted, ``article.contenido`` is truncated to the text
    before the first embedded header and the detached sections are returned as
    new article nodes (in order).  An empty list means nothing was changed.
    """
    full_body = article.contenido
    if not (isinstance(full_body, str) and full_body):
        return []

    headers = find_embedded_article_headers(full_body)
    if not headers:
        return []

    own_base, _own_variant = parse_article_base_and_variant(article.sufijo)
    if own_base is None:
        return []

    first_start, first_end, first_suf = headers[0]
    first_base, first_variant = parse_article_base_and_variant(first_suf)
    if first_base is None:
        return []

    # The first embedded header must be the next number, or the same number
    # carrying a variant that differs from this article's own suffix.
    is_next_number = first_base == own_base + 1
    is_new_variant = (
        not is_next_number
        and first_base == own_base
        and bool(first_variant)
        and norm(first_suf) != norm(article.sufijo)
    )
    if not (is_next_number or is_new_variant):
        return []
    # A header sitting at offset 0 is not split off.
    if first_start < 1:
        return []

    # Extend the chain while later headers keep counting up by exactly one.
    chain = [(first_start, first_end, first_suf)]
    wanted_base = first_base + 1
    for start_k, end_k, suf_k in headers[1:]:
        base_k, _variant_k = parse_article_base_and_variant(suf_k)
        if base_k is None or base_k != wanted_base:
            break
        chain.append((start_k, end_k, suf_k))
        wanted_base += 1

    # Text before the first accepted header stays with the original article.
    article.contenido = full_body[:chain[0][0]].rstrip()

    # Each body runs up to the next accepted header (or the end of the text).
    body_stops = [entry[0] for entry in chain[1:]] + [len(full_body)]
    return [
        Node(
            tipo="articulo",
            sufijo=suf_k,
            nombre=None,
            nota=[],
            contenido=full_body[end_k:stop].strip(),
            start=0, end=0,
            line=article.line,  # best-effort: reuse the host article's line
            level=LEVEL["articulo"],
            header_line_text=f"Artículo {suf_k}."
        )
        for (_start_k, end_k, suf_k), stop in zip(chain, body_stops)
    ]


def split_embedded_articles_in_list(nodes: List[Node]) -> None:
    """
    Walk ``nodes`` in place and split articles that swallowed later headers.

    Non-article nodes with list content are processed recursively.  For an
    article whose text contains inline 'Artículo ...' headers, the embedded
    sections become sibling article nodes inserted right after it, but only
    when the first embedded header is base+1 or the same base with a different,
    non-empty variant; following headers are accepted while they continue in
    +1 steps.  Freshly inserted nodes are skipped, not re-scanned.
    """
    cursor = 0
    while cursor < len(nodes):
        current = nodes[cursor]
        advance = 1
        if current.tipo == "articulo":
            extracted = _split_embedded_from_article(current)
            if extracted:
                nodes[cursor + 1:cursor + 1] = extracted
                advance += len(extracted)
        elif isinstance(current.contenido, list):
            split_embedded_articles_in_list(current.contenido)
        cursor += advance

# ------------------------------------ Main parser -------------------------------------

def parse_law_text(text: str, issues: List[Dict[str,Any]]) -> Tuple[str, List[Node], Dict[str, List[str]]]:
    """
    Parse a full law TXT into a title and list of top-level nodes.

    Args:
        text: Full text of the law. CRLF and lone CR line endings are
            normalized to LF before parsing.
        issues: List that is appended to in place; it receives one warning
            dict per container header whose suffix could not be parsed
            (that header line is then skipped).

    Returns:
        A tuple (title, root_nodes, invalid_suffixes):
          * title: the first non-blank line, stripped, or "Sin título".
          * root_nodes: top-level nodes, after embedded article headers have
            been split and notes de-duplicated.
          * invalid_suffixes: per-tipo sorted samples of unparseable header
            suffixes, e.g. {"libro":[...],"titulo":[...],...}; tipos without
            samples are omitted.
    """
    text = text.replace('\r\n', '\n').replace('\r', '\n')
    lines = text.split('\n')

    title = next((ln.strip() for ln in lines if ln.strip()), "Sin título")

    # Character offset of the start of every line (each line is followed by one '\n').
    line_starts = []
    pos = 0
    for ln in lines:
        line_starts.append(pos)
        pos += len(ln) + 1

    root_nodes: List[Node] = []
    stack: List[Node] = []  # currently open containers, outermost first

    # Track invalid suffix samples per tipo
    invalid_suffixes: Dict[str, Set[str]] = {k: set() for k in ["libro","titulo","capitulo","seccion"]}

    i = 0
    while i < len(lines):
        raw_line = lines[i]

        # Optional PRELIMINAR block at top (only considered before any node exists)
        if not root_nodes and not stack:
            prelim_clean, prelim_notes = strip_inline_notes(raw_line)
            if re.match(r'^\s*disposiciones\s+preliminares\b.*$', unidecode(prelim_clean), re.IGNORECASE):
                node = Node(
                    tipo="preliminar",
                    sufijo="",
                    nombre=prelim_clean.strip(),
                    nota=prelim_notes,
                    contenido=[],
                    start=line_starts[i],
                    end=0,
                    line=i+1,
                    level=LEVEL["preliminar"],
                    header_line_text=raw_line.strip()
                )
                root_nodes.append(node); stack.append(node)
                i += 1
                continue

        # 1) Try artículo first
        parsed_art = parse_articulo_header_and_body(lines, i, line_starts, text)
        if parsed_art:
            art_node, j = parsed_art
            parent = stack[-1] if stack else None
            if parent is None:
                root_nodes.append(art_node)
            else:
                if isinstance(parent.contenido, list):
                    parent.contenido.append(art_node)
                else:
                    parent.contenido = [art_node]
            # j is the index of the first line not consumed by the article
            i = j
            continue

        # 2) Try container header (strip notes first)
        clean_line, inline_notes = strip_inline_notes(raw_line)
        det = detect_container_header(clean_line)
        if det:
            tipo, hdr_span = det
            suffix, nombre_inline = parse_container_header(clean_line, tipo, hdr_span)

            if not suffix:
                # Capture candidate "invalid" suffix sample for this tipo:
                # up to three leading tokens, trailing punctuation removed.
                rest = split_header_rest(clean_line, hdr_span)
                tokens = tokenize(rest)
                sample_tokens = [re.sub(r'[.\-:—,;]+$', '', t).strip() for t in tokens[:3]]
                sample = collapse_ws(' '.join([t for t in sample_tokens if t]))
                if sample:
                    # setdefault (not get) so a tipo that was not pre-registered
                    # still has its sample recorded instead of being dropped.
                    invalid_suffixes.setdefault(tipo, set()).add(sample)

                issues.append({
                    "location": f"line {i+1}",
                    "message": f"{tipo.title()} sin sufijo válido (línea ignorada)",
                    "issue_type": "warning",
                    "line_text": raw_line.strip()
                })
                i += 1
                continue

            # Name-on-next-line rule (strip notes there too): when the header
            # carries no inline name, the next non-blank line is used as the
            # name unless it is itself a container or numbered article header.
            nombre = nombre_inline
            notes_for_node = list(inline_notes)
            if nombre is None:
                peek = i + 1
                while peek < len(lines) and not lines[peek].strip():
                    peek += 1
                if peek < len(lines):
                    next_line_clean, next_line_notes = strip_inline_notes(lines[peek].strip())
                    is_header = bool(detect_container_header(next_line_clean))
                    is_article_hdr = False
                    if next_line_clean.strip():
                        first_tok = next_line_clean.strip().split(' ', 1)[0]
                        is_article_hdr = bool(is_articulo_token(first_tok) and re.match(
                            r'^\s*\d+', next_line_clean[len(first_tok):] or ""))
                    if not (is_header or is_article_hdr):
                        nombre = next_line_clean if next_line_clean else None
                        if next_line_notes:
                            notes_for_node.extend(next_line_notes)

            node = Node(
                tipo=tipo,
                sufijo=suffix,
                nombre=nombre,
                nota=notes_for_node,
                contenido=[],
                start=line_starts[i],
                end=0,
                line=i+1,
                level=LEVEL[tipo],
                header_line_text=raw_line.strip()
            )

            # Close every open container at the same or a deeper level.
            while stack and stack[-1].level >= node.level:
                top = stack.pop()
                top.end = line_starts[i]

            if not stack:
                root_nodes.append(node)
            else:
                stack[-1].contenido.append(node)
            stack.append(node)
            i += 1
            continue

        # 3) Plain line: attach any inline notes to nearest open container
        if inline_notes and stack:
            for n in inline_notes:
                if n and n not in stack[-1].nota:
                    stack[-1].nota.append(n)
        i += 1

    # Close remaining containers at EOF
    for n in stack[::-1]:
        n.end = len(text)

    # ---- Post-parse normalization: split inline embedded artículo headers (auto-repair) ----
    split_embedded_articles_in_list(root_nodes)

    # Finalize nodes (end positions, note de-dup)
    def _finalize(n: Node):
        # end == 0 means "never closed"; such nodes are extended to the end of the text.
        if n.end == 0:
            n.end = len(text)
        if isinstance(n.contenido, list):
            for c in n.contenido: _finalize(c)
        seen = set(); dedup = []
        for x in n.nota:
            if x not in seen:
                seen.add(x); dedup.append(x)
        n.nota = dedup

    for n in root_nodes:
        _finalize(n)

    # Convert invalid suffix sets to sorted lists, dropping empty tipos
    invalid_out = {k: sorted(v) for k, v in invalid_suffixes.items() if v}

    return title, root_nodes, invalid_out

# ---------------------------- Article sequence validation -----------------------------

def validate_article_sequence(nodes: List[Node], file_issues: List[Dict[str,Any]]) -> List[Dict[str, Any]]:
    """
    Report forward gaps (more than +1) between consecutive article base numbers.

    Articles are collected in document order from ``nodes`` (article nodes are
    not descended into).  Every gap produces one dict in the returned list,
    intended for CSV export, and one warning appended to ``file_issues``.
    Articles whose suffix has no number are ignored, and a number lower than
    the previous one does not move the reference point.
    """
    # Pre-order traversal with an explicit stack (children pushed reversed so
    # they pop in their original order).
    articles: List[Node] = []
    pending: List[Node] = list(nodes)[::-1]
    while pending:
        current = pending.pop()
        if current.tipo == "articulo":
            articles.append(current)
        elif isinstance(current.contenido, list):
            pending.extend(current.contenido[::-1])

    jumps: List[Dict[str,Any]] = []
    last_base = None
    last_node: Optional[Node] = None

    for art in articles:
        cur_base = parse_article_base_int(art.sufijo)

        if cur_base is None:
            if last_node is None:
                last_node = art
            continue

        if last_base is None:
            # First numbered article: nothing to compare against yet.
            last_base, last_node = cur_base, art
            continue

        if cur_base > last_base + 1:
            record = {
                "prev_line": last_node.line if last_node else "",
                "prev_sufijo": last_node.sufijo if last_node else "",
                "prev_line_text": (last_node.header_line_text or f"Artículo {last_node.sufijo}.") if last_node else "",
                "current_line": art.line,
                "current_sufijo": art.sufijo,
                "current_line_text": art.header_line_text or f"Artículo {art.sufijo}.",
                "prev_base": last_base,
                "current_base": cur_base,
                "delta": cur_base - last_base
            }
            jumps.append(record)
            file_issues.append({
                "location": f"line {art.line}",
                "message": f"Secuencia de artículos salta de {last_base} a {cur_base}",
                "issue_type": "warning",
                "line_text": record["current_line_text"]
            })

        # Only equal or larger numbers become the new reference.
        if cur_base >= last_base:
            last_base, last_node = cur_base, art

    return jumps

# --------------------------------- I/O utilities -------------------------------------

def law_basename(path: Path) -> str:
    """Return the file name of ``path`` without its final extension."""
    stem = path.stem
    return stem

def inferred_title_from_file(path: Path, parsed_title: str) -> str:
    """
    Return the fallback law title for a file: the file name without extension.

    ``parsed_title`` is accepted but not used; the result depends only on ``path``.
    """
    fallback_title = path.stem
    return fallback_title

def write_json(data: Dict[str,Any], path: Path):
    """
    Serialize ``data`` as indented UTF-8 JSON at ``path``.

    Missing parent directories are created first.  Non-ASCII characters are
    written as-is (not escaped) and the indent is two spaces.
    """
    target_dir = path.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(data, ensure_ascii=False, indent=2)
    path.write_text(payload, encoding="utf-8")

def save_errors_to_file(file_issues: List[Dict[str,Any]], file_name: str, errores_dir: Path):
    """
    Write the issues of one file to ``<errores_dir>/<stem>_errors.txt``.

    Nothing is created when ``file_issues`` is empty.  Otherwise the directory
    is created if needed and the report lists, per issue, its location, type,
    message and (when present) the offending line text.
    """
    if not file_issues:
        return

    errores_dir.mkdir(parents=True, exist_ok=True)
    report_path = errores_dir / f"{Path(file_name).stem}_errors.txt"

    def _report_chunks():
        # Lazily produces the report pieces in the order they are written.
        yield f"Errores y advertencias para: {file_name}\n"
        yield "="*50 + "\n\n"
        for entry in file_issues:
            yield f"Ubicación: {entry.get('location', 'N/A')}\n"
            yield f"Tipo: {entry.get('issue_type', 'warning')}\n"
            yield f"Mensaje: {entry.get('message', '')}\n"
            if entry.get("line_text"):
                yield f"Línea: {entry['line_text']}\n"
            yield "-"*30 + "\n"

    with report_path.open("w", encoding="utf-8") as handle:
        handle.writelines(_report_chunks())

# ------------------------------- Catalogue utilities ----------------------------------

def load_catalog(catalog_csv: Path) -> Dict[str, Dict[str,str]]:
    """
    Load the law catalogue CSV into a dict keyed by ``file_num``.

    Args:
        catalog_csv: Path to a CSV with the columns ``file_num``, ``law_name``,
            ``link`` and ``num_est``.  A falsy or non-existent path yields {}.

    Returns:
        {file_num: {"law_name": ..., "link": ..., "num_est": ...}} with every
        value stripped.  Rows with an empty ``file_num`` are skipped; when a
        ``file_num`` repeats, the last row wins.
    """
    mapping: Dict[str, Dict[str,str]] = {}
    if not catalog_csv or not catalog_csv.exists():
        return mapping

    # utf-8-sig drops a leading BOM (common in spreadsheet exports); with
    # plain utf-8 the first header would read "\ufefffile_num" and every row
    # would be skipped.  newline="" is what the csv module expects so quoted
    # fields containing line breaks are parsed correctly.
    with catalog_csv.open("r", encoding="utf-8-sig", newline="") as f:
        for row in csv.DictReader(f):
            file_num = (row.get("file_num") or "").strip()
            if not file_num:
                continue
            mapping[file_num] = {
                "law_name": (row.get("law_name") or "").strip(),
                "link": (row.get("link") or "").strip(),
                "num_est": (row.get("num_est") or "").strip(),
            }
    return mapping

# ---------------------------------- Main pipeline -------------------------------------

def walk_and_process(
    ley_dir: Path,
    out_dir: Path,
    errores_dir: Optional[Path] = None,
    catalog_csv: Optional[Path] = None
) -> Dict[str, Any]:
    """
    Parse every ``*.txt`` in ``ley_dir`` and write one JSON per file to ``out_dir``.

    For each file (in sorted order) the text is parsed, the article sequence
    is validated, and ``<stem>.json`` is written with the law name (taken from
    the catalogue when available, otherwise derived from the file) and the
    node tree.  Invalid-suffix samples, per-file error reports and the two
    CSV logs go to ``errores_dir`` when it is given (invalid suffixes fall
    back to ``out_dir`` otherwise).  ``manifest.json`` is written to
    ``out_dir`` and the same manifest dict is returned.
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    if errores_dir:
        errores_dir.mkdir(parents=True, exist_ok=True)

    catalog = load_catalog(catalog_csv) if catalog_csv else {}

    all_issue_rows = []
    all_jump_rows = []

    manifest = {
        "processed_files": [],
        "files": {},
        "totals": {"libros":0,"titulos":0,"capitulos":0,"secciones":0,"articulos":0},
        "warnings": 0,
        "errors": 0
    }

    for txt_path in sorted(p for p in ley_dir.glob("*.txt") if p.is_file()):
        source_text = txt_path.read_text(encoding="utf-8", errors="replace")
        file_issues: List[Dict[str,Any]] = []

        title, nodes, invalid_suffixes = parse_law_text(source_text, file_issues)

        # Sequence validation runs on the already auto-repaired tree.
        for jump in validate_article_sequence(nodes, file_issues):
            all_jump_rows.append({"file": txt_path.name, **jump})

        # Verbose per-file report
        if errores_dir and file_issues:
            save_errors_to_file(file_issues, txt_path.name, errores_dir)

        counts = count_unidades(nodes)

        # The stem (expected like '0001') is the catalogue key and the output name.
        stem = txt_path.stem
        law_name = catalog.get(stem, {}).get("law_name") or inferred_title_from_file(txt_path, title)

        out_json_path = out_dir / f"{stem}.json"
        write_json(
            {
                "ley": law_name,
                "contenido": [n.to_json_obj() for n in nodes]
            },
            out_json_path,
        )

        # Invalid suffix samples are only written when there are any.
        if invalid_suffixes:
            samples_dir = errores_dir if errores_dir else out_dir
            write_json(invalid_suffixes, samples_dir / f"{stem}_invalid_suffixes.json")

        # Flatten this file's issues into the global CSV rows.
        all_issue_rows.extend(
            {
                "file": txt_path.name,
                "location": it.get("location",""),
                "issue_type": it.get("issue_type","warning"),
                "message": it.get("message",""),
                "line_text": it.get("line_text",""),
            }
            for it in file_issues
        )

        manifest["processed_files"].append(txt_path.name)
        manifest["files"][txt_path.name] = {
            "output": out_json_path.name,
            "law_name": law_name,
            "counts": counts,
            "issues": file_issues,
        }
        for unit in counts:
            manifest["totals"][unit] += counts[unit]
        issue_kinds = [it.get("issue_type") for it in file_issues]
        manifest["warnings"] += issue_kinds.count("warning")
        manifest["errors"]   += issue_kinds.count("error")

    # Manifest first, then the CSV logs
    write_json(manifest, out_dir / "manifest.json")

    if errores_dir:
        issue_columns = ["file","location","issue_type","message","line_text"]
        jump_columns = [
            "file",
            "prev_line","prev_sufijo","prev_line_text",
            "current_line","current_sufijo","current_line_text",
            "prev_base","current_base","delta"
        ]

        # Parsing issues CSV (includes line text)
        with (errores_dir / "parsing_issues.csv").open("w", newline="", encoding="utf-8") as f:
            issue_writer = csv.DictWriter(f, fieldnames=issue_columns)
            issue_writer.writeheader()
            issue_writer.writerows(all_issue_rows)

        # Article jumps CSV
        with (errores_dir / "articulo_jumps.csv").open("w", newline="", encoding="utf-8") as f:
            jump_writer = csv.DictWriter(f, fieldnames=jump_columns)
            jump_writer.writeheader()
            jump_writer.writerows(all_jump_rows)

    return manifest

# ------------------------------ Error summary & summary -------------------------------

def create_error_summary(manifest: Dict[str, Any], errores_dir: Path):
    """
    Write ``error_summary.txt`` in ``errores_dir`` from a processing manifest.

    The report starts with the processed-file, warning and error totals and
    then lists, for each file that has issues, every issue with its location,
    upper-cased type, message and (when present) line text.  Does nothing
    when ``errores_dir`` is falsy.  The directory is not created here.
    """
    if not errores_dir:
        return

    def _summary_chunks():
        # Lazily produces the report pieces in the order they are written.
        yield "RESUMEN CONSOLIDADO DE ERRORES Y ADVERTENCIAS\n"
        yield "="*60 + "\n\n"
        yield f"Archivos procesados: {len(manifest.get('processed_files', []))}\n"
        yield f"Total de advertencias: {manifest.get('warnings', 0)}\n"
        yield f"Total de errores: {manifest.get('errors', 0)}\n\n"
        yield "DETALLES POR ARCHIVO:\n"
        yield "-"*40 + "\n"
        for fname, info in manifest.get("files", {}).items():
            file_issues = info.get("issues", [])
            if not file_issues:
                continue
            yield f"\n📁 {fname}:\n"
            for issue in file_issues:
                yield f"  • {issue.get('location', 'N/A')}: "
                yield f"[{issue.get('issue_type', 'warning').upper()}] "
                yield f"{issue.get('message', '')}\n"
                if issue.get("line_text"):
                    yield f"    Línea: {issue['line_text']}\n"

        yield f"\n\nArchivos de error individuales guardados en: {errores_dir}\n"

    with (errores_dir / "error_summary.txt").open("w", encoding="utf-8") as handle:
        handle.writelines(_summary_chunks())

def print_summary(manifest: Dict[str, Any]):
    """
    Print a console report for a processing manifest.

    Shows the number of processed files, the totals per unit type and the
    warning/error counts, followed by two lines per file: its output name and
    law name, then its own unit counts and warning/error tallies.  Missing
    manifest keys are shown as 0 or empty.
    """
    heavy_rule = "="*72

    def _report_lines():
        # Lazily produces the lines in the order they are printed.
        yield heavy_rule
        yield " TXT → JSON Parsing Summary"
        yield heavy_rule
        yield f" Files processed : {len(manifest.get('processed_files', []))}"
        totals = manifest.get("totals", {})
        yield f" Libros         : {totals.get('libros',0)}"
        yield f" Títulos        : {totals.get('titulos',0)}"
        yield f" Capítulos      : {totals.get('capitulos',0)}"
        yield f" Secciones      : {totals.get('secciones',0)}"
        yield f" Artículos      : {totals.get('articulos',0)}"
        yield f" Warnings       : {manifest.get('warnings',0)}"
        yield f" Errors         : {manifest.get('errors',0)}"
        yield "-"*72
        for fname, info in manifest.get("files", {}).items():
            units = info.get("counts", {})
            kinds = [it.get("issue_type") for it in info.get("issues",[])]
            warn_total = kinds.count("warning")
            err_total = kinds.count("error")
            yield f" {fname} → {info.get('output')} ({info.get('law_name','')})"
            yield f"   L:{units.get('libros',0)} T:{units.get('titulos',0)} C:{units.get('capitulos',0)} S:{units.get('secciones',0)} A:{units.get('articulos',0)} | warn:{warn_total} err:{err_total}"
        yield heavy_rule

    for text_line in _report_lines():
        print(text_line)


In [17]:
# ============== FINAL PROCESSING: JSON GENERATION & VALIDATION ==============
# Runs the TXT → JSON pipeline over LEY_DIR and reports the outcome:
#   1. walk_and_process parses each law file, validates article sequences and
#      writes the JSON files, the manifest and the error logs.
#   2. print_summary shows the totals on screen.
#   3. create_error_summary writes the consolidated error report.

section_rule = "=" * 80

print("Starting Final Processing: JSON Structure Generation")
print(section_rule)

manifest = walk_and_process(
    ley_dir=LEY_DIR,           # input: law text files
    out_dir=JSON_DIR,          # output: structured JSON files + manifest
    errores_dir=ERRORES_DIR,   # logs: error reports and CSVs
    catalog_csv=CATALOG_CSV    # metadata: file_num → law name
)

for banner_line in (
    section_rule,
    "PROCESSING COMPLETE - Generating Summary Reports",
    section_rule,
):
    print(banner_line)

# Console summary, then the consolidated report for quality review
print_summary(manifest)
create_error_summary(manifest, ERRORES_DIR)

print("\n **Pipeline Execution Complete!**")
print(f" JSON files: {JSON_DIR}")
print(f" Error reports: {ERRORES_DIR}")
print(f" Processing manifest: {JSON_DIR / 'manifest.json'}")

# Final statistics taken from the manifest
total_articles = manifest.get("totals", {}).get("articulos", 0)
total_warnings = manifest.get("warnings", 0)
total_errors = manifest.get("errors", 0)
# max(..., 1) avoids a division by zero when no article was parsed.
success_rate = (total_articles - total_errors) / max(total_articles, 1) * 100

print("\n **Final Results:**")
print(f"    Total articles parsed: {total_articles:,}")
print(f"    Warnings: {total_warnings}")
print(f"    Errors: {total_errors}")
print(f"    Success rate: {success_rate:.1f}%")

Starting Final Processing: JSON Structure Generation
PROCESSING COMPLETE - Generating Summary Reports
 TXT → JSON Parsing Summary
 Files processed : 179
 Libros         : 15
 Títulos        : 515
 Capítulos      : 2048
 Secciones      : 409
 Artículos      : 17900
 Errors         : 0
------------------------------------------------------------------------
 0001.txt → 0001.json (Constitución Política del Estado de Nuevo León)
   L:0 T:8 C:25 S:15 A:215 | warn:0 err:0
 0002.txt → 0002.json (Código Civil para el Estado de Nuevo León)
   L:4 T:43 C:194 S:0 A:3073 | warn:4 err:0
 0003.txt → 0003.json (Código de Ética para el Congreso del Estado de Nuevo León)
   L:0 T:0 C:7 S:5 A:38 | warn:0 err:0
 0004.txt → 0004.json (Código de Procedimientos Civiles del Estado de Nuevo León)
   L:4 T:18 C:73 S:4 A:1293 | warn:2 err:0
 0005.txt → 0005.json (Código Fiscal del Estado de Nuevo León)
   L:0 T:5 C:8 S:8 A:267 | warn:0 err:0
 0006.txt → 0006.json (Código Penal para el Estado de Nuevo León)
   L