# Phase 3: classification script

In [None]:
from copy import copy
import json
import math
import os
import re
import time
import unicodedata

import jsonx  # NOTE(review): nothing visible in this file uses jsonx, while main() calls json.load; likely a typo for json, confirm and drop
from openpyxl import load_workbook
from openpyxl.styles.borders import Border, Side
from openpyxl.utils import get_column_letter
from openpyxl.utils import get_column_letter, range_boundaries
import pandas as pd
from rapidfuzz import fuzz
from ultralytics import YOLO

from expresiones import OCR_PATTERNS



# ——— PATHS and THRESHOLDS ———
# NOTE(review): all paths below are absolute and machine-specific; they must be
# edited before running this script on another machine.
# Weights file handed to YOLO() in main().
MODEL_PATH      = r'C:\Users\juans\Documents\proarchitecg\version_2_docker\model_clasification_image_v2\runs\classify\person_cls\weights\best.pt'
# Root directory walked by find_persona_images() to collect the page images.
IMAGES_ROOT     = r'C:\Users\juans\Documents\proarchitecg\version_2_docker\imagenes_por_doc'
# Root directory walked by build_ocr_map() to index the OCR JSON files.
OCR_ROOT        = r'C:\Users\juans\Documents\proarchitecg\version_2_docker\ocr_por_doc'
# Consolidated Excel report written by main() with one row per classified image.
OUTPUT_FILE     = r'C:\Users\juans\Documents\proarchitecg\version_2_docker\resultados_completos_v_final.xlsx'
# Excel template loaded for every control sheet.
TEMPLATE_PATH   = r'C:\Users\juans\Downloads\dev_prev\FORMATO HOJA DE CONTROL DOCUMENTAL.xlsx'
# Directory where the per-person control sheets are saved (created by main()).
OUTPUT_DIR_CTRL = r'C:\Users\juans\Documents\proarchitecg\version_2_docker\model_clasification_image_v2\answer\hojas_control'

# Minimum top-class probability for visual_predict(strict=True) to accept a label.
CONF_THRESH = 0.5
# NOTE(review): not referenced anywhere in the visible code; confirm whether it is still needed.
SCORING_MIN = 2

In [None]:
def find_persona_images(root):
    """Group the image files found under *root* by their containing folder.

    Walks the whole tree below *root*. Every directory that directly holds at
    least one ``.png``/``.jpg``/``.jpeg`` file (case-insensitive extension)
    contributes its images under the key ``normalize_text(<folder name>)``.
    Folders whose normalized names collide are merged into one list.

    Returns:
        dict mapping normalized folder name -> list of full image paths.
    """
    image_exts = ('.png', '.jpg', '.jpeg')
    grouped = {}
    for folder, _subdirs, filenames in os.walk(root):
        found = []
        for name in filenames:
            if name.lower().endswith(image_exts):
                found.append(os.path.join(folder, name))
        if not found:
            # Directory without images: nothing to register.
            continue
        key = normalize_text(os.path.basename(folder))
        grouped.setdefault(key, []).extend(found)
    return grouped

def build_ocr_map(root):
    """Index every ``.json`` file below *root* by its normalized base name.

    The key is ``normalize_text`` applied to the file name without extension;
    the value is the full path. When two files normalize to the same key, the
    one visited later by ``os.walk`` wins.

    Returns:
        dict mapping normalized file stem -> full path of the JSON file.
    """
    index = {}
    for folder, _subdirs, filenames in os.walk(root):
        json_names = (n for n in filenames if n.lower().endswith('.json'))
        for name in json_names:
            stem, _ext = os.path.splitext(name)
            index[normalize_text(stem)] = os.path.join(folder, name)
    return index

def match_json_for_persona(json_map, persona, thresh=70):
    """Find the OCR JSON path that corresponds to *persona*.

    Args:
        json_map: dict of key -> JSON path (as built by ``build_ocr_map``).
        persona: key to look up.
        thresh: minimum ``fuzz.partial_ratio`` score accepted for a fuzzy match.

    Returns:
        The path stored for the exact key if present; otherwise the path of
        the key with the highest partial-ratio score when that score reaches
        *thresh*; otherwise ``None``.
    """
    # 1) exact lookup
    if persona in json_map:
        return json_map[persona]

    # 2) fuzzy lookup: keep the first key with the strictly highest score
    best_score, best_key = 0, None
    for key in json_map:
        score = fuzz.partial_ratio(persona, key)
        if score > best_score:
            best_score, best_key = score, key

    # best_key stays None when the map is empty or every score is 0; without
    # this guard a thresh <= 0 would index json_map[None] and raise KeyError.
    if best_key is not None and best_score >= thresh:
        return json_map[best_key]
    return None

# ── AUXILIARES ────────────────────────────────────────────────────────────────

def extract_page_number(fn: str) -> float:
    """Extract the page number encoded in an image file name or path.

    Callers pass full paths, so the file name (last path component, split on
    either ``/`` or ``\\``) is inspected before the rest of the path; otherwise
    digits in a directory name such as ``version_2_docker`` would be taken as
    the page number.

    Search order, first hit wins:
      1. ``pagina<sep?><digits>`` in the file name
      2. ``pagina<sep?><digits>`` anywhere in the path
      3. first run of digits in the file name
      4. first run of digits anywhere in the path

    Returns:
        The page number as ``int``, or ``math.nan`` when no digits are found.
    """
    basename = re.split(r'[\\/]', fn)[-1]
    for pattern in (r'pagina[_-]?(\d+)', r'(\d+)'):
        for candidate in (basename, fn):
            m = re.search(pattern, candidate, re.IGNORECASE)
            if m:
                return int(m.group(1))
    return math.nan

def compile_dict(raw_dict):
    """Turn a ``label -> pattern(s)`` mapping into ``label -> [compiled regex]``.

    Each value may be a single pattern or a list of patterns. Entries that are
    already ``re.Pattern`` objects are kept as they are; anything else is
    wrapped in word boundaries and compiled with IGNORECASE and VERBOSE.

    Returns:
        dict with the same labels, each mapped to a list of compiled patterns.
    """
    def _as_regex(pat):
        if isinstance(pat, re.Pattern):
            return pat
        return re.compile(rf"\b{pat}\b", re.IGNORECASE|re.VERBOSE)

    compiled = {}
    for label, spec in raw_dict.items():
        if not isinstance(spec, list):
            spec = [spec]
        compiled[label] = [_as_regex(p) for p in spec]
    return compiled

def normalize_text(txt: str) -> str:
    """Return *txt* as lowercase ASCII with whitespace collapsed.

    Steps: NFKD-decompose, drop every non-ASCII code point (which strips
    accents), replace each whitespace run with one space, trim both ends and
    lowercase.
    """
    decomposed = unicodedata.normalize("NFKD", txt)
    ascii_only = decomposed.encode("ASCII","ignore").decode()
    collapsed = re.sub(r'\s+',' ', ascii_only)
    return collapsed.strip().lower()

def fuzzy_ocr_label(txt: str, label_patterns: dict, threshold: int=75):
    """Pick the label whose pattern string is most similar to *txt*.

    Every pattern string of every label is scored with
    ``fuzz.partial_ratio(txt, pattern)``; the first strictly highest score wins.

    Returns:
        ``(label, score / 100)`` when the best score reaches *threshold*,
        otherwise ``("", 0.0)``.
    """
    winner, top = "", 0.0
    candidates = (
        (label, pat)
        for label, pats in label_patterns.items()
        for pat in pats
    )
    for label, pat in candidates:
        similarity = fuzz.partial_ratio(txt, pat)
        if similarity > top:
            winner, top = label, similarity
    if top < threshold:
        return "", 0.0
    return winner, top/100.0

def visual_predict(model, img_path, strict=True):
    """Classify one image with *model* and return its top class.

    Runs ``model.predict`` on CPU in classify mode and reads the ``probs`` of
    the first result (``probs.data.tolist()`` when a ``data`` attribute exists,
    otherwise ``probs`` is treated as a plain iterable).

    Returns:
        ``(label, confidence)`` for the highest probability, or
        ``(None, confidence)`` when there are no probabilities (confidence
        0.0) or, with ``strict=True``, when the top probability is below
        ``CONF_THRESH``.
    """
    result = model.predict(source=img_path, device='cpu', task='classify', verbose=False)[0]
    probs = getattr(result, 'probs', None)
    if hasattr(probs, 'data'):
        scores = probs.data.tolist()
    else:
        scores = list(probs or [])

    if not scores:
        return None, 0.0

    top = max(scores)
    if strict and top < CONF_THRESH:
        return None, top
    return model.names[scores.index(top)], top

# Module-level table label -> list of compiled patterns, built once at import
# time from OCR_PATTERNS; classify() reads this global on every call.
OCR_REGEX = compile_dict(OCR_PATTERNS)

def classify(text, model, img_path):
    """Classify one page, trying four layers in order and stopping at the first hit.

    Layers:
      1. ``"visual"``: strict visual prediction from *model* on *img_path*.
      2. ``"ocr_scoring"``: count how many patterns of each label match the
         normalized text; the label with the most hits wins if it reaches at
         least half of its patterns (minimum 1). Confidence is hits / patterns.
      3. ``"ocr_regex"``: first label (in ``OCR_REGEX`` order) with any
         matching pattern, confidence 1.0.
      4. ``"ocr_fuzzy"``: fuzzy similarity between the text and the pattern
         source strings.

    Returns:
        ``(label, confidence, layer)``; ``("", 0.0, "none")`` when nothing matched.
    """
    cleaned = normalize_text(text or "")

    # Layer 1: strict visual prediction
    visual_label, visual_conf = visual_predict(model, img_path, strict=True)
    if visual_label:
        return visual_label, visual_conf, "visual"

    # Layer 2: per-label hit count against a per-label threshold
    hits = {
        doc_type: sum(1 for rx in regexes if rx.search(cleaned))
        for doc_type, regexes in OCR_REGEX.items()
    }
    required = {
        doc_type: max(1, len(regexes)//2)
        for doc_type, regexes in OCR_REGEX.items()
    }
    top_type, top_hits = max(hits.items(), key=lambda pair: pair[1])
    if top_hits >= required[top_type]:
        return top_type, top_hits/len(OCR_REGEX[top_type]), "ocr_scoring"

    # Layer 3: any single regex hit; the counts above already record
    # which labels matched at least once
    for doc_type, count in hits.items():
        if count:
            return doc_type, 1.0, "ocr_regex"

    # Layer 4: fuzzy comparison against the raw pattern strings
    pattern_sources = {
        doc_type: [rx.pattern for rx in regexes]
        for doc_type, regexes in OCR_REGEX.items()
    }
    fuzzy_label, fuzzy_conf = fuzzy_ocr_label(cleaned, pattern_sources)
    if fuzzy_label:
        return fuzzy_label, fuzzy_conf, "ocr_fuzzy"
    return "", 0.0, "none"

def copy_row_format(ws, src_row: int, tgt_row: int, max_col: int = 13, row_height: float = 48):
    """Make row *tgt_row* look like row *src_row* in worksheet *ws*.

    - Sets the target row height to *row_height*.
    - For columns 1..*max_col*, copies font, fill, number format, protection
      and alignment from each styled source cell, and always applies a thin
      black border on all four sides of the target cell.
    - Replicates on the target row every merged range that lies entirely on
      the source row.
    """
    ws.row_dimensions[tgt_row].height = row_height

    edge = Side(border_style="thin", color="000000")
    box = Border(left=edge, right=edge, top=edge, bottom=edge)
    style_attrs = ("font", "fill", "number_format", "protection", "alignment")

    for col_idx in range(1, max_col + 1):
        source_cell = ws.cell(row=src_row, column=col_idx)
        target_cell = ws.cell(row=tgt_row, column=col_idx)
        if source_cell.has_style:
            for attr in style_attrs:
                setattr(target_cell, attr, copy(getattr(source_cell, attr)))
        # The border is forced even when the source cell carries no style.
        target_cell.border = box

    # Snapshot first: merging below adds new ranges to ws.merged_cells.
    single_row_merges = [
        rng for rng in ws.merged_cells.ranges
        if rng.min_row == src_row and rng.max_row == src_row
    ]
    for rng in single_row_merges:
        first_col = get_column_letter(rng.min_col)
        last_col = get_column_letter(rng.max_col)
        ws.merge_cells(f"{first_col}{tgt_row}:{last_col}{tgt_row}")

def remove_holes(ws, hole_ranges):
    """
    Unmerge and clear EVERYTHING inside the ranges listed in hole_ranges
    (e.g. ["B57:B59","C57:F59","D60:F60"]).

    Any merged range of *ws* that intersects one of the given ranges is
    unmerged as a whole; afterwards the value of every cell inside the given
    ranges is set to None.
    """
    # Each entry is (min_col, min_row, max_col, max_row).
    boxes = [range_boundaries(rng) for rng in hole_ranges]

    def _touches_hole(merged):
        # Two rectangles intersect when they overlap on both axes.
        return any(
            merged.max_row >= top and merged.min_row <= bottom
            and merged.max_col >= left and merged.min_col <= right
            for left, top, right, bottom in boxes
        )

    # Collect coordinates first so the merged-cell set is not mutated
    # while it is being scanned.
    doomed = [m.coord for m in list(ws.merged_cells.ranges) if _touches_hole(m)]
    for coord in doomed:
        ws.unmerge_cells(coord)

    # Wipe the contents of the holes themselves.
    for rng in hole_ranges:
        for cells in ws[rng]:
            for cell in cells:
                cell.value = None


def _find_footer_row(ws, start_row, fallback_offset=38):
    """Return the row of the first cell containing "NOMBRE Y APELLIDOS" at or
    below *start_row*, or ``start_row + fallback_offset`` when none is found."""
    for row in ws.iter_rows(min_row=start_row, max_row=ws.max_row):
        for cell in row:
            if isinstance(cell.value, str) and "NOMBRE Y APELLIDOS" in cell.value.upper():
                return cell.row
    return start_row + fallback_offset


def _page_cell_value(posicion):
    """Return *posicion* as ``int``, or ``None`` when it is missing/NaN."""
    return None if pd.isna(posicion) else int(posicion)


def generate_control_sheet(df_perso, persona):
    """Write the control sheet for one person from the Excel template.

    Args:
        df_perso: DataFrame with at least the columns ``posicion`` and
            ``predicted``, one row per classified page.
        persona: name used to build the output file name
            ``<persona>_hoja_control.xlsx`` inside ``OUTPUT_DIR_CTRL``.

    Behaviour:
        - Does nothing (besides printing a notice) if the output file exists.
        - Clears three fixed "hole" ranges of the template.
        - Locates the footer row and the template rows that have a value in
          column A; if there are more pages than such rows, inserts the
          missing rows just above the footer, formatted like the last
          content row.
        - Writes one row per page starting at row 18, ordered by ``posicion``:
          running index in A, predicted label in E, page number in F and G.

    Raises:
        RuntimeError: if the template has no non-empty cell in column A
            between the start row and the footer.
    """
    out = os.path.join(OUTPUT_DIR_CTRL, f"{persona}_hoja_control.xlsx")
    if os.path.exists(out):
        print(f"⚠️ Ya existe hoja de control para '{persona}', omitiendo.")
        return

    wb = load_workbook(TEMPLATE_PATH)
    ws = wb.active
    START_ROW = 18

    # 0) remove the specific holes before anything else
    holes = ["B57:B59", "C57:F59", "D60:F60"]
    remove_holes(ws, holes)

    # 1) locate the "NOMBRE Y APELLIDOS" footer
    footer = _find_footer_row(ws, START_ROW)

    # 2) find the last row with real data in column A
    content_rows = [
        r for r in range(START_ROW, footer)
        if ws.cell(row=r, column=1).value not in (None, "")
    ]
    if not content_rows:
        raise RuntimeError("No encontré filas con contenido en la plantilla.")
    last_content = content_rows[-1]

    # 3) insert extra rows, copying the format of last_content
    template_n = len(content_rows)
    n_pages    = len(df_perso)
    if n_pages > template_n:
        extras = n_pages - template_n
        ws.insert_rows(footer, amount=extras)
        for dst in range(footer, footer + extras):
            copy_row_format(ws, last_content, dst, max_col=13, row_height=48)

    # 4) dump the data contiguously
    for idx, rec in enumerate(df_perso.sort_values('posicion').itertuples(), start=1):
        r = START_ROW + idx - 1
        # Pages whose number could not be extracted arrive as NaN; int(NaN)
        # raises ValueError, so those cells are passed None (openpyxl then
        # leaves the cell untouched) instead of aborting the whole sheet.
        page = _page_cell_value(rec.posicion)
        ws.cell(row=r, column=1, value=idx)            # A
        ws.cell(row=r, column=5, value=rec.predicted)  # E
        ws.cell(row=r, column=6, value=page)           # F
        ws.cell(row=r, column=7, value=page)           # G

    wb.save(out)
    print("✅ Control inmediato:", out)
# ── MAIN ─────────────────────────────────────────────────────────────────────

def main():
    """Classify every page image per person and export the results.

    Pipeline:
      1. Load the YOLO model, index the OCR JSON files and group the images
         by person folder.
      2. For each person: load the matching OCR JSON (if any), classify each
         image in page order and immediately generate that person's control
         sheet via ``generate_control_sheet``.
      3. Write the consolidated table to ``OUTPUT_FILE``.
      4. Generate one control sheet per person from the consolidated table
         (only rows with a non-empty prediction).
    """
    os.makedirs(OUTPUT_DIR_CTRL, exist_ok=True)
    model = YOLO(MODEL_PATH)

    json_map       = build_ocr_map(OCR_ROOT)
    persona_images = find_persona_images(IMAGES_ROOT)

    all_rows, gid = [], 1

    for pkey, img_paths in persona_images.items():
        print(f"\nProcesando persona '{pkey}' con {len(img_paths)} imágenes…")
        texts        = {}
        persona_rows = []  # rows collected for this person only

        # load the OCR JSON if one matches this person
        jpath = match_json_for_persona(json_map, pkey)
        if jpath:
            with open(jpath, encoding='utf-8') as f:
                data = json.load(f)
            recs = data if isinstance(data, list) else [data]
            for rec in recs:
                pg   = rec.get("pagina")
                imgf = rec.get("imagen", "")
                txt  = rec.get("texto", "")
                # The text is indexed both by page number and by image name.
                if pg is not None:
                    texts[pg] = txt
                if imgf:
                    texts[os.path.basename(imgf)] = txt

        # classify the images in page order; the page number is computed
        # once per image, and NaN pages are sent to the end because NaN
        # never compares as smaller or larger and would leave the sort
        # order undefined.
        paged = [(extract_page_number(p), p) for p in img_paths]
        paged.sort(key=lambda item: (True, 0) if math.isnan(item[0]) else (False, item[0]))
        for pg, img in paged:
            txt = texts.get(os.path.basename(img)) or texts.get(pg, "")
            lbl, sc, ly = classify(txt, model, img)

            rec = {
                'id':        gid,
                'persona':   pkey,
                'imagen':    img,
                'posicion':  pg,
                'predicted': lbl,
                'score':     sc,
                'layer':     ly
            }
            all_rows.append(rec)
            persona_rows.append(rec)
            gid += 1

        # as soon as this person is done, generate their control sheet
        if persona_rows:
            df_perso = pd.DataFrame(persona_rows)
            df_perso['correct'] = df_perso['persona'] == df_perso['predicted']
            generate_control_sheet(df_perso, pkey)

    # An empty DataFrame has no 'persona' column, so the export below would
    # fail with KeyError; stop cleanly when nothing was classified.
    if not all_rows:
        print("\n⚠️ No se encontraron imágenes; no hay nada que exportar.")
        return

    # ─── Export and control sheets ────────────────────────────
    print("\n⏳ Generando DataFrame y guardando", OUTPUT_FILE)
    df = pd.DataFrame(all_rows)
    df['correct'] = df['persona'] == df['predicted']
    df.to_excel(OUTPUT_FILE, index=False)
    print("✅ Consolidado en", OUTPUT_FILE)

    # 4) Generate dynamic control sheets
    # NOTE(review): this pass saves to the same file names used by
    # generate_control_sheet() above and therefore overwrites those sheets
    # (without remove_holes and without the column A index). Confirm which of
    # the two outputs is the intended one.
    START_ROW = 18
    for persona, grp in df[df['predicted']!=""].groupby('persona'):
        wb = load_workbook(TEMPLATE_PATH)
        ws = wb.active

        # 4.1) find the footer row by searching for "NOMBRE Y APELLIDOS"
        footer = None
        for row in ws.iter_rows(min_row=START_ROW, max_row=ws.max_row):
            for c in row:
                if isinstance(c.value, str) and "NOMBRE Y APELLIDOS" in c.value.upper():
                    footer = c.row
                    break
            if footer:
                break
        if not footer:
            footer = START_ROW + 38  # fixed fallback

        template_n = footer - START_ROW
        n_pages    = len(grp)
        # if the template is short of rows, insert them copying the format
        if n_pages > template_n:
            extra = n_pages - template_n
            ws.insert_rows(footer, amount=extra)
            src = footer - 1
            for i in range(extra):
                dst = footer + i
                ws.row_dimensions[dst].height = ws.row_dimensions[src].height
                for col in (5,6,7):
                    s = ws.cell(row=src, column=col)
                    d = ws.cell(row=dst, column=col)
                    d.font          = copy(s.font)
                    d.border        = copy(s.border)
                    d.fill          = copy(s.fill)
                    d.alignment     = copy(s.alignment)
                    d.number_format = s.number_format

        # 4.2) write one row per page
        for idx, rec in enumerate(grp.sort_values('posicion').itertuples()):
            r = START_ROW + idx
            # int(NaN) raises ValueError; pages without a number pass None,
            # which makes openpyxl leave the cell as it is.
            page = None if pd.isna(rec.posicion) else int(rec.posicion)
            ws.cell(row=r, column=5, value=rec.predicted)
            ws.cell(row=r, column=6, value=page)
            ws.cell(row=r, column=7, value=page)

        out = os.path.join(OUTPUT_DIR_CTRL, f"{persona}_hoja_control.xlsx")
        wb.save(out)
        print("✅ Control:", out)


# Run the pipeline only when this file is executed as a script, not on import.
if __name__=="__main__":
    main()