In [1]:
# !nvidia-smi

In [2]:
import hashlib
import json
import re
import time
import warnings
from typing import List

import ollama
import pandas as pd
from tqdm import tqdm

## Funciones de limpieza y utilidades

In [3]:
def clean_text(text: str) -> str:
    """Normalise a raw cell value into plain, single-spaced text.

    Non-string values are converted first: missing values (as detected by
    ``pd.isna``) become the empty string, anything else goes through ``str()``.
    ``<br>`` tags are replaced by a space, every other ``<...>`` tag is removed,
    and finally all whitespace runs are collapsed to one space and the result
    is stripped.

    Args:
        text: Raw value, usually a string but possibly NaN/None or a number.

    Returns:
        The cleaned text; ``""`` when nothing but markup/whitespace was present.
    """
    if not isinstance(text, str):
        text = "" if pd.isna(text) else str(text)
    # Remove markup BEFORE normalising whitespace: replacing/removing tags can
    # itself introduce leading, trailing or doubled spaces (e.g. "<br>" -> " "),
    # which would otherwise survive and make an empty cell look non-empty.
    text = re.sub(r"<br\s*/?>", " ", text, flags=re.I)
    text = re.sub(r"<[^>]+>", "", text)
    return re.sub(r"\s+", " ", text).strip()


def _split_oversized(sentence: str, max_chars: int) -> List[str]:
    """Break one sentence longer than ``max_chars`` into pieces that fit."""
    pieces = []
    rest = sentence
    while len(rest) > max_chars:
        # Prefer the last space that still keeps the piece within the limit;
        # fall back to a hard cut when the window contains no usable space.
        cut = rest.rfind(" ", 0, max_chars + 1)
        if cut <= 0:
            cut = max_chars
        head = rest[:cut].rstrip()
        if head:
            pieces.append(head)
        rest = rest[cut:].lstrip()
    if rest:
        pieces.append(rest)
    return pieces


def chunk_by_chars(text: str, max_chars: int = 4000) -> List[str]:
    """Split ``text`` into chunks of at most ``max_chars`` characters.

    The text is split on sentence boundaries (whitespace following ``.``, ``!``
    or ``?``) and sentences are packed greedily into chunks. A sentence that is
    itself longer than ``max_chars`` is further divided at word boundaries (or
    cut hard if it has no spaces) so that no returned chunk exceeds the limit.

    Args:
        text: Text to split.
        max_chars: Maximum length of each chunk.

    Returns:
        ``[text]`` unchanged when it already fits; otherwise the list of
        non-empty, stripped chunks in original order.

    Raises:
        ValueError: If ``text`` does not fit and ``max_chars`` is smaller than 1.
    """
    if len(text) <= max_chars:
        return [text]
    if max_chars < 1:
        raise ValueError("max_chars must be >= 1")

    pieces = []
    for sentence in re.split(r"(?<=[\.\!\?])\s+", text):
        if len(sentence) > max_chars:
            pieces.extend(_split_oversized(sentence, max_chars))
        else:
            pieces.append(sentence)

    parts, current, count = [], [], 0
    for piece in pieces:
        # Each piece is budgeted as len + 1 to account for the joining space.
        if current and count + len(piece) + 1 > max_chars:
            parts.append(" ".join(current).strip())
            current, count = [piece], len(piece) + 1
        else:
            current.append(piece)
            count += len(piece) + 1
    if current:
        parts.append(" ".join(current).strip())
    # Trailing whitespace in the input can yield an empty piece; never emit it
    # as a chunk of its own.
    return [p for p in parts if p]


def hash_text(text: str) -> str:
    """Return a short fingerprint of ``text``.

    The text is UTF-8 encoded, hashed with SHA-256, and the first 16
    hexadecimal characters of the digest are returned.
    """
    hasher = hashlib.sha256()
    hasher.update(text.encode("utf-8"))
    full_hex = hasher.hexdigest()
    return full_hex[:16]


def save_minimal(df, out_path, summary_col):
    """Export only the ``id``, ``grado`` and summary columns of ``df`` as CSV.

    Args:
        df: DataFrame holding at least ``id``, ``grado`` and ``summary_col``.
        out_path: Destination passed straight to ``DataFrame.to_csv``.
        summary_col: Name of the column that contains the summaries.

    Raises:
        SystemExit: If any of the three columns is absent from ``df``.
    """
    export_cols = ["id", "grado", summary_col]

    missing = []
    for col in export_cols:
        if col not in df.columns:
            missing.append(col)

    if missing:
        raise SystemExit(f"Faltan columnas para exportar {missing}. Debe haber id, grado y {summary_col}.")

    subset = df[export_cols]
    subset.to_csv(out_path, index=False)

## Función para resumir texto con Ollama

In [4]:
def call_ollama_summary(
    text: str,
    model: str,
    max_words: int = 80,
    temperature: float = 0.2,
    num_predict: int = 240,
    retries: int = 3,
    sleep_sec: float = 1.5,
) -> str:
    """Ask a local Ollama model for a JSON summary ``{"resumen": "..."}``.

    Args:
        text: Text to summarise. Empty or whitespace-only text short-circuits
            to ``""`` without contacting the model.
        model: Ollama model name.
        max_words: Word limit stated in the prompt and enforced on the result
            by truncation.
        temperature: Sampling temperature forwarded in ``options``.
        num_predict: ``num_predict`` value forwarded in ``options``.
        retries: Number of attempts before giving up.
        sleep_sec: Base delay; the wait before the next attempt is
            ``sleep_sec * attempt``.

    Returns:
        The summary with whitespace collapsed and cut to ``max_words`` words,
        or ``""`` when the input is empty or every attempt failed (in the
        latter case a ``RuntimeWarning`` describing the last error is issued).
    """
    # Nothing to summarise: avoid a pointless (and slow) model round-trip.
    if not isinstance(text, str) or not text.strip():
        return ""

    system_prompt = (
        "Eres un asistente que resume texto en ESPAÑOL de forma clara, fiel y concisa. "
        f"Devuelve SOLO un JSON con la clave 'resumen', con un máximo de {max_words} palabras. "
        "Mantén lo esencial (quién, qué, para qué) y nombres propios; elimina relleno, URLs y jerga. "
        "Si la descripción está vacía o irrelevante, devuelve resumen vacío."
    )

    user_prompt = f"Descripción:\n{text}\n\nDevuelve: {{\"resumen\": \"...\"}}"

    last_error = None
    for attempt in range(1, retries + 1):
        try:
            # The system prompt goes through the dedicated `system` parameter
            # rather than hand-written <<SYS>> markers embedded in the prompt.
            resp = ollama.generate(
                model=model,
                prompt=user_prompt,
                system=system_prompt,
                options={"temperature": temperature, "num_predict": num_predict},
                format="json",
                stream=False,
            )
            raw = resp.get("response", "").strip()
            data = json.loads(raw)
            if not isinstance(data, dict):
                raise ValueError(f"expected a JSON object, got {type(data).__name__}")
            resumen = data.get("resumen", "")
            if not isinstance(resumen, str):
                raise ValueError(f"'resumen' is not a string: {type(resumen).__name__}")
            # split()/join collapses whitespace, strips, and truncates in one go.
            return " ".join(resumen.split()[:max_words])
        except Exception as exc:
            # Best effort: remember the failure and retry with a growing delay.
            last_error = exc
            if attempt < retries:
                time.sleep(sleep_sec * attempt)

    if last_error is not None:
        # Surface the failure instead of silently returning an empty summary.
        warnings.warn(
            f"No se pudo obtener el resumen tras {retries} intentos: {last_error!r}",
            RuntimeWarning,
            stacklevel=2,
        )
    return ""


## Parámetros de entrada

In [5]:
# Input CSV that is read at the start of the processing cell.
IN_CSV = "./data/dummy_educacion_test.csv"
# Output CSV written by the periodic checkpoints and by the final save.
OUT_CSV = "./data/dummy_educacion_test_clean.csv"
# Column of the input CSV that holds the text to summarise.
TEXT_COL = "salidas"
# Column where each generated summary is stored.
SUMMARY_COL = "resumen"
# Ollama model name passed to call_ollama_summary.
MODEL = "llama3.1:8b"
# Word limit for the final summary of each row.
MAX_WORDS = 75
# Write a checkpoint every this many processed rows.
SAVE_EVERY = 25
# When True, rows whose summary column already holds non-blank text are skipped.
SKIP_EXISTING = True
# Maximum characters per chunk handed to chunk_by_chars.
MAX_CHARS = 4000

## Procesamiento principal

In [None]:
# Load the input and validate everything up front, so configuration mistakes
# surface before any (slow) model call is made.
df = pd.read_csv(IN_CSV)
if TEXT_COL not in df.columns:
    raise SystemExit(f"La columna '{TEXT_COL}' no existe en el CSV.")

# save_minimal() exports `id` and `grado` alongside the summary column; check
# them now instead of failing at the first checkpoint after many model calls.
missing_export = [c for c in ("id", "grado") if c not in df.columns]
if missing_export:
    raise SystemExit(
        f"Faltan columnas para exportar {missing_export}. Debe haber id, grado y {SUMMARY_COL}."
    )

if SUMMARY_COL not in df.columns:
    df[SUMMARY_COL] = ""
else:
    # A pre-existing but empty column is typically parsed as numeric NaN;
    # use object dtype so string summaries can be assigned cell by cell.
    df[SUMMARY_COL] = df[SUMMARY_COL].astype(object)

# NOTE: SKIP_EXISTING only considers summaries already present in IN_CSV;
# checkpoints written to OUT_CSV are not read back by this cell.
processed = 0

# Row labels equal positions here because read_csv produced the default index.
for i in tqdm(range(len(df)), desc="Resumiendo"):
    existing = df.at[i, SUMMARY_COL]
    if SKIP_EXISTING and isinstance(existing, str) and existing.strip():
        continue

    text = clean_text(df.at[i, TEXT_COL])

    resumen = ""
    if text:
        chunks = chunk_by_chars(text, max_chars=MAX_CHARS)
        if len(chunks) == 1:
            resumen = call_ollama_summary(chunks[0], model=MODEL, max_words=MAX_WORDS)
        else:
            # Map-reduce: summarise each chunk with a smaller word budget,
            # then summarise the concatenation of the partial summaries.
            partial_words = max(40, MAX_WORDS // 2)
            partials = [
                call_ollama_summary(ch, model=MODEL, max_words=partial_words)
                for ch in chunks
            ]
            merged = " ".join(p for p in partials if p).strip()
            # If every partial came back empty there is nothing to reduce;
            # skip the extra model call and leave the summary empty.
            if merged:
                resumen = call_ollama_summary(merged, model=MODEL, max_words=MAX_WORDS)

    df.at[i, SUMMARY_COL] = resumen

    processed += 1
    # Periodic checkpoint so a long run does not lose all progress.
    if processed % SAVE_EVERY == 0:
        save_minimal(df, OUT_CSV, SUMMARY_COL)

Resumiendo:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# Final save: export id, grado and the summary column once more so that rows
# processed after the last periodic checkpoint are written to OUT_CSV.
save_minimal(df, OUT_CSV, SUMMARY_COL)
print(f"✅ Listo → {OUT_CSV}")

# Display the first rows as the cell output for a quick visual check.
df.head()