# Limpieza — LEGÍTIMAS (Sector: <SECTOR>)

**Objetivo**: Normalizar, validar y deduplicar URLs legítimas para su uso en el baseline.

- **Entradas**: `data/raw/legitimas/<sector>/*.csv`
- **Salida**: `data/processed/legitimas/<sector>/legitimas_<sector>_limpio.csv`
- **Última actualización**: YYYY-MM-DD
- **Autor**: Alexis Zapico

**Definición de Hecho (DoD)**  
1) Carga cruda consolidada.  
2) Normalización + validación + deduplicado.  
3) Métricas básicas impresas.  
4) CSV exportado en `processed`.  
5) Log añadido a `docs/daily_log.md`.


In [None]:
# === RUTAS ===
from pathlib import Path

def find_repo_root(start: "Path | None" = None):
    """
    Locate the repository root by walking upwards from ``start``.

    A directory is accepted as the root when it contains an entry named
    ``data``, ``.git`` or ``README.md``. ``start`` itself is checked first,
    followed by its ancestors, for at most 10 candidates in total.

    Args:
        start: Directory to start from. When omitted, the current working
            directory at call time is used.

    Returns:
        The first matching directory, or the resolved current working
        directory when none of the candidates matches.
    """
    # The default is resolved lazily: a ``Path().resolve()`` default in the
    # signature is evaluated only once, when the function is defined, and
    # becomes stale if the working directory changes afterwards.
    p = Path().resolve() if start is None else start
    markers = ("data", ".git", "README.md")
    for _ in range(10):
        if any((p / marker).exists() for marker in markers):
            return p
        if p.parent == p:
            # Filesystem root reached; climbing further would only re-check it.
            break
        p = p.parent
    return Path().resolve()

# Sector handled by this notebook copy.
SECTOR = "<sector>"  # <-- change this when duplicating (banca, cripto, ecommerce, ...)

# Every location below hangs off the detected repository root.
REPO_ROOT = find_repo_root()
DATA_DIR = REPO_ROOT.joinpath("data")
RAW_DIR = DATA_DIR.joinpath("raw", "legitimas", SECTOR)
PROCESSED_DIR = DATA_DIR.joinpath("processed", "legitimas", SECTOR)
OUT_FILE = PROCESSED_DIR.joinpath(f"legitimas_{SECTOR}_limpio.csv")

# Make sure the output folder and the docs folder (used by the log) exist.
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
REPO_ROOT.joinpath("docs").mkdir(exist_ok=True)

# Echo the resolved locations so a wrong root is spotted immediately.
print("REPO_ROOT:", REPO_ROOT)
print("RAW_DIR:", RAW_DIR)
print("PROCESSED_DIR:", PROCESSED_DIR)


In [1]:
# === HELPERS ===
import re
import pandas as pd
from datetime import datetime
from urllib.parse import urlsplit, urlunsplit, unquote
import validators  # pip install validators

def es_url_valida(u: str) -> bool:
    """
    Tell whether ``u`` looks like a usable web URL.

    The answer is True only when ``u`` is a string that, once surrounding
    whitespace is stripped, starts with ``http://`` or ``https://`` and is
    accepted by ``validators.url``. Any exception raised by the validator
    counts as "not valid".
    """
    if isinstance(u, str):
        candidata = u.strip()
        # An empty string cannot start with either prefix, so this single
        # test also rejects blank input.
        if candidata.startswith(("http://", "https://")):
            try:
                return bool(validators.url(candidata))
            except Exception:
                return False
    return False

# Matches one percent-escape (e.g. "%7E"); compiled once for reuse.
_PCT_ESCAPE = re.compile(r"%([0-9A-Fa-f]{2})")
# RFC 3986 "unreserved" characters: the only ones whose percent-escape can be
# decoded without changing what the URL means.
_UNRESERVED = (
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "abcdefghijklmnopqrstuvwxyz"
    "0123456789-._~"
)


def _normalizar_escapes(texto: str) -> str:
    """Decode escapes of unreserved characters; upper-case the hex of the rest."""
    def _reemplazo(match):
        caracter = chr(int(match.group(1), 16))
        if caracter in _UNRESERVED:
            return caracter
        return "%" + match.group(1).upper()

    return _PCT_ESCAPE.sub(_reemplazo, texto)


def normalizar_url(u: str) -> str:
    """
    Return a canonical form of ``u`` suitable for validation and dedup.

    Steps:
    - remove every whitespace character present in the raw text
    - lower-case scheme and network location
    - decode percent-escapes of unreserved characters only (``%7E`` -> ``~``)
      and upper-case the hex digits of the remaining escapes
    - rebuild the URL without its fragment (``#...``)

    Args:
        u: Raw URL text.

    Returns:
        The normalised URL; ``""`` when ``u`` is not a string; the
        whitespace-stripped input when it cannot be parsed.
    """
    if not isinstance(u, str):
        return ""
    u = re.sub(r"\s+", "", u)
    # Parse BEFORE touching percent-escapes. Decoding the whole string first
    # turns "%23" / "%3F" / "%2F" into real delimiters (truncating or
    # restructuring the URL) and turns "%20" into a space that the whitespace
    # removal would then delete, silently producing a different URL.
    try:
        sp = urlsplit(u)
    except ValueError:
        return u
    scheme = (sp.scheme or "").lower()
    netloc = (sp.netloc or "").lower()
    path = _normalizar_escapes(sp.path or "")
    query = _normalizar_escapes(sp.query or "")
    return urlunsplit((scheme, netloc, path, query, ""))


In [None]:
# === ROBUST RAW LOAD ===
# Accepts the .csv extension in any capitalisation, several encodings and
# several possible names for the URL column.
CANDIDATES = ["url", "URL", "Url", "enlace", "link", "href"]

# Select files by lower-cased suffix instead of globbing "*.csv" and "*.CSV"
# separately: where glob matching is case-insensitive (e.g. Windows) both
# patterns return the same files, so each CSV would be loaded twice, and
# mixed-case suffixes such as ".Csv" would be missed where it is case-sensitive.
files = sorted(
    p
    for p in (RAW_DIR.iterdir() if RAW_DIR.is_dir() else ())
    if p.is_file() and p.suffix.lower() == ".csv"
)
print("Buscando en:", RAW_DIR.resolve())
print("Encontrados:", [p.name for p in files])
assert files, f"No hay CSV en {RAW_DIR}. Revisa la ruta/nombres."

def read_csv_tolerant(path: Path):
    """
    Read a CSV with pandas, retrying with progressively more lenient options.

    Attempts, in order:
    1. pandas defaults
    2. UTF-8 with the python engine, skipping malformed lines
    3. Latin-1 with the python engine, skipping malformed lines

    Args:
        path: Location of the CSV file.

    Returns:
        The DataFrame produced by the first attempt that succeeds.

    Raises:
        RuntimeError: When every attempt fails. The error of the last attempt
            is attached as ``__cause__`` so the real reason is not lost.
    """
    attempts = (
        dict(),  # pandas defaults
        dict(encoding="utf-8", engine="python", on_bad_lines="skip"),
        dict(encoding="latin-1", engine="python", on_bad_lines="skip"),
    )
    last_error = None
    for args in attempts:
        try:
            return pd.read_csv(path, **args)
        except Exception as exc:
            # Deliberately broad: any failure just triggers the next, more
            # lenient attempt. The error is kept for the final report.
            last_error = exc
    raise RuntimeError(
        f"No se pudo leer {path.name} con los encodings probados."
    ) from last_error

# Accumulators: loaded frames, (file name, row count) pairs for the files that
# were read, and (file name, error text) pairs for the files that were skipped.
dfs, per_file_counts, skipped = [], [], []
for f in files:
    try:
        d = read_csv_tolerant(f)
        # First candidate name (in CANDIDATES order) present in this file.
        col_url = next((c for c in CANDIDATES if c in d.columns), None)
        if not col_url:
            raise KeyError(f"{f.name} sin columna URL reconocida {CANDIDATES}")
        # Keep only the URL column under the common name "url", and remember
        # which file each row came from.
        d = d[[col_url]].rename(columns={col_url: "url"}).copy()
        d["__source_file"] = f.name
        dfs.append(d)
        per_file_counts.append((f.name, len(d)))
    except Exception as e:
        # Best effort: a file that cannot be read or has no URL column is
        # reported and skipped so the remaining files are still processed.
        print(f"[WARN] Saltando {f.name}: {e}")
        skipped.append((f.name, str(e)))

# Stop here if no file produced usable data.
assert dfs, "Ningún CSV válido. Revisa los [WARN] y corrige."
df_raw = pd.concat(dfs, ignore_index=True)

# Per-file summary followed by the consolidated raw row count.
print("Archivos leídos:")
for n, k in per_file_counts:
    print(f" - {n}: {k} filas")
print("TOTAL filas crudas:", len(df_raw))
# Last expression of the cell: the notebook displays a 3-row preview.
df_raw.head(3)


In [None]:
# === CLEANING ===
# Work on a fresh frame whose "url" column holds the normalised URLs
# (whitespace, %xx, scheme/host); df_raw itself is left untouched.
df = df_raw.assign(url=df_raw["url"].map(normalizar_url))

# Row-wise validity flag (http/https + syntax check).
_mascara_valida = df["url"].map(es_url_valida)

# Keep valid rows, drop explicit nulls, remove exact duplicate URLs and
# renumber the index.
df = (
    df[_mascara_valida]
    .dropna(subset=["url"])
    .drop_duplicates(subset=["url"])
    .reset_index(drop=True)
)

print("Filas tras limpieza:", len(df))
df.head(5)


In [None]:
# === QUICK METRICS ===
_urls_limpias = df["url"]
# Share of cleaned URLs using https, as a percentage.
_pct_https = _urls_limpias.str.startswith("https://").mean()*100
# Mean number of characters per cleaned URL.
_longitud_media = _urls_limpias.str.len().mean()

resumen = {
    "filas_crudas": len(df_raw),
    "filas_limpias": len(df),
    "%https": round(_pct_https, 2),
    "longitud_media": round(_longitud_media, 2),
}
resumen


In [None]:
# Ten most frequent hosts among the cleaned URLs.
# The host ends at the first "/", "?" or "#"; the pattern must not require a
# trailing "/" or URLs without a path (e.g. "https://example.com") would be
# extracted as NaN and silently left out of the count.
tmp = df["url"].str.extract(r"^https?://([^/?#]+)", expand=False).str.lower()
tmp.value_counts().head(10)


In [None]:
# === EXPORT ===
# Write the cleaned frame (without the index column) to the processed folder.
df.to_csv(OUT_FILE, index=False)
print("Guardado en:", OUT_FILE)

# === LOG (docs/daily_log.md) ===
# One line per run: date | task | raw row count | clean row count | output path.
log_line = (
    f"{datetime.now():%Y-%m-%d} | limpieza_legitimas_{SECTOR} | "
    f"crudo={len(df_raw)} | limpio={len(df)} | out={OUT_FILE}\n"
)
# Append with an explicit UTF-8 encoding: without it the platform's locale
# encoding is used, which can fail or mis-encode non-ASCII characters in the
# sector name or output path (notably on Windows).
with open(REPO_ROOT / "docs" / "daily_log.md", "a", encoding="utf-8") as f:
    f.write(log_line)

print("Log añadido:", log_line.strip())
