In [None]:
# -*- coding: utf-8 -*-
import io
import re
import zipfile
import warnings
import datetime as dt
from typing import Dict, List, Tuple, Set
import requests
import pandas as pd
import numpy as np

# Dependencias necesarias para parsing (asumidas disponibles o instaladas fuera de este entorno)
from unidecode import unidecode
from bs4 import BeautifulSoup

# Importaciones de Streamlit y Plotly
import plotly.express as px
import plotly.graph_objects as go
import streamlit as st
import openpyxl

# -----------------------------------------------------
# 1. CONFIGURATION AND CONTEXT
# -----------------------------------------------------
# Pick up variables injected by app.py (per the original note). At module
# level locals() is the module namespace, so each lookup falls back to the
# given default when the name was not pre-defined.
# NOTE(review): presumably app.py exec()s this script with those names set — confirm.
palette = locals().get("active_palette", ["#889064", "#ff9f18"])
active_font = locals().get("active_font", "sans-serif")

# DataTur page that is scraped for ZIP download links.
PAGE_NAC   = "https://datatur.sectur.gob.mx/SitePages/Visitantes%20por%20Nacionalidad.aspx"
# Maximum number of bars kept in the chart.
TOP_N_BARS = 15

# Visual identity: the first palette colour is used for the bars.
COLOR_BAR = palette[0]

# -----------------------------------------------------
# 2. UTILIDADES GENERALES
# -----------------------------------------------------
def norm(s):
    """Return *s* as transliterated, lowercase text with dots removed.

    The value is stringified, passed through ``unidecode``, lowercased and
    stripped; every "." is deleted and each run of whitespace (including
    newlines) becomes a single space.
    """
    text = unidecode(str(s))
    text = text.lower().strip().replace(".", "")
    return re.sub(r"\s+", " ", text)

# Month number (1-12) -> lowercase tokens that identify that month: Spanish
# names and abbreviations, a few English abbreviations, and the numeral
# (zero-padded and plain for 1-9, plain only for 10-12).
# month_from_string_any() walks this dict in insertion order and returns the
# first month that has a matching token.
MESES_VARIANTES = {
    1: ["ene", "enero", "jan", "january", "01", "1"], 2: ["feb", "febrero", "02", "2"],
    3: ["mar", "marzo", "03", "3"], 4: ["abr", "abril", "apr", "04", "4"],
    5: ["may", "mayo", "05", "5"], 6: ["jun", "junio", "06", "6"],
    7: ["jul", "julio", "07", "7"], 8: ["ago", "agosto", "aug", "08", "8"],
    9: ["sep", "sept", "septiembre", "set", "setiembre", "09", "9"],
    10: ["oct", "octubre", "10"], 11: ["nov", "noviembre", "11"],
    12: ["dic", "diciembre", "dec", "12"]
}

def month_from_string_any(s: str):
    """Return the month number (1-12) mentioned in *s*, or ``None``.

    The text is normalised with ``norm`` and checked against every token in
    ``MESES_VARIANTES``; the first month (in dict order) with a matching token
    wins.

    Month names only need a non-letter on each side, so labels such as
    "Ene12" still resolve to January. Numeric tokens must additionally not
    touch another digit. Previously any digit run containing "1" or "01"
    (a year like "2012", a value like "13", the month "10") was reported as
    January.
    """
    raw = str(s).strip()
    # Numeric Excel cells often arrive as floats ("3.0"). norm() deletes dots,
    # which would turn that into "30", so drop an all-zero fraction first.
    whole = re.fullmatch(r"(\d+)\.0+", raw)
    if whole:
        raw = whole.group(1)

    s0 = norm(raw)
    for mnum, variants in MESES_VARIANTES.items():
        for v in variants:
            edge = "[^a-z0-9]" if v.isdigit() else "[^a-z]"
            if re.search(rf"(^|{edge}){re.escape(v)}({edge}|$)", s0):
                return mnum
    # The old trailing regex (a bare 1-12 between separators) could never run:
    # every case it accepted is already covered by the numeric tokens above.
    return None

def year_from_string_any(s: str):
    """Return the year found in *s*, or ``None``.

    The first "20xx" substring of the normalised text wins. Otherwise the
    first two-digit group that ends a digit run is read as 20yy, but only
    when yy lies in 12..29.
    """
    s0 = norm(s)

    four_digits = re.search(r"(20\d{2})", s0)
    if four_digits is not None:
        return int(four_digits.group(1))

    two_digits = re.search(r"(\d{2})($|[^0-9])", s0)
    if two_digits is None:
        return None

    yy = int(two_digits.group(1))
    return 2000 + yy if 12 <= yy <= 29 else None

def sex_from_string_any(s: str):
    """Classify a label as "H", "M", "T" or ``None``.

    The male word list is tested first, then the female list, and finally the
    substring "total". Note that a lone "m" is in the male list, so it yields
    "H" rather than "M".
    """
    s0 = norm(s)
    word_patterns = (
        (r"\b(h|hombre|hombres|masc|m)\b", "H"),
        (r"\b(mujer|mujeres|fem|f)\b", "M"),
    )
    for pattern, code in word_patterns:
        if re.search(pattern, s0):
            return code
    return "T" if "total" in s0 else None

def is_accum_en_jun(col: str):
    """Return True when *col* looks like an accumulated January-June header.

    The normalised label must contain one token from each group: accumulated
    ("acum"/"acumulado"), January ("ene"/"enero") and June ("jun"/"junio").
    """
    s0 = norm(col)
    token_groups = (
        ("acum", "acumulado"),
        ("ene", "enero"),
        ("jun", "junio"),
    )
    return all(any(token in s0 for token in group) for group in token_groups)

def clean_number(x):
    """Convert a cell value to ``float``, tolerating "," and " " separators.

    Missing values (anything ``pd.isna`` accepts) are returned unchanged.
    Text that still cannot be parsed after removing commas and spaces is
    handed to ``pd.to_numeric(..., errors="coerce")``.
    """
    if pd.isna(x):
        return x
    # Delete commas and spaces in a single pass.
    compact = str(x).translate(str.maketrans("", "", ", "))
    try:
        return float(compact)
    except Exception:
        return pd.to_numeric(x, errors="coerce")

# -----------------------------------------------------
# 3. ZIP discovery and Download (CACHED)
# -----------------------------------------------------
@st.cache_data(show_spinner=False)
def discover_zip_candidates():
    """Scrape PAGE_NAC for ZIP links and rank them, best candidate first.

    Returns:
        A list of ``(score, url, label)`` tuples sorted by descending score.
        ``label`` is the lowercased link text plus href. The list is empty
        when the page cannot be fetched or contains no ``.zip`` link.

    Scoring favours the historical open-data archives ("datosabiertos",
    "bd_", "sexo", "nac") and penalises summary products ("tarjeta",
    "cuadro", "rat-").
    """
    # Local import keeps this block self-contained; urljoin is standard library.
    from urllib.parse import urljoin

    # (tokens, weight): the weight is applied once if ANY token is in the label.
    # "nac" also covers "nacional", so a single token is enough.
    rules = (
        (("datosabiertos",), 7),
        (("bd_", "bd "), 4),
        (("sexo",), 3),
        (("nac",), 2),
        (("tarjeta",), -7),
        (("cuadro",), -4),
        (("rat-",), -3),
    )

    out = []
    try:
        r = requests.get(PAGE_NAC, timeout=45)
        r.raise_for_status()
        soup = BeautifulSoup(r.text, "html.parser")
        for a in soup.select("a[href]"):
            href = a["href"]
            if ".zip" not in href.lower():
                continue
            txt = (a.get_text() or "").strip()
            # urljoin resolves root-relative, page-relative and
            # protocol-relative hrefs and leaves absolute URLs untouched. The
            # old code only handled hrefs that start with "/".
            url = urljoin(PAGE_NAC, href)
            label = (txt + " " + href).lower()
            score = sum(
                weight
                for tokens, weight in rules
                if any(token in label for token in tokens)
            )
            out.append((score, url, label))
    except Exception as exc:
        # Still best-effort (the caller handles an empty list), but no longer
        # completely silent.
        # NOTE(review): the result is cached by st.cache_data, so an empty list
        # from a transient failure is cached too — consider a ttl.
        warnings.warn(f"discover_zip_candidates: {exc}")
    out.sort(key=lambda item: item[0], reverse=True)
    return out

@st.cache_data(show_spinner=False)
def download_zip(url):
    """Fetch *url* and return the response body as an in-memory binary buffer.

    Raises whatever ``requests`` raises for connection problems, timeouts
    (60 s) or a non-success HTTP status.
    """
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    payload = response.content
    return io.BytesIO(payload)

@st.cache_data(show_spinner=False)
def read_excels_all_sheets(xbytes):
    """Read every sheet of an in-memory Excel workbook, without headers.

    Args:
        xbytes: Binary buffer (or path) accepted by ``pd.ExcelFile``.

    Returns:
        A list with one header-less DataFrame per readable sheet, in workbook
        order. Sheets that fail to load are skipped. A workbook that cannot
        be opened with the openpyxl engine yields an empty list.
    """
    dfs = []
    try:
        # The context manager closes the workbook even if a sheet read fails;
        # the ExcelFile used to be left open.
        with pd.ExcelFile(xbytes, engine="openpyxl") as xls:
            for sh in xls.sheet_names:
                try:
                    dfs.append(pd.read_excel(xls, sheet_name=sh, header=None))
                except Exception:
                    # Best effort: one unreadable sheet must not discard the rest.
                    continue
    except Exception:
        # Unreadable workbook: report "no sheets" to the caller.
        pass
    return dfs

# -----------------------------------------------------
# 4. PARSING LÓGICA (Mantenida del original)
# -----------------------------------------------------

def pick_top_header_rows(df, scan_rows=40, k=3):
    """Pick up to *k* rows among the first *scan_rows* that look like headers.

    Each cell adds 2 points if it names a month, 1 for a year, 1 for a sex
    label and 3 for an accumulated Jan-Jun label. Rows are ranked by
    (score, index) descending. Of the top *k*, those with a positive score
    are kept, plus the top-ranked row whatever its score. Returns the chosen
    row positions in ascending order, or ``[0]`` when nothing was chosen.
    """
    def cell_points(cell):
        # All four detectors run for every cell.
        return (
            2 * bool(month_from_string_any(cell))
            + bool(year_from_string_any(cell))
            + bool(sex_from_string_any(cell))
            + 3 * bool(is_accum_en_jun(cell))
        )

    limit = min(scan_rows, len(df))
    ranked = sorted(
        (
            (sum(cell_points(cell) for cell in df.iloc[pos].astype(str).tolist()), pos)
            for pos in range(limit)
        ),
        reverse=True,
    )
    if not ranked:
        return [0]

    top_pos = ranked[0][1]
    picked = [pos for points, pos in ranked[:k] if points > 0 or pos == top_pos]
    return sorted(picked) if picked else [0]

def build_combo_header_from_rows(df, header_rows):
    """Merge several header rows into one label per column.

    For each column, the stripped non-empty cell texts of the selected rows
    are de-duplicated (first occurrence kept, top-to-bottom order) and joined
    with a newline. A column with no text gets "".

    Returns:
        ``(labels, last_header_row)`` where ``last_header_row`` is the largest
        index in *header_rows*.
    """
    ordered_rows = sorted(set(header_rows))
    frames = [df.iloc[pos].astype(str).tolist() for pos in ordered_rows]

    # Pad to a common width so the rows can be read column-wise.
    width = max(len(frame) for frame in frames)
    frames = [frame + [""] * (width - len(frame)) for frame in frames]

    combo = []
    for column_cells in zip(*frames):
        labels = [cell.strip() for cell in column_cells if cell.strip()]
        # dict.fromkeys removes duplicates while keeping first-seen order.
        combo.append("\n".join(dict.fromkeys(labels)))
    return combo, max(ordered_rows)

def parser_wide_multiheader(df):
    """Parse a wide sheet whose period labels are spread over several header rows.

    The best-scoring header rows are merged into one label per column. The
    first column whose label mentions "pais"/"nacional" is the country
    column. Every other column whose label yields a year plus either a month
    or an accumulated Jan-Jun marker is melted into long form.

    Returns:
        DataFrame with columns Pais, Anio, MesNum, Valor (summed per key), or
        ``None`` when the sheet does not match. MesNum is 0 for accumulated
        Jan-Jun columns.
    """
    # An empty sheet has no row to promote to a header. Bail out here instead
    # of letting the positional row lookup raise IndexError.
    if df.empty:
        return None

    header_rows = pick_top_header_rows(df, scan_rows=40, k=3)
    combo_cols, last_hdr = build_combo_header_from_rows(df, header_rows)

    tmp = df.copy()
    tmp.columns = combo_cols
    tmp = tmp.iloc[last_hdr + 1:].reset_index(drop=True)
    # Keep only the first column for any repeated label.
    tmp = tmp.loc[:, ~tmp.columns.duplicated()].copy()

    pais_col = next((c for c in tmp.columns if "pais" in norm(c) or "nacional" in norm(c)), None)
    if not pais_col:
        return None

    # Header labels are the same for every data row, so decode each column
    # once up front. Previously this ran for every single cell.
    targets = []
    for c in tmp.columns:
        if c == pais_col:
            continue
        label = str(c)
        parts = [p.strip() for p in label.split("\n")] if "\n" in label else [label]
        mes = year = sex = None
        for p in parts:
            mes = mes or month_from_string_any(p)
            year = year or year_from_string_any(p)
            sex = sex or sex_from_string_any(p)
        if not year:
            continue
        if is_accum_en_jun(c):
            targets.append((c, int(year), 0, sex or "T"))
        elif mes:
            targets.append((c, int(year), int(mes), sex or "T"))

    rows = []
    for _, r in tmp.iterrows():
        pais = str(r[pais_col]).strip()
        if not pais or pais.lower() in ("nan", "none", ""):
            continue
        for c, year, mesnum, sex in targets:
            val = clean_number(r[c])
            if pd.isna(val):
                continue
            rows.append([pais, year, mesnum, float(val), sex])

    if not rows:
        return None
    out = pd.DataFrame(rows, columns=["Pais", "Anio", "MesNum", "Valor", "Sexo"])
    # NOTE(review): Sexo is not part of the grouping key, so H, M and Total
    # columns for the same month are all added together — confirm against the
    # real file layout.
    return out.groupby(["Pais", "Anio", "MesNum"], as_index=False)["Valor"].sum()

def parser_wide_monthcols(df):
    """Parse a wide sheet with a single header row and one column per period.

    Among the first 40 rows, the header is the row that contains a
    "pais"/"nacional" cell and the most month or accumulated Jan-Jun labels
    (earliest row on ties). Columns whose label yields a year plus a month or
    accumulated marker are melted into long form.

    Returns:
        DataFrame with columns Pais, Anio, MesNum, Valor (summed per key), or
        ``None`` when the sheet does not match. MesNum is 0 for accumulated
        Jan-Jun columns.
    """
    best_i, best_hits = None, -1
    for i in range(min(40, len(df))):
        cols = df.iloc[i].astype(str).tolist()
        if not any(("pais" in norm(c) or "nacional" in norm(c)) for c in cols):
            continue
        hits = sum(1 for c in cols if month_from_string_any(c) or is_accum_en_jun(c))
        if hits > best_hits:
            best_i, best_hits = i, hits
    if best_i is None:
        return None

    tmp = df.copy()
    tmp.columns = df.iloc[best_i].astype(str)
    tmp = tmp.iloc[best_i + 1:].reset_index(drop=True)
    # Keep only the first column for any repeated label.
    tmp = tmp.loc[:, ~tmp.columns.duplicated()].copy()

    pais_col = next((c for c in tmp.columns if "pais" in norm(c) or "nacional" in norm(c)), None)
    sexo_col = next((c for c in tmp.columns if norm(c) == "sexo"), None)
    if not pais_col:
        return None

    id_cols = [pais_col] + ([sexo_col] if sexo_col else [])

    # Column labels do not change from row to row, so resolve each column's
    # (year, month) once. Previously the regex-heavy helpers ran per cell.
    targets = []
    for c in tmp.columns:
        if c in id_cols:
            continue
        col = str(c)
        y = year_from_string_any(col)
        if not y:
            continue
        if is_accum_en_jun(col):
            targets.append((c, int(y), 0))
            continue
        m = month_from_string_any(col)
        if m:
            targets.append((c, int(y), int(m)))

    rows = []
    for _, r in tmp.iterrows():
        pais = str(r[pais_col]).strip()
        if not pais or pais.lower() in ("nan", "none", ""):
            continue
        sexo = str(r[sexo_col]).strip() if sexo_col else None
        for c, y, m in targets:
            val = clean_number(r[c])
            if pd.isna(val):
                continue
            rows.append([pais, y, m, float(val), sexo or "T"])

    if not rows:
        return None
    out = pd.DataFrame(rows, columns=["Pais", "Anio", "MesNum", "Valor", "Sexo"])
    # NOTE(review): Sexo is not part of the grouping key, so per-sex and total
    # rows for the same country and month are added together — confirm.
    return out.groupby(["Pais", "Anio", "MesNum"], as_index=False)["Valor"].sum()

def parser_long_period(df):
    """Parse a long-format sheet: one row per country and period.

    Among the first 40 rows, the header is the first row that has a
    "pais"/"nacional" cell and also a period, month or year cell. The period
    is read from a "periodo"/"fecha" column when present, otherwise from
    separate year and month columns. Every remaining column with at least one
    numeric value counts as a value column; value columns are summed per row.

    Returns:
        DataFrame with columns Pais, Anio, MesNum, Valor (summed per key), or
        ``None`` when the sheet does not match.
    """
    # norm() transliterates labels through unidecode, so ASCII spellings are
    # enough. The non-ASCII alias that used to be listed could never match.
    year_names = ("anio", "ano", "year")

    target = None
    for i in range(min(40, len(df))):
        cols = [norm(x) for x in df.iloc[i].astype(str).tolist()]
        has_country = any(("pais" in x) or ("nacional" in x) for x in cols)
        has_period = any(("periodo" in x) or (x == "mes") or (x in year_names) for x in cols)
        if has_country and has_period:
            target = i
            break
    if target is None:
        return None

    tmp = df.copy()
    tmp.columns = df.iloc[target].astype(str)
    tmp = tmp.iloc[target + 1:].reset_index(drop=True)
    # Keep only the first column for any repeated label.
    tmp = tmp.loc[:, ~tmp.columns.duplicated()].copy()

    pais_col = per_col = mes_col = anio_col = sexo_col = None
    for c in tmp.columns:
        nc = norm(c)
        if not pais_col and ("pais" in nc or "nacional" in nc):
            pais_col = c
        elif not per_col and ("periodo" in nc or "fecha" in nc):
            per_col = c
        elif not mes_col and nc == "mes":
            mes_col = c
        elif not anio_col and nc in year_names:
            anio_col = c
        elif not sexo_col and nc == "sexo":
            sexo_col = c

    known_cols = [pais_col, per_col, mes_col, anio_col, sexo_col]
    val_cols = [
        c for c in tmp.columns
        if c not in known_cols and pd.to_numeric(tmp[c], errors="coerce").notna().sum() > 0
    ]

    if not pais_col or ((not per_col) and (not mes_col or not anio_col)) or not val_cols:
        return None

    rows = []
    for _, r in tmp.iterrows():
        pais = str(r[pais_col]).strip()
        if not pais or pais.lower() in ("nan", "none", ""):
            continue

        if per_col:
            per = str(r[per_col])
            y = year_from_string_any(per)
            m = month_from_string_any(per)
        else:
            y = pd.to_numeric(r[anio_col], errors="coerce")
            m = month_from_string_any(str(r[mes_col]))

        if pd.isna(y):  # covers both None and NaN
            continue
        y = int(y)

        # Numeric fallback for the month. It only applies when a month column
        # exists: with just a period column, mes_col is None and the old
        # unconditional r[mes_col] lookup raised KeyError.
        if m is None and mes_col is not None:
            m_num = pd.to_numeric(r[mes_col], errors="coerce")
            m = int(m_num) if pd.notna(m_num) and 1 <= int(m_num) <= 12 else None

        if m is None:
            continue

        # Convert each value cell once (it used to be converted twice).
        cell_values = (clean_number(r[vc]) for vc in val_cols)
        total_val = sum(v for v in cell_values if pd.notna(v))
        if total_val == 0:
            continue
        rows.append([pais, y, int(m), total_val])

    if not rows:
        return None
    out = pd.DataFrame(rows, columns=["Pais", "Anio", "MesNum", "Valor"])
    return out.groupby(["Pais", "Anio", "MesNum"], as_index=False)["Valor"].sum()

@st.cache_data(show_spinner="Parseando ZIP a formato largo est√°ndar...")
def parse_zip_to_long_df(zbytes):
    """Extract the first parseable Excel sheet of a ZIP as a long table.

    Archive members are visited in order, skipping names that contain
    "tarjeta" and non-Excel files. Each sheet is offered to the three parsers
    in turn (multi-header wide, month-column wide, long). The search stops at
    the first sheet for which a parser returns a non-empty frame.

    Returns:
        DataFrame with columns Pais, Anio, MesNum, Valor (int, int, float;
        summed per key), or ``None`` when nothing could be parsed.
    """
    excel_suffixes = (".xlsx", ".xls", ".xlsm")
    parsers = (parser_wide_multiheader, parser_wide_monthcols, parser_long_period)

    parsed = []
    with zipfile.ZipFile(zbytes) as archive:
        for member in archive.namelist():
            lowered = member.lower()
            if "tarjeta" in lowered or not lowered.endswith(excel_suffixes):
                continue
            sheets = read_excels_all_sheets(io.BytesIO(archive.read(member)))
            for sheet in sheets:
                # Lazy chain: later parsers only run if earlier ones gave nothing.
                attempts = (parse(sheet) for parse in parsers)
                hit = next(
                    (res for res in attempts if res is not None and not res.empty),
                    None,
                )
                if hit is not None:
                    parsed.append(hit)
                    break
            if parsed:
                break

    if not parsed:
        return None

    df = pd.concat(parsed, ignore_index=True).dropna(subset=["Anio", "MesNum", "Valor"])
    df = df.astype({"Anio": int, "MesNum": int, "Valor": float})
    return df.groupby(["Pais", "Anio", "MesNum"], as_index=False)["Valor"].sum()


# -----------------------------------------------------
# 5. Semestres & Gráfica
# -----------------------------------------------------
def pick_best_year_for_S1(df, min_months=3):
    """Choose the year whose first semester (months 1-6) is best covered.

    Returns:
        ``(year, year - 1, partial)``. If any year has all six months, the
        latest such year is returned with ``partial=False``. Otherwise the
        year with the most months is chosen (latest year on ties), first among
        years with at least *min_months* months, then among years with at
        least one. In both of those cases ``partial=True``.

    Raises:
        RuntimeError: when the data has no rows in months 1-6.
    """
    first_half = df[df["MesNum"].between(1, 6)]
    cov = first_half.groupby("Anio")["MesNum"].nunique().reset_index(name="mcount")
    if cov.empty:
        raise RuntimeError("No hay datos en meses 1..6 en el dataset.")

    complete_years = cov.loc[cov["mcount"] == 6, "Anio"]
    if not complete_years.empty:
        latest = int(complete_years.max())
        return latest, latest - 1, False

    # Try the requested coverage first, then relax it to a single month.
    for threshold in (min_months, 1):
        eligible = cov[cov["mcount"] >= threshold]
        if eligible.empty:
            continue
        top = eligible.sort_values(["mcount", "Anio"], ascending=[False, False]).iloc[0]
        chosen = int(top["Anio"])
        return chosen, chosen - 1, True

    raise RuntimeError("No hay meses disponibles en Ene‚ÄìJun para ning√∫n a√±o.")

def pick_best_year_flexible(df):
    """Pick the first-semester year, requiring 3 months and then only 1.

    Returns the ``(year, previous_year, partial)`` tuple produced by
    ``pick_best_year_for_S1``. Raises RuntimeError when both attempts fail.
    """
    try:
        return pick_best_year_for_S1(df, min_months=3)
    except RuntimeError:
        pass  # retry below with the relaxed threshold

    try:
        return pick_best_year_for_S1(df, min_months=1)
    except RuntimeError:
        raise RuntimeError("Fall√≥ con 3 meses y con 1 mes.")

def S1_year(df, year):
    """Sum ``Valor`` per country over months 1-6 of *year*.

    Returns a DataFrame with columns ``Pais`` and ``S<year>``. It has no rows
    when *year* has no data in that month range.
    """
    label = f"S{year}"
    in_first_half = (df["Anio"] == year) & df["MesNum"].between(1, 6)
    subset = df.loc[in_first_half]
    if subset.empty:
        return pd.DataFrame(columns=["Pais", label])

    totals = subset.groupby("Pais", as_index=False)["Valor"].sum()
    return totals.rename(columns={"Valor": label})

def count_months_S1(df, year):
    """Return how many distinct months in 1-6 have rows for *year*."""
    months = df.loc[df["Anio"] == year, "MesNum"]
    return months[months.between(1, 6)].nunique()

def plot_chart(table, ycur, s1_months=None, parcial=False):
    """Render a horizontal bar chart of the top countries in Streamlit.

    Args:
        table: DataFrame with a ``Pais`` column and an ``S<ycur>`` value column.
        ycur: Year whose ``S<ycur>`` column is plotted.
        s1_months: Number of months included in the semester total. Only used
            for the "partial" title suffix.
        parcial: Whether the semester is incomplete.

    Rows whose country contains "total", "otros" or "no especific" are
    dropped. The TOP_N_BARS largest values are shown, without an average line.
    If the value column is missing, an error is displayed and nothing is drawn.
    """
    dfp = table.copy()

    # 1. Drop aggregate rows (Total / Others / Not specified) in one pass.
    excluded = dfp["Pais"].str.lower().str.contains("total|otros|no especific", na=False)
    dfp = dfp[~excluded]

    value_col = f"S{ycur}"
    if value_col not in dfp.columns:
        st.error(f"No se encontró la columna {value_col} en la tabla para graficar.")
        return

    # Rows without a value cannot be ranked or labelled (int(NaN) raises).
    dfp = dfp.dropna(subset=[value_col])

    # Take the top N, then reverse so the largest bar is drawn at the top.
    dfp = dfp.sort_values(by=value_col, ascending=False).head(TOP_N_BARS).iloc[::-1]

    # 2. Figure.
    fig = go.Figure()
    fig.add_trace(go.Bar(
        x=dfp[value_col],
        y=dfp["Pais"],
        orientation='h',
        marker_color=COLOR_BAR
    ))

    # 3. Title and layout (centred). The partial-semester suffix used to be
    # computed and then never shown; it is now appended to the title.
    sufijo = ""
    if parcial and (s1_months is not None):
        sufijo = f" (PARCIAL, {s1_months} mes{'es' if s1_months != 1 else ''})"

    fig.update_layout(
        title=dict(
            text=f"Top {TOP_N_BARS} Turistas extranjeros por nacionalidad{sufijo}",
            y=0.95,
            x=0.5,
            xanchor='center',
            yanchor='top'
        ),
        xaxis_title="Personas",
        yaxis_title="",
        template="plotly_white",
        font=dict(family=active_font, size=12),
        height=500 + TOP_N_BARS * 15,
        # Extra bottom margin leaves room for the source note.
        margin=dict(t=60, b=100, l=20, r=20)
    )

    # 4. X axis: ",.0f" is the d3 format for thousands grouping with no
    # decimals. The previous "f" is plain fixed-point and does not group.
    fig.update_xaxes(tickformat=",.0f", showgrid=True, gridcolor='#e0e0e0')

    # 5. Value labels at the end of each bar, with a space as thousands separator.
    for pais, val in zip(dfp["Pais"], dfp[value_col]):
        fig.add_annotation(
            x=val,
            y=pais,
            text=f"{int(val):,}".replace(",", " "),
            xanchor='left',
            yanchor='middle',
            showarrow=False,
            font=dict(size=10, color="black"),
            xshift=5
        )

    # 6. Source note, bottom-left in paper coordinates (below the x-axis title).
    fig.add_annotation(
        text="Fuente: Secretaría de Turismo (DataTur)",
        xref="paper", yref="paper",
        x=0,
        y=-0.15,
        showarrow=False,
        xanchor='left',
        yanchor='top',
        font=dict(size=11, color="gray", family=active_font)
    )

    st.plotly_chart(fig, use_container_width=True)


# -----------------------------------------------------
# 6. MAIN FLOW
# -----------------------------------------------------
def main_flow():
    """Run the page: find a DataTur ZIP, parse it, and plot the first-semester ranking.

    Steps: discover candidate ZIP links, download and parse up to the five
    best-scored ones until one yields data, pick the first-semester year to
    show, build the per-country table for that year and the previous one, and
    render the chart. Each unrecoverable failure shows a Streamlit error and
    halts the script with ``st.stop()``.
    """
    # Silences every warning process-wide from this point on.
    warnings.filterwarnings("ignore")

    with st.spinner("1. Buscando el archivo ZIP de DataTur..."):
        candidates = discover_zip_candidates()
        if not candidates:
            st.error("No se encontraron ZIPs v√°lidos en la p√°gina de Nacionalidad de DataTur.")
            st.stop()

        # Try to download and parse the ZIPs in descending score order.
        df = None
        chosen_url = None

        for sc, url, label in candidates[:5]: # Only the five best candidates are tried
            try:
                zbytes = download_zip(url)
                tmp = parse_zip_to_long_df(zbytes)
                if tmp is not None and not tmp.empty:
                    df = tmp
                    chosen_url = url
                    break
            except Exception as e:
                # A failing ZIP is skipped without any message; `e` is unused.
                continue

    if df is None:
        st.error("2. Fallo en el parsing. Ning√∫n archivo en DataTur pudo ser le√≠do correctamente.")
        st.info("Esto puede deberse a un cambio en el formato de los archivos XLSX.")
        st.stop()

    # 3. Choose the best available first semester (S1).
    try:
        ycur, yprev, parcial = pick_best_year_flexible(df)
    except RuntimeError as e:
        st.error(f"3. {e}")
        st.stop()

    # 4. Build the comparative table: one S<year> column per year, outer-joined
    #    on country, with missing totals filled with 0.
    t_cur = S1_year(df, ycur)
    t_prev = S1_year(df, yprev)
    table = pd.merge(t_prev, t_cur, on="Pais", how="outer").fillna(0)

    # 5. Drop aggregate rows (Total / Others / Not specified).
    m_total = table["Pais"].str.lower().str.contains("total", na=False)
    m_otros = table["Pais"].str.lower().str.contains("otros", na=False)
    m_ne    = table["Pais"].str.lower().str.contains("no especific", na=False)
    table = table[~(m_total | m_otros | m_ne)].copy()

    # 6. Count the months actually present in the current S1.
    s1_months = count_months_S1(df, ycur)

    # Show only the file-name part of the chosen URL.
    st.success(f"Datos obtenidos del ZIP: {chosen_url.split('/')[-1]}.")

    # 7. Plot (chart function without the average line).
    st.subheader(f"Top {TOP_N_BARS} Pa√≠ses - Comparativo Semestral")
    plot_chart(table, ycur, s1_months=s1_months, parcial=parcial)


# Entry point: render the page heading and run the flow when executed as a script.
if __name__ == "__main__":
    st.markdown("### üåé Entradas A√©reas por Nacionalidad (DataTur)")
    main_flow()

Buscando ZIPs en DataTur (Nacionalidad)...
[DEBUG] Candidatos ordenados por score:
 - score 9: https://datatur.sectur.gob.mx/Documentos%20compartidos/DatosAbiertos_SIOM_NAC.zip
 - score -2: https://datatur.sectur.gob.mx/Documentos%20compartidos/CUADRO_SIOM_NAC.zip
 - score -2: https://datatur.sectur.gob.mx/Documentos%20compartidos/CUADRO_SIOM_NAC_AERO.zip

Descargando: https://datatur.sectur.gob.mx/Documentos%20compartidos/DatosAbiertos_SIOM_NAC.zip
Parseando ZIP a formato largo estándar...
[DEBUG] Contenido ZIP:
 - BD_Turistas extranjeros Nacionalidad Ene12-Sep25_Prel (FMA)_Sexo.xlsx
[DEBUG] Filas encabezado: [24, 25, 35]
[DEBUG] Ej. columnas combinadas: ['2012', 'Enero', 'Acapulco, Gro.\nAguascalientes, Ags.', 'Italia\nAlemania', 'Europa', 'Hombre\nMujer', '13\n19\n20']
[DEBUG] Cobertura Ene–Jun por año (primeros):
  Anio  mcount
 2012       1

Año elegido S1: 2012 vs 2011 [PARCIAL]


HTML interactivo guardado: entradas_aereas_nacionalidad_semestre.html

Listo ✅
- Excel: entradas_aereas_nacionalidad_semestre.xlsx
- CSV: entradas_aereas_nacionalidad_semestre.csv
- Gráfica: entradas_aereas_nacionalidad_semestre.html
