1. Importar librerías y definir funciones base

In [0]:
%pip install openpyxl

In [0]:
%restart_python

In [0]:
import re
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, NumericType
from pyspark.sql import functions as F

# Spark session: getOrCreate() returns the active session or builds a new one.
spark = SparkSession.builder.getOrCreate()

# Path of the Excel workbook that read_cuadro opens for every sheet.
excel_path = "/Volumes/workspace/default/pncc/20250527162011hk9xzLjtlLyIqA5fF0FY3udjjRUQlTkq.xlsx"

# Lower-case labels used to recognise the header row of a sheet: a row is
# treated as the header as soon as one of its cells equals one of these.
header_list = [
    "departamento",
    "vehículo",
    "vehiculo",
    "mes de ocurrencia",
    "día de la semana",
    "día de ocurrencia",
    "hora de ocurrencia",
    "condición del conductor y sexo",
    "grupos de edad",
    "tipo de vehículo y sexo de la persona",
    "zona de ocurrencia",
    "tipo de accidente y sexo",
    "tipo",
    "clase",
    "marca",
    "modelo de vehículo",
    "sexo",
    "grupo",
    "edad",
    "tipo de vehículo",
    "clases de vehículos",
    "color de vehículo",
]

# Locate the header of one "cuadro" sheet and return its cleaned data rows
def read_cuadro(sheet_num, categoria):
    """Read sheet ``"cuadro {sheet_num}"`` of the workbook and return a tidy frame.

    The header row is the first row in which any cell, after trimming and
    lower-casing, equals an entry of ``header_list``. The row right below it
    is treated as a sub-header: where the header cell is blank or is one of
    the known group labels (e.g. "total", "hombre"), the sub-header value is
    used as the column name instead.

    Args:
        sheet_num: Number used to build the sheet name and stored in the
            ``cuadro`` column.
        categoria: Label stored in the ``categoria`` column.

    Returns:
        A DataFrame with the rows below the two header rows, without fully
        empty rows and without rows whose first cell contains
        "fuente", "nota", "cuadro" or "serie" (case-insensitive); or ``None``
        when no header row is found.

    Raises:
        IndexError: If the header is the last row of the sheet (there is no
            sub-header row to read).
        Exception: Whatever ``pd.read_excel`` raises (e.g. missing sheet).
    """
    sheet = f"cuadro {sheet_num}"
    raw = pd.read_excel(excel_path, sheet_name=sheet, header=None)

    # Find the header row. Cells are stripped as well as lower-cased so that
    # padded labels such as "Departamento " are still recognised.
    header_words = set(header_list)
    header_idx = next(
        (
            i
            for i, cells in enumerate(raw.itertuples(index=False, name=None))
            if any(str(cell).strip().lower() in header_words for cell in cells)
        ),
        None,
    )
    if header_idx is None:
        return None

    header_row = raw.iloc[header_idx].tolist()
    next_row = raw.iloc[header_idx + 1].tolist()

    # Group labels that span several sub-columns; the real column name lives
    # in the sub-header row underneath them.
    group_labels = {
        "año de ocurrencia", "mes de ocurrencia", "tipo de accidente",
        "día de la semana", "total", "hombre", "mujer", "ignorado",
        "grupos de edad",
    }

    # Merge the two header rows into a single list of column names.
    combined_header = []
    for h, n in zip(header_row, next_row):
        label = "" if pd.isna(h) else str(h).strip()
        if label == "" or label.lower() in group_labels:
            combined_header.append(n)
        else:
            combined_header.append(h)

    df = raw.iloc[header_idx + 2:].copy()
    df.columns = combined_header

    # Cleaning: drop empty rows and footnote/title rows.
    df = df.dropna(how="all")
    first_cell = df.iloc[:, 0].astype(str)
    is_footer = first_cell.str.contains("fuente|nota|cuadro|serie", case=False, na=False)
    # Explicit copy: the filtered frame is modified below, and assigning to a
    # boolean-indexed slice triggers SettingWithCopyWarning otherwise.
    df = df[~is_footer].copy()
    df.columns = [str(c).strip() for c in df.columns]

    # Metadata columns
    df["categoria"] = categoria
    df["cuadro"] = sheet_num

    return df

# Read several sheets of one category into a dictionary
def build_dict(rango, categoria):
    """Collect the cleaned tables for every sheet number in ``rango``.

    Each sheet is read with ``read_cuadro``. Sheets for which it returns
    ``None`` are skipped; sheets that raise are reported on stdout and
    skipped, so one bad sheet does not abort the whole batch.

    Args:
        rango: Iterable of sheet numbers.
        categoria: Category label, also used as key prefix.

    Returns:
        dict mapping ``"{categoria}_{sheet number}"`` to the sheet's DataFrame.
    """
    resultado = {}
    for numero in rango:
        try:
            tabla = read_cuadro(numero, categoria)
        except Exception as e:
            print(f"Error en cuadro {numero}: {e}")
            continue
        if tabla is None:
            continue
        resultado[f"{categoria}_{numero}"] = tabla
    return resultado


In [0]:
# One dictionary of cleaned tables per category, keyed "<categoria>_<sheet>".
# NOTE(review): sheets 29 and 30 are not covered by any of these ranges -
# confirm that this is intentional.
hechos_dfs = build_dict(range(1, 17), "hechos")
vehiculos_dfs = build_dict(range(17, 29), "vehiculos")
lesionados_dfs = build_dict(range(31, 47), "lesionados")
fallecidos_dfs = build_dict(range(47, 63), "fallecidos")

# Show the first three keys of each collection as a quick sanity check.
for etiqueta, coleccion in (
    ("Hechos:", hechos_dfs),
    ("Vehículos:", vehiculos_dfs),
    ("Lesionados:", lesionados_dfs),
    ("Fallecidos:", fallecidos_dfs),
):
    print(etiqueta, list(coleccion)[:3])


3. Funciones auxiliares para normalizar columnas y pasar a Spark

In [0]:
def safe_col(name: str) -> str:
    """Turn an arbitrary header value into an identifier-like column name.

    Steps, in order: stringify and trim; drop a trailing ".0" (float-typed
    years such as 2020.0); spell "%" as "pct"; collapse whitespace runs to
    "_"; replace every character outside ``[0-9a-zA-Z_]`` with "_"; prefix
    "y" when the result starts with a digit.
    """
    text = re.sub(r'\.0$', '', str(name).strip()).replace('%', 'pct')
    text = re.sub(r'[^0-9a-zA-Z_]', '_', re.sub(r'\s+', '_', text))
    return f"y{text}" if re.match(r'^\d', text) else text

def make_unique(cols):
    """Return ``cols`` with duplicates disambiguated by a numeric suffix.

    The first occurrence of a name is kept as is; later occurrences become
    ``name_1``, ``name_2``, ... Falsy names (empty string, ``None``) are
    replaced by ``"col"`` first. A suffixed candidate that is already taken
    (e.g. the input itself contains ``"a_1"``) is skipped, so the returned
    names are always pairwise distinct.

    Args:
        cols: Iterable of hashable column names.

    Returns:
        list of names, same length and order as ``cols``.
    """
    counters = {}   # base name -> last suffix number tried
    used = set()    # every name already emitted
    out = []
    for c in cols:
        base = c if c else "col"
        candidate = base
        # Keep bumping the suffix until the name is free; this also covers
        # collisions with names that were literally present in the input.
        while candidate in used:
            counters[base] = counters.get(base, 0) + 1
            candidate = f"{base}_{counters[base]}"
        used.add(candidate)
        out.append(candidate)
    return out

def to_scalar(x):
    """Return ``x`` unchanged, except containers which are stringified.

    Lists, dicts, tuples and sets are converted with ``str`` so that every
    cell holds a scalar value.
    """
    return str(x) if isinstance(x, (list, dict, tuple, set)) else x

# Pipeline: clean a pandas table and convert it to a Spark DataFrame
def df_to_spark(df: pd.DataFrame):
    """Clean ``df`` and return it as a Spark DataFrame with an explicit schema.

    Steps:
      1. Drop every row in which any cell equals "total" (trimmed,
         case-insensitive).
      2. Normalise column names with ``safe_col``, drop columns that are
         entirely empty, and de-duplicate the names with ``make_unique``.
      3. Stringify container-valued cells with ``to_scalar``.
      4. Type each column: if at least 80% of its values parse as numbers it
         becomes a nullable double, otherwise a nullable string.
      5. Replace every missing value (NaN / pd.NA) by ``None`` so Spark
         stores a null.

    Args:
        df: Table as produced by ``read_cuadro``.

    Returns:
        A Spark DataFrame built with the module-level ``spark`` session.
    """
    # Drop rows that contain a 'total' cell.
    mask_total = df.apply(
        lambda r: r.astype(str).str.strip().str.lower().eq("total").any(), axis=1
    )
    df_clean = df.loc[~mask_total].copy()

    df_clean.columns = [safe_col(c) for c in df_clean.columns]
    df_clean = df_clean.dropna(axis=1, how="all")
    df_clean.columns = make_unique(df_clean.columns)

    # DataFrame.applymap is deprecated since pandas 2.1 in favour of
    # DataFrame.map; fall back to applymap only on older versions.
    elementwise = getattr(pd.DataFrame, "map", None) or pd.DataFrame.applymap
    df_clean = elementwise(df_clean, to_scalar)

    # Decide the type of each column and build the Spark schema alongside.
    fields = []
    for c in df_clean.columns:
        ser = pd.to_numeric(df_clean[c], errors="coerce")
        ratio = ser.notna().mean() if len(ser) else 0.0
        if ratio >= 0.8:
            df_clean[c] = ser.astype(float)
            fields.append(StructField(c, DoubleType(), True))
        else:
            df_clean[c] = df_clean[c].astype("string")
            fields.append(StructField(c, StringType(), True))

    # Null replacement has to happen last and on an object-typed frame: the
    # casts above turn missing values back into NaN / pd.NA, which would
    # otherwise reach Spark as NaN doubles (skewing mean/stddev) or as
    # pd.NA objects instead of nulls.
    pdf = df_clean.astype(object)
    pdf = pdf.where(pdf.notna(), None)

    return spark.createDataFrame(pdf, schema=StructType(fields))


4. Verificar años en cada colección

In [0]:
# Years every table is expected to cover.
EXPECTED_YEARS = {2020, 2021, 2022, 2023, 2024}
# A standalone 20xx year (not embedded in a longer digit run), optionally
# followed by ".0" as produced by float-typed headers.
YEAR_RX = re.compile(r'(?<!\d)(20\d{2})(?:\.0)?(?!\d)')

def extract_year_from_col(col):
    """Return the first 20xx year found in ``str(col)`` as int, else ``None``."""
    hit = YEAR_RX.search(str(col))
    if hit is None:
        return None
    return int(hit.group(1))

def detect_years_in_df(df: pd.DataFrame):
    """Return the set of years recognised in the column names of ``df``."""
    years = set()
    for column in df.columns:
        year = extract_year_from_col(column)
        if year:
            years.add(year)
    return years

def report_years_for_dict(dfs, title):
    """Print, for each table in ``dfs``, which years its columns cover.

    For every entry one line is printed with the years found, the expected
    years that are missing, and the found years outside ``EXPECTED_YEARS``.

    Args:
        dfs: Mapping of table key to pandas DataFrame.
        title: Heading printed before the per-table lines.
    """
    print(f"\n===== {title} =====")
    for key, table in dfs.items():
        found = detect_years_in_df(table)
        # Set differences give both directions of the mismatch.
        missing = sorted(EXPECTED_YEARS - found)
        outside = sorted(found - EXPECTED_YEARS)
        print(f"{key}: {sorted(found) if found else '—'} | faltantes={missing or 'ninguno'} | fuera={outside or 'ninguno'}")

# Run the year-coverage report for each collection.
for coleccion, titulo in (
    (hechos_dfs, "hechos"),
    (vehiculos_dfs, "vehiculos"),
    (lesionados_dfs, "lesionados"),
    (fallecidos_dfs, "fallecidos"),
):
    report_years_for_dict(coleccion, titulo)


5. Ejemplo: mostrar Spark DF con estadísticas

In [0]:
# Example using the table stored under "hechos_1"
sdf_hechos = df_to_spark(hechos_dfs["hechos_1"])
print("Total registros:", sdf_hechos.count())

sdf_hechos.show(10, truncate=False)

# describe() and summary() restricted to the numeric columns
num_cols = [
    field.name
    for field in sdf_hechos.schema.fields
    if isinstance(field.dataType, NumericType)
]
if num_cols:
    numeric_view = sdf_hechos.select(*num_cols)
    numeric_view.describe().show()
    numeric_view.summary("count","mean","stddev","min","25%","50%","75%","max").show()
