1. Importar librerías y definir funciones base

In [0]:
%pip install openpyxl
%pip install pyreadstat

In [0]:
%restart_python

## Carga desde archivo y construcción dicts

In [0]:
import os
import pandas as pd
import pyreadstat
from itertools import chain

directories = [
    "/Volumes/workspace/default/fallecidos_lesionados/",
    "/Volumes/workspace/default/hechos_transito/",
    "/Volumes/workspace/default/vehiculos_involucrados/"
]


def to_csv_path(file_path):
    """Return ``file_path`` with its final extension replaced by ``.csv``.

    Only the trailing extension is swapped, whatever its letter case, so
    ``DATA.XLSX`` becomes ``DATA.csv`` and directory names that happen to
    contain ``.xlsx`` / ``.sav`` are left untouched.
    """
    root, _ext = os.path.splitext(file_path)
    return root + ".csv"


def _read_xlsx(file_path):
    """Load an Excel workbook (first sheet) as a pandas DataFrame."""
    return pd.read_excel(file_path)


def _read_sav(file_path):
    """Load an SPSS .sav file as a pandas DataFrame (metadata is discarded)."""
    frame, _meta = pyreadstat.read_sav(file_path)
    return frame


# Lower-cased extension -> (label used in the log messages, reader function).
_CONVERTERS = {
    ".xlsx": ("XLSX", _read_xlsx),
    ".sav": ("SAV", _read_sav),
}

for directory in directories:
    try:
        files = [f.name for f in dbutils.fs.ls(directory)]
    except Exception as e:
        print(f"Error al procesar directorio {directory}: {e}")
        continue

    for file in files:
        extension = os.path.splitext(file)[1].lower()
        if extension not in _CONVERTERS:
            continue

        label, reader = _CONVERTERS[extension]
        file_path = os.path.join(directory, file)
        # Derive the target from the real extension: a case-sensitive
        # str.replace would leave "X.XLSX" unchanged, and the "converted" file
        # would then overwrite the source and be deleted right after.
        csv_path = to_csv_path(file_path)
        try:
            df = reader(file_path)
            df.to_csv(csv_path, index=False)
            print(f"Convertido {label} → CSV: {file_path} → {csv_path}")

            # Remove the original only after the CSV was written successfully.
            dbutils.fs.rm(file_path)
            print(f"Borrado {label}: {file_path}")
        except Exception as e:
            print(f"Error con {label} {file_path}: {e}")



Unir TODOS los CSV de una carpeta en un solo DataFrame (opción simple: asume que todos los CSV de la carpeta comparten las mismas columnas)

In [0]:
from pyspark.sql import functions as F
DIR_HECHOS = "/Volumes/workspace/default/hechos_transito"
DIR_VEHICULOS = "/Volumes/workspace/default/vehiculos_involucrados"
DIR_FALLECIDOS = "/Volumes/workspace/default/fallecidos_lesionados"


def _read_raw_csvs(directory):
    """Read every top-level ``*.csv`` in ``directory`` into one Spark DataFrame.

    The header row is used for column names, types are inferred, and a
    ``source_file`` column holding the originating file path is appended.

    NOTE(review): this is the "simple option" — one glob read, which
    presumably only lines up when all CSVs share the same columns; confirm
    against the actual yearly files.
    """
    reader = spark.read.option("header", True).option("inferSchema", True)
    loaded = reader.csv(f"{directory}/*.csv")
    return loaded.withColumn("source_file", F.col("_metadata.file_path"))


# Traffic events.
hechos_raw = _read_raw_csvs(DIR_HECHOS)
display(hechos_raw.limit(5))
hechos_raw.printSchema()

# Vehicles involved.
vehiculos_raw = _read_raw_csvs(DIR_VEHICULOS)
display(vehiculos_raw.limit(5))
vehiculos_raw.printSchema()

# Deceased / injured people.
fallecidos_raw = _read_raw_csvs(DIR_FALLECIDOS)
display(fallecidos_raw.limit(5))
fallecidos_raw.printSchema()

In [0]:
from pyspark.sql import functions as F, types as T

# 1) Column aliases -> canonical name.
# Each key is the canonical column name; its list holds the source column
# names that are renamed to it. Order matters: the first alias found in a
# DataFrame wins, and matching is case-insensitive (see first_present).
ALIASES = {
    "num_hecho":      ["num_hecho", "num", "num_correl", "num_corre", "n_um_corre", "num_correlativo", "núm_corre"],
    "anio":           ["anio", "ano_ocu", "año_ocu", "anio_ocu", "ano", "año"],
    "mes":            ["mes", "mes_ocu"],
    "dia":            ["dia", "día", "dia_ocu", "día_ocu"],
    "dia_sem":        ["dia_sem_ocu", "día_sem_ocu", "dia_sem", "día_sem"],
    "hora":           ["hora", "hora_ocu"],
    "g_hora":         ["g_hora", "g_hora_5"],
    "depto":          ["depto_ocu", "depto"],
    "mupio":          ["mupio_ocu", "muni_ocu", "municipio", "mupio"],
    "zona":           ["zona_ocu", "zona"],
    "area":           ["areag_ocu", "area_ocu", "area"],
    "tipo_accidente": ["tipo_eve", "tipo_evento", "tipo", "tipo_acc", "tipo_accidente"],
    "causa_acc":      ["causa_acc", "causa"],
    "sexo_pil":       ["sexo_pil", "sexo_piloto", "sexo", "sexo_per"],
    "edad_pil":       ["edad_pil", "edad", "edad_piloto", "edad_per"],
    "g_edad":         ["g_edad_2", "g_edad", "g_edad_80ymás", "g_edad_60ymás", "edad_quinquenales"],
    "mayor_menor":    ["mayor_menor"],
    "tipo_veh":       ["tipo_veh", "tipo_vehiculo"],
    "marca_veh":      ["marca_veh", "marca"],
    "color_veh":      ["color_veh", "color"],
    "modelo_veh":     ["modelo_veh", "modelo"],
    "g_modelo_veh":   ["g_modelo_veh"],
    "estado_pil":     ["estado_pil", "estado_piloto", "estado", "estado_con", "fall_les"],
    "intencionalidad":["int_o_noint"],
    "source_file":    ["source_file", "_source_file"]
}

def first_present(colnames, candidates):
    """Return the column in ``colnames`` matching the earliest candidate.

    Matching is case-insensitive; the returned value keeps the spelling used
    in ``colnames``. When two columns differ only by case, the later one is
    the one returned. Returns ``None`` when no candidate is present.
    """
    by_lower = {}
    for name in colnames:
        by_lower[name.lower()] = name  # later duplicates overwrite earlier ones
    hits = (by_lower[cand.lower()] for cand in candidates if cand.lower() in by_lower)
    return next(hits, None)

def canonize(df):
    """Rename the alias columns of ``df`` to the canonical names in ALIASES.

    For every canonical name, the first alias present in ``df`` (looked up
    case-insensitively) is renamed to it. A rename is skipped when the
    canonical name already exists as a *different* column, because renaming
    onto it would leave two columns with the same name and make later
    references to that name ambiguous.

    Returns the DataFrame with the renames applied; columns without an alias
    entry are left as they are.
    """
    existing_lower = {c.lower() for c in df.columns}

    # Build the mapping existing column -> canonical name.
    mapping = {}
    for canon, cands in ALIASES.items():
        src = first_present(df.columns, cands)
        if not src or src == canon:
            continue
        is_case_only_rename = src.lower() == canon.lower()
        if canon.lower() in existing_lower and not is_case_only_rename:
            # Canonical column is already there; keep it and leave the alias alone.
            continue
        mapping[src] = canon

    # Apply the renames.
    for src, dst in mapping.items():
        df = df.withColumnRenamed(src, dst)

    return df

# Standardize column names on the three raw tables.
hechos_std, fallecidos_std, vehiculos_std = (
    canonize(raw_df) for raw_df in (hechos_raw, fallecidos_raw, vehiculos_raw)
)


In [0]:
from pyspark.sql import functions as F
from itertools import chain

def add_anio_from_source(df):
    """Add an ``anio`` column taken from ``source_file`` when it is missing.

    If ``df`` already has ``anio``, or has no ``source_file`` column to derive
    it from, ``df`` is returned unchanged. Otherwise ``anio`` is the first
    ``20xx`` group found in ``source_file``, cast to int (NULL when the path
    contains no such group).
    """
    if "anio" in df.columns or "source_file" not in df.columns:
        return df
    return df.withColumn(
        "anio",
        F.regexp_extract("source_file", r"(20\d{2})", 1).cast("int")
    )


# Rebind each table explicitly: assigning through locals()[name] is not a
# supported way to update variables and only works by accident at cell scope.
hechos_std = add_anio_from_source(hechos_std)
fallecidos_std = add_anio_from_source(fallecidos_std)
vehiculos_std = add_anio_from_source(vehiculos_std)


# Numeric department code -> department name.
CODIGOS_DEPARTAMENTOS = {
    1:"Guatemala",2:"El Progreso",3:"Sacatepéquez",4:"Chimaltenango",5:"Escuintla",6:"Santa Rosa",
    7:"Sololá",8:"Totonicapán",9:"Quetzaltenango",10:"Suchitepéquez",11:"Retalhuleu",12:"San Marcos",
    13:"Huehuetenango",14:"Quiché",15:"Baja Verapaz",16:"Alta Verapaz",17:"Petén",18:"Izabal",
    19:"Zacapa",20:"Chiquimula",21:"Jalapa",22:"Jutiapa"
}
# Department name as it appears in the sources -> name with proper accents.
# Covers the unaccented spelling and the variant where the accented letter
# was replaced by '?'.
EQUIVALENCIAS_DEPARTAMENTOS = {
    "Guatemala":"Guatemala","Alta Verapaz":"Alta Verapaz","Baja Verapaz":"Baja Verapaz",
    "Chimaltenango":"Chimaltenango","Chiquimula":"Chiquimula","El Progreso":"El Progreso",
    "Escuintla":"Escuintla","Huehuetenango":"Huehuetenango","Izabal":"Izabal",
    "Jalapa":"Jalapa","Jutiapa":"Jutiapa","Peten":"Petén","Pet?n":"Petén","Petén":"Petén",
    "Quiche":"Quiché","Quich?":"Quiché","Quiché":"Quiché","Quetzaltenango":"Quetzaltenango",
    "Retalhuleu":"Retalhuleu","Sacatepequez":"Sacatepéquez","Sacatep?quez":"Sacatepéquez",
    "Sacatepéquez":"Sacatepéquez","San Marcos":"San Marcos","Santa Rosa":"Santa Rosa",
    "Solola":"Sololá","Solol?":"Sololá","Sololá":"Sololá","Suchitepequez":"Suchitepéquez",
    "Suchitep?quez":"Suchitepéquez","Suchitepéquez":"Suchitepéquez","Totonicapan":"Totonicapán",
    "Totonicap?n":"Totonicapán","Totonicapán":"Totonicapán","Zacapa":"Zacapa"
}

# Spark map literals built from the dicts above. create_map takes a flat
# key, value, key, value, ... sequence, hence the chain(*items). The numeric
# codes are turned into strings so they can be looked up with string keys.
map_cod  = F.create_map([F.lit(x) for x in chain(*{str(k):v for k,v in CODIGOS_DEPARTAMENTOS.items()}.items())])
map_name = F.create_map([F.lit(x) for x in chain(*EQUIVALENCIAS_DEPARTAMENTOS.items())])

def decodificar_depto(df, col="depto"):
    """Replace department codes or raw names in ``col`` with the official name.

    Values are cast to string and trimmed. Once a trailing ``.0`` is removed
    (e.g. ``'1.0'`` -> ``'1'``), values made only of digits are looked up in
    ``map_cod``; anything else is looked up in ``map_name``. Values found in
    neither map (including NULL and empty strings) become ``"Desconocido"``.

    Returns ``df`` unchanged when ``col`` is not one of its columns.
    """
    if col not in df.columns:
        return df
    # Trim first so padded codes/names (" 1", "Guatemala ") still match.
    raw = F.trim(F.col(col).cast("string"))
    # Drop a trailing ".0", ".00", ... left by float-typed codes.
    code = F.regexp_replace(raw, "\\.0+$", "")
    # Removing every digit leaves an empty string only for purely numeric values.
    non_digits = F.translate(code, "0123456789", "")
    nombre = (
        F.when(F.length(non_digits) == 0, map_cod.getItem(code))
         .otherwise(map_name.getItem(raw))
    )
    return df.withColumn(col, F.coalesce(nombre, F.lit("Desconocido")))


# Decode the department column on every standardized table.
hechos_std, vehiculos_std, fallecidos_std = (
    decodificar_depto(table, "depto")
    for table in (hechos_std, vehiculos_std, fallecidos_std)
)

In [0]:
from pyspark.sql import functions as F, types as T

# Columns that are coerced to integers by normalize_int_columns.
# "depto" is intentionally NOT listed: by this point decodificar_depto has
# already replaced it with the department name, and forcing it through the
# integer coercion would turn every name into NULL.
INT_COLS = [
    "anio","mes","dia","dia_sem","hora","g_hora","mupio","zona","area",
    "tipo_accidente","causa_acc","sexo_pil","edad_pil","g_edad","mayor_menor",
    "tipo_veh","marca_veh","color_veh","modelo_veh","g_modelo_veh","estado_pil"
]

# Text tokens treated as missing values (compared case-insensitively by
# to_int_strict, so the upper/lower-case variants are redundant but harmless).
TOKENS_NULOS_LIST = [
    "ignorada","na","n/a","s/d","sd","sin dato",""," ",
    "Ignorada","IGNORADA","NA","N/A","S/D","SD","Sin dato","SIN DATO"
]

# True: coerce ranges such as "2010-2019" to their first year (2010).
# False: leave them untouched so they end up as NULL.
COERCE_RANGES_TO_START = True

def to_int_strict(colname: str):
    """Build a Spark column expression that parses ``colname`` as an integer.

    Steps, applied to the trimmed string form of the column:
      1. values listed in TOKENS_NULOS_LIST (case-insensitive) become NULL;
      2. a trailing ``.0`` / ``.00`` ... is removed;
      3. when COERCE_RANGES_TO_START is true, ``"YYYY-YYYY"`` is reduced to
         its first year;
      4. anything that is not an optional leading ``-`` followed by digits
         becomes NULL.

    Returns the resulting expression cast to IntegerType.
    """
    null_tokens = [t.lower() for t in TOKENS_NULOS_LIST]

    s = F.trim(F.col(colname).cast("string"))
    s = F.when(F.lower(s).isin(null_tokens), None).otherwise(s)
    # Remove trailing .0, .00, ...
    s = F.regexp_replace(s, r"\.0+$", "")
    if COERCE_RANGES_TO_START:
        # Spark uses Java regex replacement syntax: the group reference is
        # "$1". A "\1" replacement is a literal "1", which would turn
        # "2010-2019" into 1 instead of 2010.
        s = F.regexp_replace(s, r"^(\d{4})-\d{4}$", "$1")

    # Keep only valid integers: optional leading '-' followed by digits.
    s = F.when(s.rlike(r"^-?\d+$"), s).otherwise(None)
    return s.cast(T.IntegerType())

def normalize_int_columns(df, int_cols):
    """Coerce the columns of ``df`` listed in ``int_cols`` to integers.

    Only names that exist in ``df`` are touched; each is replaced by
    ``to_int_strict(name)``. Afterwards, when ``modelo_veh`` exists, the value
    9999 in that column is replaced by NULL.
    """
    result = df
    present = [name for name in int_cols if name in df.columns]
    for name in present:
        result = result.withColumn(name, to_int_strict(name))

    if "modelo_veh" not in result.columns:
        return result

    # Specific rule: 9999 in modelo_veh -> NULL.
    modelo = F.col("modelo_veh")
    return result.withColumn(
        "modelo_veh",
        F.when(modelo == 9999, None).otherwise(modelo),
    )

# Apply the integer normalization to every standardized table.
hechos_std, vehiculos_std, fallecidos_std = (
    normalize_int_columns(table, INT_COLS)
    for table in (hechos_std, vehiculos_std, fallecidos_std)
)


In [0]:
# Output locations of the cleaned ("silver") tables.
OUT_SILVER_HECHOS     = "/Volumes/workspace/default/hechos_transito/silver"
OUT_SILVER_VEHICULOS  = "/Volumes/workspace/default/vehiculos_involucrados/silver"
OUT_SILVER_FALLECIDOS = "/Volumes/workspace/default/fallecidos_lesionados/silver"

# Both lists must stay in the same order: dfs[i] is written to OUTS[i].
dfs = [hechos_std, vehiculos_std, fallecidos_std]
OUTS = [OUT_SILVER_HECHOS, OUT_SILVER_VEHICULOS, OUT_SILVER_FALLECIDOS]

for df, outp in zip(dfs, OUTS):
    # Single output file per table, replacing any previous run.
    writer = df.coalesce(1).write.mode("overwrite")
    writer.parquet(outp)
    print("OK →", outp)


Helpers reutilizables

### Preguntas a responder
1. Contar registros por tabla (long)

## #1 – Conteos, .show(), describe y summary (por tabla)

In [0]:
from pyspark.sql.types import NumericType

# Reload the silver tables from the Parquet output written earlier.
hechos_silver, vehiculos_silver, fallecidos_silver = (
    spark.read.parquet(path)
    for path in (OUT_SILVER_HECHOS, OUT_SILVER_VEHICULOS, OUT_SILVER_FALLECIDOS)
)

def ae_conteos_describe_summary(nombre: str, sdfs: dict, n_show: int = 5):
    """Print row counts plus a sample and numeric statistics for a table group.

    Args:
        nombre: label used in the printed headers.
        sdfs: mapping of name -> Spark DataFrame; every entry is counted.
        n_show: number of rows printed for the sample.

    The sample, ``describe`` and ``summary`` output are produced only for the
    first entry of ``sdfs``, and the statistics only when that DataFrame has
    numeric columns. Nothing is returned.
    """
    print(f"\n===== {nombre.upper()} =====")
    counts = {}
    for key, sdf in sdfs.items():
        counts[key] = sdf.count()
        print(f"{key:20s} -> {counts[key]:6d} registros")
    print(f"TOTAL {nombre}: {sum(counts.values())}")

    if not sdfs:
        return

    # The first entry is used as the representative table.
    first_key = next(iter(sdfs))
    sample = sdfs[first_key]
    print(f"\n--- Ejemplo .show() :: {first_key} ---")
    sample.show(n_show, truncate=False, vertical=True)

    num_cols = [
        field.name
        for field in sample.schema.fields
        if isinstance(field.dataType, NumericType)
    ]
    if not num_cols:
        print(f"\n--- {first_key}: no hay columnas numéricas detectadas ---")
        return

    print(f"\n--- describe(numéricas) :: {first_key} ---")
    sample.select(*num_cols).describe().show(truncate=False, vertical=True)

    print(f"\n--- summary(numéricas) :: {first_key} ---")
    stats = ("count", "mean", "stddev", "min", "25%", "50%", "75%", "max")
    sample.select(*num_cols).summary(*stats).show(truncate=False, vertical=True)

# One report per silver table.
for _label, _sdf in (
    ("hechos", hechos_silver),
    ("vehiculos", vehiculos_silver),
    ("fallecidos", fallecidos_silver),
):
    ae_conteos_describe_summary(_label, {_label: _sdf})

## #2 – Años disponibles por tabla y validación

In [0]:
import re

EXPECTED_YEARS = set(range(2013, 2024))  # 2013..2023
YEAR_RX = re.compile(r'(?<!\d)(20\d{2})(?:\.0)?(?!\d)')

def extract_year_from_col(colname: str) -> int | None:
    m = YEAR_RX.search(str(colname))
    if not m: return None
    try:
        return int(m.group(1))
    except:
        return None

def detect_years_in_sdf(df) -> set[int]:
    """Detect the years covered by a Spark DataFrame.

    Two sources are combined:
      * years embedded in column names (wide layout, one column per year);
      * the distinct values of an ``anio`` column, when the DataFrame has one
        (long layout, one row per record).

    Returns the set of years found; empty when neither source yields any.
    """
    years = set()
    for c in df.columns:
        y = extract_year_from_col(c)
        if y:
            years.add(y)

    # Long layout: the years are stored as data, not in the column names.
    anio_col = next((c for c in df.columns if c.lower() == "anio"), None)
    if anio_col is not None:
        for row in df.select(anio_col).distinct().collect():
            value = row[0]
            if value is None:
                continue
            y = extract_year_from_col(value)
            if y:
                years.add(y)
    return years

def report_years_df(df, titulo: str):
    """Print which years were detected in ``df`` versus EXPECTED_YEARS.

    The report lists the years found, the expected years that are missing and
    the detected years that fall outside the expected range. Returns the set
    of detected years.
    """
    print(f"\n===== Verificación de años: {titulo} =====")
    found = detect_years_in_sdf(df)

    missing = sorted(EXPECTED_YEARS.difference(found))
    outside = sorted(found.difference(EXPECTED_YEARS))

    found_txt = sorted(found) if found else '—'
    missing_txt = missing or 'ninguno'
    outside_txt = outside or 'ninguno'
    print(f"Encontrados: {found_txt} | "
          f"faltantes vs 2013–2023: {missing_txt} | "
          f"fuera de rango: {outside_txt}")
    return found

# Year coverage of each silver table.
years_hechos     = report_years_df(hechos_silver, "hechos")
years_vehiculos  = report_years_df(vehiculos_silver, "vehiculos")
years_fallecidos = report_years_df(fallecidos_silver, "fallecidos")

# Years present in all three tables.
print("\n¿Coinciden los conjuntos de años (intersección)?")
intersection_all = set.intersection(years_hechos, years_vehiculos, years_fallecidos)
common_txt = sorted(intersection_all) if intersection_all else "—"
print("Intersección común:", common_txt)

### #3 – Valores distintos de 'tipo de accidente'   (buscamos columnas candidatas por nombre aproximado)

In [0]:
import re, unicodedata
from pyspark.sql import functions as F
from pyspark.sql.types import StringType, DoubleType, StructType, StructField

# --- normalizadores ---
def _norm_text(s):
    if s is None: return ""
    s = str(s).strip().lower()
    s = unicodedata.normalize("NFD", s)
    s = "".join(ch for ch in s if unicodedata.category(ch) != "Mn")  # quita acentos
    s = re.sub(r"\s+", " ", s)
    return s

def _norm_colname(s):
    if s is None: return ""
    t = str(s).strip().lower()
    t = (t.replace("á","a").replace("é","e").replace("í","i")
           .replace("ó","o").replace("ú","u").replace("ñ","n"))
    t = re.sub(r"[\s\-]+", "_", t)
    return t

# --- canonical catalog ---
# CANON lists the accident categories reported in the tidy output, in order.
# MAP_NORM_TO_CANON maps a normalized label (lower-case, no accents, as
# produced by _norm_text) to one of those categories.
CANON = ["colision","Atropello","Derrape","Choque","Vuelco","Embarrancó","Encunetó","Caída","Ignorado"]
MAP_NORM_TO_CANON = {
    "colision":"colision","colisiones":"colision","colision multiple":"colision",
    "atropello":"Atropello","atropellos":"Atropello",
    "derrape":"Derrape","derrapes":"Derrape",
    "choque":"Choque","choques":"Choque",
    "vuelco":"Vuelco","vuelcos":"Vuelco",
    "embarranco":"Embarrancó","embarranco multiple":"Embarrancó",
    "encuneto":"Encunetó","encunetamiento":"Encunetó",
    "caida":"Caída","caidas":"Caída",
    "ignorado":"Ignorado","desconocido":"Ignorado","no especificado":"Ignorado",
    "sin dato":"Ignorado","na":"Ignorado","n/a":"Ignorado",
}

# --- UDF Spark para canonizar ---
@F.udf(returnType=StringType())
def canon_val_udf(val):
    """Spark UDF mapping a raw accident label to its canonical category.

    NULL input and labels that normalize to an empty string give NULL; any
    label not present in MAP_NORM_TO_CANON is reported as "Ignorado".
    """
    normalized = "" if val is None else _norm_text(val)
    if normalized == "":
        return None
    return MAP_NORM_TO_CANON.get(normalized, "Ignorado")

# --- standard schema ---
# Layout of the tidy accident tables: one row per (sheet, accident category)
# with its total as a double. Also used to build empty results.
SCHEMA_ACCIDENTES = StructType([
    StructField("sheet", StringType(), True),
    StructField("accidente", StringType(), True),
    StructField("count", DoubleType(), True),
])

# --- funciones auxiliares ---
def _find_acc_col(sdf):
    """Return the first column that looks like ``tipo_de_accidente`` (long layout).

    A column qualifies when its normalized name starts with ``tipo_de_accid``
    (which includes the exact name ``tipo_de_accidente``). Returns ``None``
    when there is no such column.

    NOTE(review): the canonical name ``tipo_accidente`` produced by
    ``canonize`` does not match this prefix — confirm whether that is intended.
    """
    matches = (
        name for name in sdf.columns
        if _norm_colname(name).startswith("tipo_de_accid")
    )
    return next(matches, None)

def accidentes_tidy_spark(sdf, sheet_name: str):
    """Build a tidy ``(sheet, accidente, count)`` DataFrame from ``sdf``.

    Two layouts are handled:

    * long — a ``tipo_de_accidente``-like column exists: rows are grouped by
      the canonical accident category and ``count`` is the sum, over the
      group, of all numeric columns added together (NULLs counted as 0);
    * wide — no such column: every column whose normalized name is a key of
      MAP_NORM_TO_CANON is totalled, and one row per CANON category is
      returned (0.0 for categories without a column).

    An empty DataFrame with SCHEMA_ACCIDENTES is returned when the long
    layout has no numeric columns or the wide layout has no category columns.
    """
    # --- long layout ---
    tipo_col = _find_acc_col(sdf)
    if tipo_col:
        # Candidate numeric columns.
        num_cols = [c for c, t in sdf.dtypes
                    if t in ("int", "bigint", "double", "float") and c != tipo_col]
        if not num_cols:
            return spark.createDataFrame([], schema=SCHEMA_ACCIDENTES)
        # Row-wise total of the numeric columns, NULLs treated as 0.
        total_expr = sum(
            F.coalesce(F.col(c).cast(DoubleType()), F.lit(0.0)) for c in num_cols
        )
        return (sdf.withColumn("__tipo__", canon_val_udf(F.col(tipo_col)))
                   .groupBy("__tipo__")
                   .agg(F.sum(total_expr).alias("count"))
                   .withColumnRenamed("__tipo__", "accidente")
                   .withColumn("sheet", F.lit(sheet_name))
                   .select("sheet", "accidente", "count"))

    # --- wide layout ---
    # Group the source columns by the canonical category they map to; several
    # columns (e.g. "colision" and "colisiones") may feed the same category.
    cols_by_canon = {}
    for c in sdf.columns:
        canon = MAP_NORM_TO_CANON.get(_norm_text(c))
        if canon is not None:
            cols_by_canon.setdefault(canon, []).append(c)
    if not cols_by_canon:
        return spark.createDataFrame([], schema=SCHEMA_ACCIDENTES)

    # Alias every aggregate explicitly: groupBy().sum() would name its output
    # columns "sum(<col>)", so looking totals up by category would never match.
    aggs = [
        F.sum(
            sum(F.coalesce(F.col(c).cast(DoubleType()), F.lit(0.0)) for c in cols)
        ).alias(canon)
        for canon, cols in cols_by_canon.items()
    ]
    totals = sdf.agg(*aggs).collect()[0].asDict()
    # F.sum over zero rows is NULL, hence the "or 0.0".
    tidy_list = [(sheet_name, k, float(totals.get(k) or 0.0)) for k in CANON]
    return spark.createDataFrame(tidy_list, schema=SCHEMA_ACCIDENTES)

# --- USO ---
# Tidy accident tables, one per silver table.
acc_hechos_sdf, acc_vehiculos_sdf, acc_fallecidos_sdf = (
    accidentes_tidy_spark(table, label)
    for table, label in (
        (hechos_silver, "hechos"),
        (vehiculos_silver, "vehiculos"),
        (fallecidos_silver, "fallecidos"),
    )
)

# --- Example queries ---
print("\nTop 10 (hechos) por count:")
(acc_hechos_sdf
 .groupBy("accidente")
 .agg(F.sum("count").alias("total"))
 .orderBy(F.desc("total"))
 .show(10, truncate=False))

print("\nHojas que sólo tuvieron 'Ignorado' (vehículos):")
from functools import reduce
# Every category other than "Ignorado" must be absent (NULL) or zero.
conds = [F.col(c).isNull() | (F.col(c)==0) for c in CANON if c!="Ignorado"]
(acc_vehiculos_sdf
 .groupBy("sheet")
 # Pass the pivot values explicitly so a column exists for every category;
 # otherwise categories missing from the data (or an empty input) make the
 # F.col(...) references below fail to resolve.
 .pivot("accidente", CANON)
 .agg(F.sum("count"))
 .where(F.col("Ignorado").isNotNull() & reduce(lambda a,b: a & b, conds))
 .select("sheet","Ignorado")
 .show(truncate=False))


### #4 – # de departamentos únicos por base (detecta columna 'departamento' aproximada)

In [0]:
def count_departamentos_unique(dict_sdfs: dict, titulo: str,
                               dep_cands=("departamento","departamentos","depto","dept","depart","depar")):
    print(f"\n===== Departamentos únicos :: {titulo} =====")
    deptos = set()
    cols_encontradas = 0

    # Aux: si no tienes ya definida find_first_column, deja esto aquí
    def find_first_column(candidates: list[str], columns: list[str]) -> str | None:
        for col in columns:
            low = col.lower()
            for cand in candidates:
                if cand in low:  # match por substring
                    return col
        return None

    for key, sdf in dict_sdfs.items():
        col = find_first_column(list(dep_cands), sdf.columns)
        if not col:
            continue
        cols_encontradas += 1

        vals = (sdf
                .select(F.col(col).cast("string").alias("departamento"))
                .where(F.col("departamento").isNotNull() & (F.col("departamento") != ""))
                .distinct()
                .toPandas()["departamento"])

        for v in vals:
            if v is not None:
                deptos.add(str(v).strip())

    print(f"Total únicos (unión de hojas): {len(deptos)}")
    print(f"Columnas 'departamento' detectadas en {cols_encontradas} hojas")
    if deptos:
        print("Ejemplos:", sorted(list(deptos))[:15])
# Use the silver tables loaded above: the *_sdfs dictionaries are not defined
# anywhere in this notebook, so the calls failed on a fresh run. Injured and
# deceased people share a single table, so there is no separate
# "lesionados" call.
count_departamentos_unique({"hechos": hechos_silver}, "hechos")
count_departamentos_unique({"vehiculos": vehiculos_silver}, "vehiculos")
count_departamentos_unique({"fallecidos_lesionados": fallecidos_silver}, "fallecidos")


5. ¿Cuál es el total de accidentes por año y departamento?

In [0]:
# Each row of the events table is counted once per (year, department) pair.
_total_accidentes = F.count("*").alias("total_accidentes")
acc_por_anio_dep = hechos_silver.groupBy("anio", "depto").agg(_total_accidentes)

display(acc_por_anio_dep)

Databricks visualization. Run in Databricks to view.

6. ¿Qué día de la semana registra más accidentes en 2024? (se analiza 2023 porque no hay datos de 2024)

In [0]:
# 2023 is used instead of 2024 because no 2024 data is available.
# Reads the silver table (like the previous question), compares the integer
# year against an integer, and sorts so the busiest weekday comes first.
(hechos_silver
 .where(F.col("anio") == 2023)
 .groupBy("dia_sem")
 .count()
 .orderBy(F.desc("count"))
 .display())

Databricks visualization. Run in Databricks to view.