# Laboratorio 8 - Transformaciones con Spark

#### Edwin Ortega 22305
#### Esteban Zambrano 22119
#### Diego García 22404

%md
### Carga de Datos y Análisis Exploratorio

%md
##### Instalación a tener en cuenta

In [0]:
%pip install openpyxl

%md
### Convertir cada Excel a CSV (solo la primera hoja)
##### Rutas, imports y utilidades 

Se deben mover los archivos Excel de la carpeta 'Data' a un volumen de Databricks y cambiar las rutas de abajo por las propias.

In [0]:
import os
import pandas as pd
from pathlib import Path

# Root of the Databricks volume that holds the lab data; change it to your own volume.
BASE_DIR = "/Volumes/workspace/default/lab8-ds"

# One sub-folder per dataset family; the same names are used under BASE_DIR
# (Excel input) and under OUTPUT_ROOT (CSV output).
SUBFOLDERS = [
    "fallecidos-lesionados",
    "hechos",
    "vehiculos",
]

# Directory that receives the generated CSV files.
OUTPUT_ROOT = os.path.join(BASE_DIR, "csv")

In [0]:
# Create the CSV output root plus one sub-folder per dataset family.
# Existing folders are left untouched (exist_ok=True).
Path(OUTPUT_ROOT).mkdir(parents=True, exist_ok=True)
for family in SUBFOLDERS:
    Path(OUTPUT_ROOT, family).mkdir(parents=True, exist_ok=True)

# Echo the effective configuration so the paths can be checked at a glance.
for label, value in (
    ("Base:", BASE_DIR),
    ("Salida:", OUTPUT_ROOT),
    ("Subcarpetas:", SUBFOLDERS),
):
    print(label, value)


In [0]:
# Convert every Excel workbook found under BASE_DIR/<subfolder> into a CSV with the
# same base name under OUTPUT_ROOT/<subfolder>.
#   manifest: (subfolder, source path, csv path) for every successful conversion
#   errors:   (source path, repr(exception)) for every workbook that failed
manifest, errors = [], []

for sub in SUBFOLDERS:
    in_dir = Path(BASE_DIR) / sub
    out_dir = Path(OUTPUT_ROOT) / sub

    # A missing input folder would make iterdir() raise and abort the whole run;
    # report it and move on, as the CSV discovery cell does for its own folders.
    if not in_dir.is_dir():
        print(f"⚠️ No existe: {in_dir}")
        continue

    for f in sorted(in_dir.iterdir()):
        if not f.is_file():
            continue
        if f.suffix.lower() not in {".xlsx", ".xls"}:
            continue

        try:
            # Only the FIRST sheet of each workbook is read
            df = pd.read_excel(f, sheet_name=0)
            # Same base name as the workbook, with a .csv extension
            csv_name = f.with_suffix(".csv").name
            csv_path = out_dir / csv_name

            # Write the CSV without the pandas index column
            df.to_csv(csv_path, index=False)
            manifest.append((sub, str(f), str(csv_path)))
        except Exception as e:
            # Best effort: one unreadable workbook must not stop the others
            errors.append((str(f), repr(e)))

print(f"CSV generados: {len(manifest)}")
if errors:
    print(f"Archivos con error: {len(errors)} (muestra 5)")
    for p, err in errors[:5]:
        print("-", p, "->", err)

##### Limpieza de datos

In [0]:
INPUT_ROOT = Path(BASE_DIR) / "csv"

# Gather (subfolder, path) pairs for every CSV under INPUT_ROOT/<subfolder>,
# in sorted order inside each folder. Missing folders are reported and skipped.
csv_files = []
for sub in SUBFOLDERS:
    folder = INPUT_ROOT / sub
    if folder.exists():
        csv_files.extend(
            (sub, entry)
            for entry in sorted(folder.iterdir())
            if entry.is_file() and entry.suffix.lower() == ".csv"
        )
    else:
        print(f"⚠️ No existe: {folder}")

print(f"Archivos CSV encontrados: {len(csv_files)}")
# Preview of the first ten files found
for position, (family, csv_path) in enumerate(csv_files[:10], start=1):
    print(f"{position:02d}. [{family}] {csv_path.name}")


In [0]:
# Estandarización de datos

import unicodedata
import re

def strip_accents_lower(text: str) -> str:
    """Lowercase ``text``, drop accents/diaereses and collapse whitespace runs.

    The value is converted with ``str()``, lowercased, trimmed, decomposed with
    NFKD and reduced to ASCII (characters with no ASCII form are dropped), and
    every run of whitespace becomes a single space.

    Missing values (``None`` or a float NaN, which is how pandas represents an
    empty cell) are returned as ``None`` instead of becoming the text "nan",
    so later ``isna``/``fillna`` steps still recognise them as missing.
    """
    # NaN is the only float that is not equal to itself
    if text is None or (isinstance(text, float) and text != text):
        return None
    s = str(text).lower().strip()
    s = unicodedata.normalize("NFKD", s)
    s = s.encode("ascii", "ignore").decode("ascii")
    return re.sub(r"\s+", " ", s)

def normalize_headers(cols):
    """Return normalized versions of the column labels in ``cols``.

    Each label is lowercased and stripped of accents, characters outside
    ``[a-z0-9_ ]`` are removed, spaces become underscores, underscore runs are
    collapsed and leading/trailing underscores are trimmed. A label that ends
    up empty becomes "col".
    """
    def _clean(label):
        text = strip_accents_lower(label)
        text = re.sub(r"[^a-z0-9_ ]", "", text)
        text = re.sub(r"_+", "_", text.replace(" ", "_"))
        return text.strip("_") or "col"

    return [_clean(label) for label in cols]

def normalize_dataframe_text(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with normalized headers and normalized text cells.

    Headers go through ``normalize_headers``; every object-dtype column is
    mapped value by value through ``strip_accents_lower``. The input frame is
    not modified.
    """
    result = df.copy()
    result.columns = normalize_headers(result.columns)
    for name in result.select_dtypes(include=["object"]).columns.tolist():
        result[name] = result[name].map(strip_accents_lower)
    return result


In [0]:
# Both dicts are keyed by (subfolder, file name):
#   raw_data   -> frame exactly as read from disk
#   clean_data -> same frame after normalize_dataframe_text
raw_data = {}
clean_data = {}

# (path, repr(exception)) for every file that could not be loaded or normalized
errors = []

csv_read_options = {"encoding": "utf-8", "encoding_errors": "ignore"}

for sub, f in csv_files:
    key = (sub, f.name)
    try:
        try:
            # Let the python engine sniff the separator ...
            frame = pd.read_csv(f, sep=None, engine="python", **csv_read_options)
        except Exception:
            # ... and fall back to the default comma parser when sniffing fails
            frame = pd.read_csv(f, **csv_read_options)

        normalized = normalize_dataframe_text(frame)
    except Exception as exc:
        errors.append((str(f), repr(exc)))
    else:
        # Registered only when both reading and normalizing succeeded
        raw_data[key] = frame
        clean_data[key] = normalized

print(f"DataFrames cargados: {len(raw_data)} | Normalizados: {len(clean_data)}")
if errors:
    print(f"⚠️ Errores en {len(errors)} archivos (muestra 5):")
    for path, message in errors[:5]:
        print("-", path, "->", message)


In [0]:
# Selección de columnas para fallecidos/lesionados

import re
import pandas as pd

# Required columns, in the order they must appear in the selection
req_fall = ["ano_ocu","mes_ocu","depto_ocu","zona_ocu","edad_per","tipo_eve","fall_les"]

sel_fallecidos = {}          # (subfolder, file name) -> frame with exactly req_fall
faltantes_fallecidos = []    # one record per (file, missing column)

for (sub, fname), df in clean_data.items():
    if sub != "fallecidos-lesionados":
        continue

    # Year = first run of four digits in the file name (None when there is none)
    m = re.search(r"(\d{4})", fname)
    anio = int(m.group(1)) if m else None

    missing = [c for c in req_fall if c not in df.columns]
    faltantes_fallecidos.extend(
        {"anio": anio, "archivo": fname, "col_faltante": c} for c in missing
    )

    # assign() builds a NEW frame, so the frames stored in clean_data are never
    # modified and re-running this cell reports the same missing columns.
    # Missing columns are filled with the constant "ignorado".
    sel = df.assign(**{c: "ignorado" for c in missing})[req_fall]
    sel_fallecidos[(sub, fname)] = sel

# Report of missing columns; fixing the columns up front keeps the frame well
# formed (and sortable) even when nothing is missing.
reporte_fallecidos = pd.DataFrame(faltantes_fallecidos, columns=["anio","archivo","col_faltante"])
display(reporte_fallecidos.sort_values(["anio","col_faltante"]))


In [0]:
# Selección de columnas para hechos

import re
import pandas as pd

# Required columns, in the order they must appear in the selection
req_hechos = ["ano_ocu","hora_ocu","mes_ocu","dia_sem_ocu","depto_ocu","tipo_eve"]

sel_hechos = {}          # (subfolder, file name) -> frame with exactly req_hechos
faltantes_hechos = []    # one record per (file, missing column)

for (sub, fname), df in clean_data.items():
    if sub != "hechos":
        continue

    # Year = first run of four digits in the file name (None when there is none)
    m = re.search(r"(\d{4})", fname)
    anio = int(m.group(1)) if m else None

    missing = [c for c in req_hechos if c not in df.columns]
    faltantes_hechos.extend(
        {"anio": anio, "archivo": fname, "col_faltante": c} for c in missing
    )

    # assign() builds a NEW frame, so the frames stored in clean_data are never
    # modified and re-running this cell reports the same missing columns.
    # Missing columns are filled with the constant "ignorado".
    sel = df.assign(**{c: "ignorado" for c in missing})[req_hechos]
    sel_hechos[(sub, fname)] = sel

# Report of missing columns; fixing the columns up front keeps the frame well
# formed (and sortable) even when nothing is missing.
reporte_hechos = pd.DataFrame(faltantes_hechos, columns=["anio","archivo","col_faltante"])
display(reporte_hechos.sort_values(["anio","col_faltante"]))


In [0]:
# Selección de columnas para vehiculos

import re
import pandas as pd

# Required columns, in the order they must appear in the selection
req_veh = ["ano_ocu","mes_ocu","depto_ocu","sexo_per","tipo_veh","marca_veh","color_veh","modelo_veh","tipo_eve"]

sel_vehiculos = {}          # (subfolder, file name) -> frame with exactly req_veh
faltantes_vehiculos = []    # one record per (file, missing column)

for (sub, fname), df in clean_data.items():
    if sub != "vehiculos":
        continue

    # Year = first run of four digits in the file name (None when there is none)
    m = re.search(r"(\d{4})", fname)
    anio = int(m.group(1)) if m else None

    missing = [c for c in req_veh if c not in df.columns]
    faltantes_vehiculos.extend(
        {"anio": anio, "archivo": fname, "col_faltante": c} for c in missing
    )

    # assign() builds a NEW frame, so the frames stored in clean_data are never
    # modified and re-running this cell reports the same missing columns.
    # Missing columns are filled with the constant "ignorado".
    sel = df.assign(**{c: "ignorado" for c in missing})[req_veh]
    sel_vehiculos[(sub, fname)] = sel

# Report of missing columns; fixing the columns up front keeps the frame well
# formed (and sortable) even when nothing is missing.
reporte_vehiculos = pd.DataFrame(faltantes_vehiculos, columns=["anio","archivo","col_faltante"])
display(reporte_vehiculos.sort_values(["anio","col_faltante"]))


In [0]:
import re

YEAR = 2019  # ← change to preview another year

# Match YEAR as a standalone run of digits in the file name. A \b word boundary
# is not enough: "_" counts as a word character, so "\b2019\b" would never match
# a name such as "vehiculos_2019.csv". Digit lookarounds accept any non-digit
# neighbour (or the start/end of the name) instead.
year_pattern = re.compile(fr"(?<!\d){YEAR}(?!\d)")

keys_veh = sorted(
    (k for k in sel_vehiculos if year_pattern.search(k[1])),
    key=lambda k: k[1],
)

if keys_veh:
    # Show the first matching file, ordered by file name
    sample_key = keys_veh[0]
    print("Mostrando (vehiculos):", sample_key)
    display(sel_vehiculos[sample_key].head(10))
else:
    print(f"No se encontró archivo de vehiculos con año {YEAR} en el nombre.")


In [0]:
# Estandarización de datos generales

import re
import pandas as pd

YEAR_MIN, YEAR_MAX = 2013, 2020

# mapas
MES_MAP = {
    1:"enero", 2:"febrero", 3:"marzo", 4:"abril", 5:"mayo", 6:"junio",
    7:"julio", 8:"agosto", 9:"septiembre", 10:"octubre", 11:"noviembre", 12:"diciembre"
}

DEPTO_MAP = {
     1:"guatemala",  2:"el progreso", 3:"sacatepequez", 4:"chimaltenango",
     5:"escuintla",  6:"santa rosa",  7:"solola",       8:"totonicapan",
     9:"quetzaltenango", 10:"suchitepequez", 11:"retalhuleu", 12:"san marcos",
    13:"huehuetenango", 14:"quiche", 15:"baja verapaz", 16:"alta verapaz",
    17:"peten", 18:"izabal", 19:"zacapa", 20:"chiquimula", 21:"jalapa", 22:"jutiapa"
}

TIPO_EVE_MAP = {
     1:"colision", 2:"choque", 3:"vuelco", 4:"caida", 5:"atropello",
     6:"perdida de control", 7:"colision contra animal", 8:"exceso de pasaje",
     9:"asfalto mojado", 10:"exceso de velocidad", 11:"desperfectos mecanicos",
    12:"incendio", 99:"ignorado"
}

def _apply_general_changes(df: pd.DataFrame) -> pd.DataFrame:
    out = df.copy()

    # 1) "ignorada" -> "ignorado" en todas las columnas de texto
    obj_cols = out.select_dtypes(include=["object"]).columns.tolist()
    for c in obj_cols:
        out[c] = out[c].replace("ignorada", "ignorado")

    # 2) mes_ocu numérico -> nombre
    if "mes_ocu" in out.columns:
        _num = pd.to_numeric(out["mes_ocu"], errors="coerce")
        mask = _num.notna()
        out.loc[mask, "mes_ocu"] = _num[mask].astype(int).map(MES_MAP).fillna(out.loc[mask, "mes_ocu"])

    # 3) depto_ocu numérico -> nombre
    if "depto_ocu" in out.columns:
        _num = pd.to_numeric(out["depto_ocu"], errors="coerce")
        mask = _num.notna()
        out.loc[mask, "depto_ocu"] = _num[mask].astype(int).map(DEPTO_MAP).fillna(out.loc[mask, "depto_ocu"])

    # 4) tipo_eve numérico -> nombre
    if "tipo_eve" in out.columns:
        _num = pd.to_numeric(out["tipo_eve"], errors="coerce")
        mask = _num.notna()
        out.loc[mask, "tipo_eve"] = _num[mask].astype(int).map(TIPO_EVE_MAP).fillna(out.loc[mask, "tipo_eve"])

    return out

def _year_from_filename(fname: str):
    m = re.search(r"(\d{4})", fname)
    return int(m.group(1)) if m else None


def _standardize_in_range(selected: dict) -> dict:
    """Apply ``_apply_general_changes`` to the frames whose file year is in range.

    ``selected`` maps (subfolder, file name) to a frame. Frames whose file name
    carries a year within [YEAR_MIN, YEAR_MAX] are replaced by their
    standardized copy; every other frame is passed through as the same object.
    """
    changed = {}
    for key, frame in selected.items():
        year = _year_from_filename(key[1])
        in_range = year is not None and (YEAR_MIN <= year <= YEAR_MAX)
        changed[key] = _apply_general_changes(frame) if in_range else frame
    return changed


changed_fallecidos = _standardize_in_range(sel_fallecidos)
changed_hechos = _standardize_in_range(sel_hechos)
changed_vehiculos = _standardize_in_range(sel_vehiculos)


In [0]:
# Estandarización de datos de fallecidos-lesionados

import re
import pandas as pd

YEAR_MIN, YEAR_MAX = 2013, 2020

def _year_from_filename(fname: str):
    m = re.search(r"(\d{4})", fname)
    return int(m.group(1)) if m else None

# fall_les codes and their labels
FALL_LES_MAP = {1: "fallecido", 2: "lesionado"}

final_fallecidos = {}   # (subfolder, file name) -> standardized frame

for key, df in changed_fallecidos.items():
    sub, fname = key
    yr = _year_from_filename(fname)

    if yr is None or not (YEAR_MIN <= yr <= YEAR_MAX):
        # Outside the year range (or no year in the name): keep the frame as is
        final_fallecidos[key] = df
        continue

    out = df.copy()

    # zona_ocu == 99 and edad_per == 999 are replaced by "ignorado".
    # The column is widened to object first, so the text never has to be written
    # into a numeric column (an implicit upcast that pandas has deprecated).
    for col, sentinel in (("zona_ocu", 99), ("edad_per", 999)):
        if col in out.columns:
            is_sentinel = pd.to_numeric(out[col], errors="coerce") == sentinel
            if is_sentinel.any():
                out[col] = out[col].astype(object).mask(is_sentinel, "ignorado")

    # fall_les: 1 -> "fallecido"; 2 -> "lesionado".
    # The comparison is numeric, so 1, 1.0, "1" and " 1 " are all recognised.
    # A plain str() conversion would miss float columns ("1.0") and would turn
    # empty cells into the literal text "nan".
    if "fall_les" in out.columns:
        codes = pd.to_numeric(out["fall_les"], errors="coerce")
        labels = codes.map(FALL_LES_MAP.get)
        # Values without a label keep their (stripped) content; missing cells
        # stay missing and are filled with "ignorado" just below.
        current = out["fall_les"].astype(object).map(
            lambda v: v.strip() if isinstance(v, str) else v
        )
        out["fall_les"] = current.where(labels.isna(), labels)

    # Homogeneous string columns; remaining missing values become "ignorado"
    for c in ["zona_ocu", "edad_per", "fall_les", "mes_ocu","depto_ocu", "tipo_eve"]:
        if c in out.columns:
            out[c] = out[c].astype("string").fillna("ignorado")

    final_fallecidos[key] = out

# Quick look at the first file (ordered by file name)
if final_fallecidos:
    sample_key = sorted(final_fallecidos.keys(), key=lambda k: k[1])[0]
    print("Mostrando (fallecidos-lesionados):", sample_key)
    display(final_fallecidos[sample_key].head(10))


In [0]:
# Estandarización de datos de hechos

import re
import pandas as pd

YEAR_MIN, YEAR_MAX = 2013, 2020

def _year_from_filename(fname: str):
    m = re.search(r"(\d{4})", fname)
    return int(m.group(1)) if m else None

# Day-of-week codes as mapped here: 1=lunes, 2=martes, ..., 7=domingo (labels without accents)
DOW_MAP = {1:"lunes", 2:"martes", 3:"miercoles", 4:"jueves", 5:"viernes", 6:"sabado", 7:"domingo"}

final_hechos = {}   # (subfolder, file name) -> standardized frame

for key, df in changed_hechos.items():
    sub, fname = key
    yr = _year_from_filename(fname)

    if yr is None or not (YEAR_MIN <= yr <= YEAR_MAX):
        # Outside the year range (or no year in the name): keep the frame as is
        final_hechos[key] = df
        continue

    out = df.copy()

    if "dia_sem_ocu" in out.columns:
        nums = pd.to_numeric(out["dia_sem_ocu"], errors="coerce")
        day_names = nums.map(lambda v: DOW_MAP.get(int(v)) if pd.notna(v) else None)
        out["dia_sem_ocu"] = (
            out["dia_sem_ocu"]
            # Widen to object before mixing in text labels: writing strings into
            # a numeric column relies on a deprecated implicit upcast
            .astype(object)
            # Cells without a label (non-numeric or unknown code) keep their value
            .where(day_names.isna(), day_names)
            # Homogeneous string column avoids Arrow conversion problems;
            # missing values become "ignorado"
            .astype("string")
            .fillna("ignorado")
        )

    final_hechos[key] = out

# Quick look: the second file by name when there is one, otherwise the only file
# (indexing [1] unconditionally raised IndexError with a single file)
if final_hechos:
    ordered_keys = sorted(final_hechos.keys(), key=lambda k: k[1])
    sample_key = ordered_keys[min(1, len(ordered_keys) - 1)]
    print("Mostrando (hechos):", sample_key)
    display(final_hechos[sample_key].head(10))


In [0]:
# Estandarización de datos de vehiculos

import re
import pandas as pd

# Rangos
YEAR_MIN_GENERAL, YEAR_MAX_GENERAL = 2013, 2020
YEAR_MIN_DROP_MARCA, YEAR_MAX_DROP_MARCA = 2013, 2023

def _year_from_filename(fname: str):
    m = re.search(r"(\d{4})", fname)
    return int(m.group(1)) if m else None

# Code -> label tables (labels are lowercase and accent-free)
SEXO_MAP = {1: "hombre", 2: "mujer", 9: "ignorado"}

TIPO_VEH_MAP = {
     1: "automovil",  2: "camioneta",  3: "pick_up",      4: "motocicleta",
     5: "camion",     6: "cabezal",    7: "bus_extraurbano", 8: "jeep",
     9: "microbus",  10: "taxi",      11: "panel",       12: "bus_urbano",
    13: "tractor",   14: "moto_taxi", 15: "furgon",      16: "grua",
    17: "bus_escolar", 18: "bicicleta", 99: "ignorado"
}

COLOR_VEH_MAP = {
     1: "rojo",       2: "blanco",     3: "azul",        4: "gris",
     5: "negro",      6: "verde",      7: "amarillo",    8: "celeste",
     9: "corinto",   10: "cafe",      11: "beige",      12: "turquesa",
    13: "marfil",    14: "anaranjado", 15: "aqua",       16: "morado",
    17: "rosado",    99: "ignorado"
}

def _relabel_vehicle_codes(out: pd.DataFrame, col: str, mapping: dict) -> None:
    """Replace, in place, the numeric codes of ``out[col]`` with their labels.

    Cells that parse as a number are truncated to int and looked up in
    ``mapping``; cells that are not numeric, or whose code has no label, keep
    their value. Nothing happens when the column is absent or no cell has a label.
    """
    if col not in out.columns:
        return
    codes = pd.to_numeric(out[col], errors="coerce")
    labels = codes.map(lambda v: mapping.get(int(v)) if pd.notna(v) else None)
    if labels.notna().any():
        # Widen to object before mixing in text labels: writing strings into a
        # numeric column relies on a deprecated implicit upcast
        out[col] = out[col].astype(object).where(labels.isna(), labels)

final_vehiculos = {}   # (subfolder, file name) -> standardized frame

for key, df in changed_vehiculos.items():
    sub, fname = key
    yr = _year_from_filename(fname)
    out = df.copy()

    # ---- Code -> label rules, files from 2013 to 2020 ----
    if yr is not None and (YEAR_MIN_GENERAL <= yr <= YEAR_MAX_GENERAL):
        for col, mapping in (
            ("sexo_per", SEXO_MAP),
            ("tipo_veh", TIPO_VEH_MAP),
            ("color_veh", COLOR_VEH_MAP),
        ):
            _relabel_vehicle_codes(out, col, mapping)

    # ---- Drop marca_veh, files from 2013 to 2023 ----
    if yr is not None and (YEAR_MIN_DROP_MARCA <= yr <= YEAR_MAX_DROP_MARCA):
        # NOTE(review): modelo_veh is dropped only when marca_veh is absent.
        # If both columns were meant to go, this must be two independent checks
        # - confirm the intent. Behaviour is kept as it was.
        if "marca_veh" in out.columns:
            out = out.drop(columns=["marca_veh"])
        elif "modelo_veh" in out.columns:
            out = out.drop(columns=["modelo_veh"])

    # Homogeneous string columns avoid Arrow conversion errors; missing -> "ignorado"
    for c in ["sexo_per", "tipo_veh", "color_veh"]:
        if c in out.columns:
            out[c] = out[c].astype("string").fillna("ignorado")

    final_vehiculos[key] = out

# Quick look: the second file by name when there is one, otherwise the only file
# (indexing [1] unconditionally raised IndexError with a single file)
if final_vehiculos:
    ordered_keys = sorted(final_vehiculos.keys(), key=lambda k: k[1])
    sample_key = ordered_keys[min(1, len(ordered_keys) - 1)]
    print("Mostrando (vehiculos):", sample_key)
    display(final_vehiculos[sample_key].head(10))


In [0]:
# Sobreescribir fallecidos/lesionados

import os
from pathlib import Path

# BASE_DIR comes from the configuration cell at the top of the notebook.
# Re-assigning the literal path here silently undid any change made there.
OUT_ROOT = Path(BASE_DIR) / "csv"

# Overwrite each fallecidos-lesionados CSV with its standardized frame
saved = 0
for (sub, fname), df in final_fallecidos.items():
    out_path = OUT_ROOT / sub / fname
    out_path.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(out_path, index=False, encoding="utf-8")
    saved += 1

print(f"CSV sobrescritos (fallecidos-lesionados): {saved}")


In [0]:
# Sobreescribir hechos

import os
from pathlib import Path

# BASE_DIR comes from the configuration cell at the top of the notebook.
# Re-assigning the literal path here silently undid any change made there.
OUT_ROOT = Path(BASE_DIR) / "csv"

# Overwrite each hechos CSV with its standardized frame
saved = 0
for (sub, fname), df in final_hechos.items():
    out_path = OUT_ROOT / sub / fname
    out_path.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(out_path, index=False, encoding="utf-8")
    saved += 1

print(f"CSV sobrescritos (hechos): {saved}")

In [0]:
# Sobreescribir vehiculos

import os
from pathlib import Path

# BASE_DIR comes from the configuration cell at the top of the notebook.
# Re-assigning the literal path here silently undid any change made there.
OUT_ROOT = Path(BASE_DIR) / "csv"

# Overwrite each vehiculos CSV with its standardized frame
saved = 0
for (sub, fname), df in final_vehiculos.items():
    out_path = OUT_ROOT / sub / fname
    out_path.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(out_path, index=False, encoding="utf-8")
    saved += 1

print(f"CSV sobrescritos (vehiculos): {saved}")


%md
##### Cargar los CSV con Spark

In [0]:
# === Path inside a Unity Catalog Volume (NOT /Workspace) ===
DATA_DIR = "/Volumes/workspace/default/lab8/Data"

# The three temp views share one definition; only the view name (which is also
# the CSV base name) changes. Each view casts anio and valor to INT (valor after
# removing every character that is not a digit or "-"), lowercases grupo and
# leaves out the rows whose dim1_val or dim2_val is "total".
_LONG_VIEW_SQL = """
CREATE OR REPLACE TEMP VIEW {view} AS
SELECT
  CAST(anio AS INT)                           AS anio,
  dim1, dim1_val, dim2, dim2_val,
  CAST(regexp_replace(valor, '[^0-9-]', '') AS INT) AS valor,
  lower(grupo) AS grupo,
  cuadro, csv
FROM read_files('{data_dir}/{view}.csv',
                format => 'csv', header => true)
WHERE lower(coalesce(dim1_val,'')) <> 'total'
  AND lower(coalesce(dim2_val,'')) <> 'total'
"""

# Hechos, vehículos and víctimas, in that order
for _view_name in ("hechos_long", "vehiculos_long", "victimas_long"):
    spark.sql(_LONG_VIEW_SQL.format(view=_view_name, data_dir=DATA_DIR))

# --- Quick checks ---
# Row count of each view, in a single result
_row_counts_sql = """
SELECT 'hechos' AS tabla, COUNT(*) AS filas FROM hechos_long
UNION ALL
SELECT 'vehiculos', COUNT(*) FROM vehiculos_long
UNION ALL
SELECT 'victimas', COUNT(*) FROM victimas_long
"""
display(spark.sql(_row_counts_sql))

# Years present in each view, then 15 sample rows of each view
for _query in (
    "SELECT DISTINCT anio FROM hechos_long ORDER BY anio",
    "SELECT DISTINCT anio FROM vehiculos_long ORDER BY anio",
    "SELECT DISTINCT anio FROM victimas_long ORDER BY anio",
    "SELECT * FROM hechos_long    ORDER BY anio, cuadro LIMIT 15",
    "SELECT * FROM vehiculos_long ORDER BY anio, cuadro LIMIT 15",
    "SELECT * FROM victimas_long  ORDER BY anio, cuadro LIMIT 15",
):
    display(spark.sql(_query))


### 1. Registros por tabla + show(), describe, summary

In [0]:
# Build the DataFrames from the temp views created in the loading cell instead of
# re-reading the raw CSVs. Re-registering hechos_long / vehiculos_long /
# victimas_long from a plain spark.read.csv replaced those views with all-string
# columns and brought the "total" rows back, so describe()/summary() ran on text
# values.
# NOTE(review): row counts now exclude the "total" rows - confirm this is the
# figure wanted for the report.

# Hechos
hechos = spark.table("hechos_long")

# Vehículos
vehiculos = spark.table("vehiculos_long")

# Víctimas
victimas = spark.table("victimas_long")


In [0]:
# One boolean per view, printed on a single line in this order
print(*(
    spark.catalog.tableExists(view_name)
    for view_name in ("hechos_long", "vehiculos_long", "victimas_long")
))


In [0]:
# (label, DataFrame) pairs, in the order used by every step below
_tables = (("hechos", hechos), ("vehiculos", vehiculos), ("victimas", victimas))

# Row counts
counts = [(label, frame.count()) for label, frame in _tables]
spark.createDataFrame(counts, ["tabla","filas"]).show(truncate=False)

# Ten sample rows of each table
for label, frame in _tables:
    print(f"\n[{label}] ejemplo:")
    frame.show(10, truncate=False)

# describe() of the key columns, then summary() of anio and valor, per table
for _, frame in _tables:
    key_columns = frame.select("anio","dim1","dim1_val","dim2","dim2_val","valor")
    key_columns.describe().show()
    frame.select("anio","valor").summary().show()
