# Encabezado y configuración

In [1]:
from pathlib import Path
import pandas as pd

# Directory that holds the .sav input files.
DATA_DIR = Path("data")
# Destination CSV, written inside DATA_DIR.
OUTPUT_FILE = DATA_DIR.joinpath("db.csv")

# Utilidad para leer archivos SPSS

In [2]:
def read_spss(name: str, *, usecols: list[str] | None = None) -> pd.DataFrame:
    """Lee un .sav dentro de DATA_DIR y devuelve un DataFrame."""
    return pd.read_spss(DATA_DIR / name, usecols=usecols)

# Hogares de referencia (NUMHOG)

In [3]:
# Minimal read of PERSONAS.sav: only the household id and the PPA09 column.
personas_min = read_spss("PERSONAS.sav", usecols=["NUMHOG", "PPA09"])

# NUMHOG of every row whose PPA09 equals "COMPLETA" ...
hogares_completos = personas_min.loc[personas_min["PPA09"] == "COMPLETA", "NUMHOG"]
# ... reduced to one entry per household, in ascending order.
hogares_completos = hogares_completos.drop_duplicates().sort_values()

# Column-less frame indexed by those households; later cells attach their
# aggregated columns to it.
resumen = pd.DataFrame(index=hogares_completos, dtype="float")


# Donaciones – pobreza y P01F03

In [4]:
# Load only the household id and the two columns aggregated below.
donaciones = read_spss(
    "DONACIONES.sav",
    usecols=["NUMHOG", "POBREZA", "P01F03"],
)

# One row per household: the "first" aggregate of POBREZA and the total of
# P01F03, aligned to the reference households held in `resumen`.
donaciones_grouped = (
    donaciones.groupby("NUMHOG").agg(
        pobreza=("POBREZA", "first"),   # assumes a single value per household
        P01F03=("P01F03", "sum"),
    )
    .reindex(resumen.index)
)

# Drop any copy of these columns left over from a previous run of this cell:
# a plain join() would otherwise raise on the overlapping column names when
# the cell is re-executed. On a first run nothing is dropped.
resumen = (
    resumen
    .drop(columns=donaciones_grouped.columns, errors="ignore")
    .join(donaciones_grouped)
)


# Compras al crédito

In [5]:
# Load the household id plus the three columns totalled below.
credito = read_spss(
    "Compras al crédito - encabezado-.sav",
    usecols=["NUMHOG", "P15B02", "P15B04", "P15B06A"],
)

# Per-household totals, aligned to the reference households in `resumen`.
credito_grouped = (
    credito.groupby("NUMHOG").agg(
        P15B02=("P15B02", "sum"),
        P15B04=("P15B04", "sum"),
        P15B06A=("P15B06A", "sum"),
    ).reindex(resumen.index)
)

# Drop any copy of these columns left over from a previous run of this cell:
# a plain join() would otherwise raise on the overlapping column names when
# the cell is re-executed. On a first run nothing is dropped.
resumen = (
    resumen
    .drop(columns=credito_grouped.columns, errors="ignore")
    .join(credito_grouped)
)


# Negocios no agropecuarios

In [6]:
# Load the household id and the single column that is totalled below.
negocios = read_spss(
    "Negocios No Agropecuarios Encabezado.sav",
    usecols=["NUMHOG", "P13A02A"],
)

# Per-household total of P13A02A, aligned to the reference households and
# stored as a column of `resumen`.
resumen["P13A02A"] = (
    negocios
    .groupby("NUMHOG")["P13A02A"]
    .agg("sum")
    .reindex(resumen.index)
)


# Variables de PERSONAS.sav

In [7]:
# PERSONAS.sav columns that are totalled per household.
vars_sum = [
    "P09F03B", "P09F03C", "P09F04B", "P09F04C", "P09F05B", "P09F05C",
    "P09F06B", "P09F06C", "P09F07B", "P09F07C", "P09F08B", "P09F08C",
    "P09F09B", "P09F09C", "P10B20B", "P11A05B", "P11A06B",
]
# PERSONAS.sav columns that are averaged per household.
vars_mean = [
    "P06B10A", "P06B10B", "P09A03B", "P09A03C",
    "P09B02B", "P09B02C",
]

personas_full = read_spss(
    "PERSONAS.sav",
    usecols=[
        "NUMHOG", "P06B01", "P10B01", "P10B08",
        *vars_sum, *vars_mean,
    ],
)

# Per-person 0/1 flag so that summing it counts the "Si" answers in P06B01.
personas_full["P06B01_SI"] = (personas_full.P06B01 == "Si").astype(int)

# Named aggregations: output column -> (input column, aggregation function).
aggs = {
    **{v: (v, "sum")  for v in vars_sum},
    **{v: (v, "mean") for v in vars_mean},
    "P06B01": ("P06B01_SI", "sum"),
    "P10B01": ("P10B01",   "mean"),
    "P10B08": ("P10B08",   "mean"),
}

# One row per household, aligned to the reference households in `resumen`.
personas_grouped = (
    personas_full.groupby("NUMHOG").agg(**aggs).reindex(resumen.index)
)

# Drop any copy of these columns left over from a previous run of this cell:
# a plain join() would otherwise raise on the overlapping column names when
# the cell is re-executed. On a first run nothing is dropped.
resumen = (
    resumen
    .drop(columns=personas_grouped.columns, errors="ignore")
    .join(personas_grouped)
)


# Limpieza final y exportación

In [8]:
# Fill numeric NaN with 0; non-numeric columns are left as they are.
num_cols = resumen.select_dtypes("number").columns
resumen[num_cols] = resumen[num_cols].fillna(0)

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly: wrapping it in print() appended a stray "None" line.
resumen.info(show_counts=True)

# Write the summary with the index exported under the "NUMHOG" header.
resumen.to_csv(OUTPUT_FILE, index_label="NUMHOG")
print(f"Archivo exportado a: {OUTPUT_FILE}")


<class 'pandas.core.frame.DataFrame'>
Index: 11536 entries, 1.0 to 11536.0
Data columns (total 32 columns):
 #   Column   Non-Null Count  Dtype   
---  ------   --------------  -----   
 0   pobreza  11536 non-null  category
 1   P01F03   11536 non-null  float64 
 2   P15B02   11536 non-null  float64 
 3   P15B04   11536 non-null  float64 
 4   P15B06A  11536 non-null  float64 
 5   P13A02A  11536 non-null  float64 
 6   P09F03B  11536 non-null  float64 
 7   P09F03C  11536 non-null  float64 
 8   P09F04B  11536 non-null  float64 
 9   P09F04C  11536 non-null  float64 
 10  P09F05B  11536 non-null  float64 
 11  P09F05C  11536 non-null  float64 
 12  P09F06B  11536 non-null  float64 
 13  P09F06C  11536 non-null  float64 
 14  P09F07B  11536 non-null  float64 
 15  P09F07C  11536 non-null  float64 
 16  P09F08B  11536 non-null  float64 
 17  P09F08C  11536 non-null  float64 
 18  P09F09B  11536 non-null  float64 
 19  P09F09C  11536 non-null  float64 
 20  P10B20B  11536 non-null  floa