# Carga de bibliotecas y definición de constantes

In [13]:
from pathlib import Path
import logging
from logging.handlers import RotatingFileHandler
import sys

# Project layout: BASE_DIR is the parent of the current working directory,
# and every data folder is derived from it.
BASE_DIR = Path.cwd().resolve().parent
RAW_DIR = BASE_DIR / "data" / "raw"
INTERIM_DIR = BASE_DIR / "data" / "interim"
LOG_DIR = INTERIM_DIR / "logs"
LOG_FILE = LOG_DIR / "00_creacion_abt_agrupados.log"

# RotatingFileHandler opens the log file as soon as it is constructed, so the
# folder has to exist beforehand (this also creates INTERIM_DIR if missing).
LOG_DIR.mkdir(parents=True, exist_ok=True)

fmt = "%(asctime)s | %(levelname)s | %(name)s | %(message)s"
datefmt = "%Y-%m-%d %H:%M:%S"

logger = logging.getLogger("creacion_abt")
logger.setLevel(logging.INFO)

# logging.getLogger returns the same logger object on every call, so running
# this cell again would stack a second pair of handlers and every record would
# be written twice. Drop (and close) whatever a previous run attached first.
for old_handler in list(logger.handlers):
    logger.removeHandler(old_handler)
    old_handler.close()

# File output: rotate at ~5 MB and keep five backups.
file_handler = RotatingFileHandler(LOG_FILE, maxBytes=5_000_000, backupCount=5, encoding="utf-8")
file_handler.setFormatter(logging.Formatter(fmt=fmt, datefmt=datefmt))

# Console output: same format, written to stdout so it shows up in the cell.
stream_handler = logging.StreamHandler(sys.stdout)
stream_handler.setFormatter(logging.Formatter(fmt=fmt, datefmt=datefmt))

logger.addHandler(file_handler)
logger.addHandler(stream_handler)

logger.info("=== Inicio Notebook 00: Creación ABT Agrupados ===")
logger.info(f"BASE_DIR: {BASE_DIR}")

import pandas as pd
import numpy as np
import json
import unicodedata, re
from time import perf_counter
from pathlib import Path

from pandas.api.types import (
    is_numeric_dtype,
    is_bool_dtype
)
from pandas import CategoricalDtype

# Folder constants (same derivation as in the logging setup: parent of the
# current working directory, then data/raw and data/interim).
BASE_DIR = Path.cwd().resolve().parent
RAW_DIR = BASE_DIR / "data" / "raw"
INTERIM_DIR = BASE_DIR / "data" / "interim"

CSV_PATH = RAW_DIR / "abt_PE2020.csv"
logger.info(f"RAW CSV path: {CSV_PATH}")

# Reader options: semicolon-separated file, whole-file type inference
# (low_memory=False), and an explicit list of tokens to treat as missing.
read_kwargs = dict(
    sep=";",
    low_memory=False,
    na_values=["", "NA", "NaN", "NULL", "null", "None", "nan"],
    dtype=None,
    encoding="utf-8",
)

t0 = perf_counter()
try:
    # Only the read itself is guarded; logging happens outside the try body.
    abt = pd.read_csv(CSV_PATH, **read_kwargs)
except UnicodeDecodeError:
    # The file is not valid UTF-8: say so explicitly, then retry with latin-1.
    logger.warning("Lectura con utf-8 falló (UnicodeDecodeError); reintentando con latin-1")
    abt = pd.read_csv(CSV_PATH, **{**read_kwargs, "encoding": "latin-1"})
    logger.info(f"Carga CSV con latin-1 OK | shape={abt.shape} | tiempo={perf_counter()-t0:.2f}s")
else:
    logger.info(f"Carga CSV OK | shape={abt.shape} | tiempo={perf_counter()-t0:.2f}s")





2025-11-03 19:18:06 | INFO | creacion_abt | === Inicio Notebook 00: Creación ABT Agrupados ===
2025-11-03 19:18:06 | INFO | creacion_abt | === Inicio Notebook 00: Creación ABT Agrupados ===
2025-11-03 19:18:06 | INFO | creacion_abt | BASE_DIR: C:\Users\PC RYU\Documents\Galileo\Maestria\Product Development\repo_proyecto_pe
2025-11-03 19:18:06 | INFO | creacion_abt | BASE_DIR: C:\Users\PC RYU\Documents\Galileo\Maestria\Product Development\repo_proyecto_pe
2025-11-03 19:18:06 | INFO | creacion_abt | RAW CSV path: C:\Users\PC RYU\Documents\Galileo\Maestria\Product Development\repo_proyecto_pe\data\raw\abt_PE2020.csv
2025-11-03 19:18:06 | INFO | creacion_abt | RAW CSV path: C:\Users\PC RYU\Documents\Galileo\Maestria\Product Development\repo_proyecto_pe\data\raw\abt_PE2020.csv
2025-11-03 19:18:07 | INFO | creacion_abt | Carga CSV OK | shape=(17524, 493) | tiempo=0.50s
2025-11-03 19:18:07 | INFO | creacion_abt | Carga CSV OK | shape=(17524, 493) | tiempo=0.50s


# Normalizar nombres de columnas

In [3]:
def normalize_col(s):
    """Return a lowercase ASCII identifier built from the column name *s*.

    Steps: decompose accented characters (NFKD) and drop whatever is not
    ASCII, lowercase, replace every run of characters outside ``a-z0-9_``
    with a single underscore, trim underscores at both ends, and finally
    collapse any remaining run of underscores into one.
    """
    ascii_only = (
        unicodedata.normalize("NFKD", s)
        .encode("ascii", "ignore")
        .decode("ascii")
    )
    underscored = re.sub(r"[^a-z0-9_]+", "_", ascii_only.lower())
    trimmed = underscored.strip("_")
    return re.sub(r"__+", "_", trimmed)

# Rename every column to its normalized form.
abt.columns = [normalize_col(c) for c in abt.columns]

# Different raw headers can collapse to the same normalized name (accents,
# case and punctuation are all discarded). Duplicate labels make `abt[name]`
# return a DataFrame instead of a Series, so report them instead of letting
# later cells fail in confusing ways.
duplicated_names = abt.columns[abt.columns.duplicated()].unique().tolist()
if duplicated_names:
    logger.warning(f"Nombres de columna duplicados tras normalizar: {duplicated_names}")

logger.info(f"Total columnas tras normalizar: {len(abt.columns)}")
# Last expression of the cell: preview of the first 20 normalized names.
abt.columns[:20].tolist()


2025-11-03 19:11:47 | INFO | creacion_abt | Total columnas tras normalizar: 493


['fecha_evento',
 'fecha_inicio_d',
 'num_contratos',
 'indicador_pym_pro',
 'indiemppromoflag',
 'indiempvincuflag',
 'flag_codrpo_cuencorr',
 'flag_codpro_docdes',
 'flag_codpro_finanexp',
 'flag_codpro_finanimp',
 'flag_codpro_intauto',
 'flag_codpro_intcasa',
 'flag_codpro_intconsu',
 'dias_atraso',
 'ead_cont_nov_refina',
 'fecha_aper_con_res',
 'fecha_aprobacion',
 'fecha_canc_real',
 'fecha_desembolso',
 'fecha_fin_carencia']

# Parseo automático de fechas (columnas que contengan “fecha”)

In [5]:
# Every column whose normalized name contains "fecha" is treated as a date.
fecha_cols = [c for c in abt.columns if "fecha" in c]
# ok_cols: already datetime; fmt_cols: (column, format) parsed with an explicit
# format; fallback_cols: parsed with the generic, coercing parser.
ok_cols, fmt_cols, fallback_cols = [], [], []

# Explicit formats are tried first (fast path, no per-value inference).
COMMON_FORMATS = ("%Y-%m-%d", "%d/%m/%Y", "%Y%m%d")

for c in fecha_cols:
    s = abt[c]

    # Already datetime: nothing to do.
    if pd.api.types.is_datetime64_any_dtype(s):
        ok_cols.append(c)
        continue

    # 1) Strict pass: the first format that parses the whole column wins.
    #    The loop variable is intentionally not named `fmt`, because `fmt` is
    #    the logging format string defined at the top of the notebook and
    #    would be overwritten here.
    for date_format in COMMON_FORMATS:
        try:
            parsed = pd.to_datetime(s, errors="raise", format=date_format)
        except (ValueError, TypeError, OverflowError):
            # This format does not fit the column; try the next one.
            continue
        abt[c] = parsed
        fmt_cols.append((c, date_format))
        break
    else:
        # 2) Generic fallback (slower but flexible). Unparseable values are
        #    coerced to NaT, so count how many were lost and report it rather
        #    than letting them disappear silently.
        na_before = int(s.isna().sum())
        abt[c] = pd.to_datetime(s, errors="coerce")
        coerced = int(abt[c].isna().sum()) - na_before
        if coerced > 0:
            logger.warning(f"Columna '{c}': {coerced} valores no parseables convertidos a NaT")
        fallback_cols.append(c)

logger.info(
    f"Cols 'fecha': {len(fecha_cols)} | OK: {len(ok_cols)} | con formato fijo: {len(fmt_cols)} | fallback genérico: {len(fallback_cols)}"
)
# Last expression of the cell: preview of both result lists.
fmt_cols[:5], fallback_cols[:5]



2025-11-03 19:14:26 | INFO | creacion_abt | Cols 'fecha': 20 | OK: 20 | con formato fijo: 0 | fallback genérico: 0


([], [])

# Metadata y perfil

In [6]:
# Share of missing values per column; used by both the metadata and the profile.
na_share = abt.isna().mean()

row_count, col_count = abt.shape
na_ratio_top = na_share.sort_values(ascending=False).head(20)
target_candidates = [
    c
    for c in ["default_12m", "default", "flag_sitcont_default"]
    if c in abt.columns
]

# Table-level metadata: size, candidate targets, date columns and the 20
# columns with the highest share of missing values.
meta = {
    "rows": int(row_count),
    "cols": int(col_count),
    "candidate_targets": target_candidates,
    "fecha_columns": fecha_cols,
    "na_top20": na_ratio_top.to_dict(),
}

METADATA = INTERIM_DIR / "metadata_abt_PE2020.json"
METADATA.write_text(
    json.dumps(meta, ensure_ascii=False, indent=2),
    encoding="utf-8",
)

# Per-column profile (dtype, distinct non-null values, NA share), with the
# emptiest columns first.
profile = pd.DataFrame(
    {
        "dtype": abt.dtypes.astype(str),
        "nunique": abt.nunique(dropna=True),
        "na_ratio": na_share,
    }
)
profile = profile.sort_values("na_ratio", ascending=False)

PROFILE = INTERIM_DIR / "profile_abt_PE2020.csv"
profile.to_csv(PROFILE, index=True)

logger.info(f"Metadata: {METADATA.name} | Profile: {PROFILE.name} | targets={target_candidates}")
# Last expression of the cell: preview of the ten emptiest columns.
profile.head(10)


2025-11-03 19:15:09 | INFO | creacion_abt | Metadata: metadata_abt_PE2020.json | Profile: profile_abt_PE2020.csv | targets=['default_12m', 'default', 'flag_sitcont_default']


Unnamed: 0,dtype,nunique,na_ratio
saldo_tactivo_6var,float64,0,1.0
saldo_tactivo,float64,0,1.0
siniincicap_d_6max,float64,0,1.0
siniincicap_d_12max,float64,0,1.0
raz_apalanc_6m,float64,0,1.0
razon_circulante_6m,float64,0,1.0
raz_patnet_pas_12m,float64,0,1.0
siniincicap_d_12m,float64,0,1.0
siniincicap_d_6med,float64,0,1.0
siniincicap_d_12med,float64,0,1.0


# Seleccionar TARGET (0/1) y filtrar NA en target

In [7]:
# The first candidate present in the table is used as target; the list order
# is the order of preference.
TARGET = next(
    (c for c in ["default_12m", "default", "flag_sitcont_default"] if c in abt.columns),
    None,
)
# Explicit raise instead of `assert`: assertions are removed under `python -O`,
# which would let the notebook continue with TARGET = None.
if TARGET is None:
    raise ValueError("No se encontró columna de target (p.ej. default_12m).")

if abt[TARGET].dtype == "O":
    # Text target: map the known yes/no spellings to 1/0. Anything else
    # becomes NA (and its row is dropped below), so report those labels.
    raw_target = abt[TARGET]
    mapped_target = (raw_target.astype(str).str.strip().str.lower()
                     .map({"1":1,"0":0,"si":1,"sí":1,"no":0,"true":1,"false":0}))
    unmapped = raw_target.notna() & mapped_target.isna()
    if unmapped.any():
        ejemplos = sorted(raw_target[unmapped].astype(str).unique())[:10]
        logger.warning(
            f"TARGET={TARGET} | {int(unmapped.sum())} valores no reconocidos se tratarán como NA: {ejemplos}"
        )
    abt[TARGET] = mapped_target

# Nullable integer first, so rows with a missing target can be filtered out
# before converting to a plain int column.
abt[TARGET] = pd.to_numeric(abt[TARGET], errors="coerce").astype("Int64")
antes = len(abt)
abt = abt[abt[TARGET].notna()].copy()
abt[TARGET] = abt[TARGET].astype(int)

# The target is meant to be binary; flag any other value instead of letting it
# distort the positive ratio logged below.
fuera_de_rango = sorted(int(v) for v in set(abt[TARGET].unique()) - {0, 1})
if fuera_de_rango:
    logger.warning(f"TARGET={TARGET} | valores distintos de 0/1 encontrados: {fuera_de_rango}")

logger.info(f"TARGET={TARGET} | Filas removidas por NA target: {antes-len(abt)} | filas finales: {len(abt)}")
logger.info(f"Distribución target:\n{abt[TARGET].value_counts().to_string()}")
logger.info(f"Ratio positivos: {abt[TARGET].mean():.6f}")


2025-11-03 19:15:36 | INFO | creacion_abt | TARGET=default_12m | Filas removidas por NA target: 0 | filas finales: 17524
2025-11-03 19:15:36 | INFO | creacion_abt | Distribución target:
default_12m
0    16600
1      924
2025-11-03 19:15:36 | INFO | creacion_abt | Ratio positivos: 0.052728


# Elegir columna temporal de referencia

In [9]:
# Candidate reference columns: every column whose name contains "fecha".
candidatas = [c for c in abt.columns if "fecha" in c]

if "fecha_evento" in abt.columns:
    # Preferred reference column when it exists.
    ref_date_col = "fecha_evento"
elif candidatas:
    # Otherwise take the candidate with the most non-null values.
    conteo_no_nulos = pd.Series({c: abt[c].notna().sum() for c in candidatas})
    ref_date_col = conteo_no_nulos.sort_values(ascending=False).index[0]
else:
    ref_date_col = None

if not ref_date_col:
    logger.info("Sin columna temporal de referencia.")
else:
    logger.info(f"Fecha ref: {ref_date_col} | rango: {abt[ref_date_col].min()} → {abt[ref_date_col].max()}")
# Last expression of the cell: show the chosen column.
ref_date_col


2025-11-03 19:16:03 | INFO | creacion_abt | Fecha ref: fecha_evento | rango: 2014-02-28 00:00:00 → 2017-12-31 00:00:00


'fecha_evento'

# Limpieza mínima

In [10]:
# Columns with more than this share of missing values are dropped.
MAX_NA = 0.95
na_ratio = abt.isna().mean()
drop_na_cols = na_ratio.loc[na_ratio.gt(MAX_NA)].index.tolist()

# Columns with at most one distinct non-null value carry no information.
nunique = abt.nunique(dropna=True)
drop_const_cols = nunique.loc[nunique.le(1)].index.tolist()

# The target and the reference date (when there is one) are never dropped.
protected = {TARGET, ref_date_col} if ref_date_col else {TARGET}

# Union of both drop lists, minus protected columns, in alphabetical order.
cols_to_drop = sorted(set(drop_na_cols).union(drop_const_cols) - protected)
abt_clean = abt.drop(columns=cols_to_drop).copy()

# Keep a record of what was removed.
registro_drop = pd.Series(cols_to_drop, name="cols_dropped")
registro_drop.to_csv(INTERIM_DIR / "cols_removidas_minima.csv", index=False)
logger.info(f"Drop por NA>95%: {len(drop_na_cols)} | por constantes: {len(drop_const_cols)} | total drop: {len(cols_to_drop)}")
logger.info(f"Shape tras limpieza: {abt_clean.shape}")


2025-11-03 19:16:27 | INFO | creacion_abt | Drop por NA>95%: 44 | por constantes: 43 | total drop: 55
2025-11-03 19:16:27 | INFO | creacion_abt | Shape tras limpieza: (17524, 438)


# Tipificación para WOE (numéricas / categóricas) y alta cardinalidad

In [14]:
# The target and the reference date are not candidate features.
exclude = {TARGET}
if ref_date_col:
    exclude.add(ref_date_col)

feature_cols = [c for c in abt_clean.columns if c not in exclude]

# Numeric candidates. is_numeric_dtype is also True for boolean columns, and
# booleans are classified as categorical below, so they are left out here to
# keep the two lists disjoint.
num_cols = [
    c for c in feature_cols
    if is_numeric_dtype(abt_clean[c]) and not is_bool_dtype(abt_clean[c])
]

# Categorical candidates: object, pandas Categorical or boolean columns.
# Columns that are neither numeric nor categorical (e.g. datetime columns)
# end up in neither list.
cat_cols = [
    c for c in feature_cols
    if abt_clean[c].dtype == "O"
    or isinstance(abt_clean[c].dtype, CategoricalDtype)
    or is_bool_dtype(abt_clean[c])
]

# High-cardinality handling: categoricals with more than HIGH_CARD distinct
# non-null values are moved to their own list.
HIGH_CARD = 200
cat_highcard = [c for c in cat_cols if abt_clean[c].nunique(dropna=True) > HIGH_CARD]
highcard_set = set(cat_highcard)
cat_cols = [c for c in cat_cols if c not in highcard_set]

# Save results
pd.Series(num_cols).to_csv(INTERIM_DIR / "vars_numericas_candidatas.csv", index=False)
pd.Series(cat_cols).to_csv(INTERIM_DIR / "vars_categoricas_candidatas.csv", index=False)
pd.Series(cat_highcard).to_csv(INTERIM_DIR / "vars_categoricas_alta_card.csv", index=False)

logger.info(
    f"Numéricas: {len(num_cols)} | Categóricas: {len(cat_cols)} | Alta cardinalidad: {len(cat_highcard)}"
)

2025-11-03 19:18:13 | INFO | creacion_abt | Numéricas: 421 | Categóricas: 1 | Alta cardinalidad: 2
2025-11-03 19:18:13 | INFO | creacion_abt | Numéricas: 421 | Categóricas: 1 | Alta cardinalidad: 2


# Persistir salidas de 00

In [15]:
# Output locations, all under INTERIM_DIR.
CLEAN_FULL = INTERIM_DIR / "abt_PE2020_clean.csv"
CLEAN_MIN = INTERIM_DIR / "abt_PE2020_clean_min.csv"
SAMPLE_PATH = INTERIM_DIR / "abt_PE2020_sample.csv"

# Full table: `abt` as left by the previous cells, with every column kept.
abt.to_csv(CLEAN_FULL, index=False)

# Minimal table for notebook 01: after dropping high-NA and constant columns.
abt_clean.to_csv(CLEAN_MIN, index=False)

# Reproducible sample (at most 1000 rows) of the full table, for quick tests.
tamano_muestra = min(1000, len(abt))
muestra = abt.sample(n=tamano_muestra, random_state=42)
muestra.to_csv(SAMPLE_PATH, index=False)

logger.info(f"Guardado: {CLEAN_FULL.name}, {CLEAN_MIN.name}, {SAMPLE_PATH.name}")
# Last expression of the cell: show the three output paths.
(CLEAN_FULL, CLEAN_MIN, SAMPLE_PATH)


2025-11-03 19:19:30 | INFO | creacion_abt | Guardado: abt_PE2020_clean.csv, abt_PE2020_clean_min.csv, abt_PE2020_sample.csv
2025-11-03 19:19:30 | INFO | creacion_abt | Guardado: abt_PE2020_clean.csv, abt_PE2020_clean_min.csv, abt_PE2020_sample.csv


(WindowsPath('C:/Users/PC RYU/Documents/Galileo/Maestria/Product Development/repo_proyecto_pe/data/interim/abt_PE2020_clean.csv'),
 WindowsPath('C:/Users/PC RYU/Documents/Galileo/Maestria/Product Development/repo_proyecto_pe/data/interim/abt_PE2020_clean_min.csv'),
 WindowsPath('C:/Users/PC RYU/Documents/Galileo/Maestria/Product Development/repo_proyecto_pe/data/interim/abt_PE2020_sample.csv'))