In [9]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Feature engineering optimizado y “future‑proof” para el proyecto Prometeo.
Genera ../data/processed/Pipeline_test1.csv
"""

import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.preprocessing import StandardScaler, LabelEncoder
import logging

# ------------------------------------------------------------------
# Configuración de registro
# ------------------------------------------------------------------
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s"
)

# ------------------------------------------------------------------
# Parámetros globales
# ------------------------------------------------------------------
DATA_RAW   = Path("../data/raw")
DATA_PROD  = Path("../data/processed")
DATA_PROD.mkdir(exist_ok=True, parents=True)

REFERENCE_DATE = pd.Timestamp("2024-01-01")

# ------------------------------------------------------------------
# 1. Carga de datos
# ------------------------------------------------------------------
transactions = pd.read_csv(DATA_RAW / "transactions.csv",
                           parse_dates=["date"])
demographics = pd.read_csv(DATA_RAW / "demographics.csv")
products     = pd.read_csv(DATA_RAW / "products.csv",
                           parse_dates=["contract_date"])
logging.info("Archivos cargados")

# ------------------------------------------------------------------
# 2. DEMOGRAPHICS – age_range_sturges
# ------------------------------------------------------------------
breaks  = np.linspace(18, 70, 9)
labels  = ["18–24", "25–31", "32–38", "39–45",
           "46–52", "53–59", "60–66", "67–70"]

demographics["age_range_sturges"] = pd.cut(
    demographics["age"],
    bins=breaks,
    labels=labels,
    right=True,
    include_lowest=True
)

# ------------------------------------------------------------------
# 3. PRODUCTS – flags, fechas y métricas
# ------------------------------------------------------------------
p = products.copy()
p["product_type"] = p["product_type"].replace("investment_account", "investment")

prod_agg = (
    p.sort_values(["user_id", "contract_date"])
     .groupby("user_id")
     .agg(
         primer_producto        = ("product_type", "first"),
         fecha_primer_producto  = ("contract_date", "first"),
         segundo_producto       = ("product_type", lambda s: s.iloc[1] if len(s) > 1 else "none"),
         fecha_segundo_producto = ("contract_date", lambda s: s.iloc[1] if len(s) > 1 else pd.NaT),
         checking_account       = ("product_type", lambda s: int("checking_account" in set(s))),
         savings_account        = ("product_type", lambda s: int("savings_account"  in set(s))),
         credit_card            = ("product_type", lambda s: int("credit_card"      in set(s))),
         insurance              = ("product_type", lambda s: int("insurance"        in set(s))),
         investment             = ("product_type", lambda s: int("investment"       in set(s)))
     )
     .reset_index()
)

prod_agg["fecha_segundo_producto"] = prod_agg["fecha_segundo_producto"].fillna(
    pd.Timestamp("1900-01-01")
)

flag_cols = ["checking_account", "savings_account", "credit_card",
             "insurance", "investment"]

prod_agg["dias_entre_productos"] = (
    (prod_agg["fecha_segundo_producto"] - prod_agg["fecha_primer_producto"])
    .dt.days.fillna(0)
)
prod_agg["antiguedad_cliente"] = (
    (REFERENCE_DATE - prod_agg["fecha_primer_producto"]).dt.days
)
prod_agg["numero_productos"] = prod_agg[flag_cols].sum(axis=1)

def _combo(row):
    activos = [c for c in flag_cols if row[c] == 1]
    return "sin_productos" if not activos else " + ".join(activos)

prod_agg["combinacion_productos"] = prod_agg.apply(_combo, axis=1)

mapeo = {
    "checking_account"                    : "checking_account",
    "savings_account"                     : "savings_account",
    "savings_account + credit_card"       : "credit_card + savings_account",
    "savings_account + insurance"         : "insurance + savings_account",
    "checking_account + insurance"        : "checking_account + insurance",
    "checking_account + credit_card"      : "checking_account + credit_card",
    "checking_account + investment"       : "checking_account + investment",
    "savings_account + investment"        : "investment + savings_account"
}
prod_agg["combinacion_productos"] = prod_agg["combinacion_productos"].map(
    lambda x: mapeo.get(x, "OTRA_COMBINACION")
)

# ------------------------------------------------------------------
# 4. TRANSACTIONS – estadísticas por usuario
# ------------------------------------------------------------------
cat_counts = (
    transactions.pivot_table(
        index="user_id",
        columns="merchant_category",
        values="transaction_id",
        aggfunc="count",
        fill_value=0
    )
    .add_suffix("_count")
    .reset_index()
)

tx_basic = (
    transactions.groupby("user_id")
      .agg(
          total_transacciones        = ("transaction_id", "count"),
          monto_promedio_transaccion = ("amount", "mean"),
          total_spend                = ("amount", "sum"),
          n_meses_activos            = ("date", lambda s: s.dt.to_period("M").nunique()),
          recencia_transaccion       = ("date", lambda s: (REFERENCE_DATE - s.max()).days)
      )
      .reset_index()
)

cnt_cols = [c for c in cat_counts.columns if c.endswith("_count")]
cat_counts["categoria_favorita"] = (
    cat_counts.set_index("user_id")[cnt_cols]
              .idxmax(axis=1)
              .str.replace("_count", "")
              .values
)

sp_by_cat = (transactions
             .groupby(["user_id", "merchant_category"])["amount"]
             .sum()
             .unstack(fill_value=0))
fav_monto = sp_by_cat.idxmax(axis=1)
sp_fav    = sp_by_cat.max(axis=1)
hhi       = (sp_by_cat.div(sp_by_cat.sum(axis=1), axis=0)**2).sum(axis=1)

tx_money = (
    pd.DataFrame({
        "user_id"                  : sp_by_cat.index,
        "categoria_favorita_monto" : fav_monto,
        "total_spend_fav"          : sp_fav,
        "hhi"                      : hhi
    })
    .reset_index(drop=True)
)

tx_month = transactions.copy()
tx_month["mes"] = tx_month["date"].dt.to_period("M").dt.to_timestamp()

month_agg = (tx_month
    .groupby(["user_id", "mes"])
    .agg(
        monto_mes         = ("amount", "sum"),
        transacciones_mes = ("transaction_id", "count")
    )
    .reset_index()
)

m_best = (month_agg
          .sort_values(["user_id", "transacciones_mes"], ascending=[True, False])
          .drop_duplicates("user_id")
          .loc[:, ["user_id", "mes"]]
          .rename(columns={"mes": "mes_mas_compras"}))

m_best_amt = (month_agg
              .sort_values(["user_id", "monto_mes"], ascending=[True, False])
              .drop_duplicates("user_id")
              .loc[:, ["user_id", "mes"]]
              .rename(columns={"mes": "mes_mayor_monto"}))

month_agg.sort_values(["user_id", "mes"], inplace=True)
month_agg["diff"] = month_agg.groupby("user_id")["monto_mes"].diff()
month_agg["pct"]  = month_agg.groupby("user_id")["monto_mes"].pct_change()

diffs = (month_agg.groupby("user_id")
         .agg(
             variacion_mensual_promedio     = ("diff", "mean"),
             variacion_mensual_promedio_pct = ("pct",  "mean")
         )
         .reset_index())

tx_full = (tx_basic
    .merge(cat_counts, on="user_id", how="left")
    .merge(tx_money,   on="user_id", how="left")
    .merge(m_best,     on="user_id", how="left")
    .merge(m_best_amt, on="user_id", how="left")
    .merge(diffs,      on="user_id", how="left")
)

tx_full["share_fav"] = tx_full["total_spend_fav"] / tx_full["total_spend"]
logging.info("Transacciones agregadas")

# ------------------------------------------------------------------
# 5. DATASET FINAL y transformaciones
# ------------------------------------------------------------------
df = (demographics
      .merge(prod_agg, on="user_id", how="left")
      .merge(tx_full,  on="user_id", how="left"))

# Variables a descartar (incluye user_id y fecha_segundo_producto_ts)
drop_cols = [
    "fecha_segundo_producto_ts",
    "total_spend", "monto_promedio_transaccion", "monto_promedio_mensual",
    "hhi", "share_fav", "total_transacciones", "categoria_favorita"
]
df.drop(columns=[c for c in drop_cols if c in df.columns], inplace=True)

# Fechas -> timestamp
date_cols = ["fecha_primer_producto", "fecha_segundo_producto",
             "mes_mas_compras", "mes_mayor_monto"]
for c in date_cols:
    if c in df.columns:
        df[c] = pd.to_datetime(df[c], errors="coerce")
        df[f"{c}_ts"] = df[c].astype("int64")
        df.drop(columns=c, inplace=True)

# Imputaciones
df["dias_entre_productos"]           = df["dias_entre_productos"].fillna(0)
df["variacion_mensual_promedio"]     = df["variacion_mensual_promedio"].fillna(0)
df["variacion_mensual_promedio_pct"] = df["variacion_mensual_promedio_pct"].fillna(0)

# Escalado
scaler = StandardScaler()
to_scale = [
    "age", "dias_entre_productos", "antiguedad_cliente",
    "numero_productos", "recencia_transaccion",
    "variacion_mensual_promedio", "variacion_mensual_promedio_pct"
]
present = [c for c in to_scale if c in df.columns]
df[present] = scaler.fit_transform(df[present])

# Log1p
if "total_spend_fav" in df.columns:
    df["total_spend_fav"] = np.log1p(df["total_spend_fav"])

# Flags a bool
for b in flag_cols:
    if b in df.columns:
        df[b] = df[b].astype(bool)

# Label encoding
cat_vars = [
    "income_range", "risk_profile", "occupation", "age_range_sturges",
    "primer_producto", "segundo_producto", "combinacion_productos",
    "categoria_favorita_monto"
]
for c in cat_vars:
    if c in df.columns:
        df[c] = LabelEncoder().fit_transform(df[c].astype(str))

df.drop(columns=["combinacion_productos"], errors="ignore", inplace=True)

# ------------------------------------------------------------------
# 6. Guardar
# ------------------------------------------------------------------
outfile = DATA_PROD / "Pipeline_test1.csv"
df.to_csv(outfile, index=False)
logging.info(f"Dataset final guardado: {outfile}  ({df.shape[0]} filas, {df.shape[1]} columnas)")

2025-04-17 21:44:58,044 - INFO - Archivos cargados
2025-04-17 21:44:58,123 - INFO - Transacciones agregadas
2025-04-17 21:44:58,139 - INFO - Dataset final guardado: ../data/processed/Pipeline_test1.csv  (100 filas, 33 columnas)
