# Se agrupará el dataset por cliente y por código de transacción (`mcc`), además de año y mes, para cambiar el enfoque del modelo y que este prediga el gasto de cada cliente por categoría

In [1]:
import pandas as pd
import numpy as np

# Load the raw transaction dataset.
# The location is no longer a user-specific absolute path: it defaults to
# ~/Downloads/dataSet_whole.csv (the same place as before for the original
# author) and can be overridden with the DATASET_PATH environment variable.
import os
from pathlib import Path

DATA_PATH = Path(
    os.environ.get("DATASET_PATH", Path.home() / "Downloads" / "dataSet_whole.csv")
)
df = pd.read_csv(DATA_PATH)

In [2]:
# Render floats with two decimals in every DataFrame/Series repr.
pd.options.display.float_format = '{:.2f}'.format

In [3]:
# Bring the amount back to its original scale by applying expm1 to log_amount.
df["amount"] = np.expm1(df["log_amount"])

# One output row per client, category code, year and month.
group_keys = ["client_id", "mcc", "transaction_year", "transaction_month"]

# Named aggregations: output column -> (source column, reducer).
# Insertion order defines the column order of the resulting frame.
aggregations = {
    # Total amount and per-group transaction summary
    "total_amount": ("amount", "sum"),
    "num_transactions": ("amount", "count"),
    "avg_transaction": ("amount", "mean"),
    "std_transaction": ("amount", "std"),
    "max_transaction": ("amount", "max"),
    "min_transaction": ("amount", "min"),
    # Time-of-day / day-of-week behaviour
    "avg_hour": ("transaction_hour", "mean"),
    "max_hour": ("transaction_hour", "max"),
    "min_hour": ("transaction_hour", "min"),
    "weekday_freq": ("transaction_weekday", "nunique"),
    # Client financial data
    "credit_score": ("credit_score", "mean"),
    "total_debt": ("total_debt", "mean"),
    "yearly_income": ("yearly_income", "mean"),
    "per_capita_income": ("per_capita_income", "mean"),
    # Card data
    "has_chip": ("has_chip", "max"),
    "num_cards_issued": ("num_cards_issued", "mean"),
    "credit_limit": ("credit_limit", "mean"),
    "card_type_credit": ("card_type_Credit", "mean"),
    "card_type_debit": ("card_type_Debit", "mean"),
    # Demographics
    "current_age": ("current_age", "mean"),
    "gender_female": ("gender_Female", "mean"),
    "gender_male": ("gender_Male", "mean"),
    # Location
    "latitude": ("latitude", "mean"),
    "longitude": ("longitude", "mean"),
    # Merchant city / state frequency
    "merchant_city_freq": ("merchant_city_freq", "mean"),
    "merchant_state_freq": ("merchant_state_freq", "mean"),
    # Transaction errors
    "errors_freq": ("errors_freq", "mean"),
}

agg_df = (
    df.groupby(group_keys)
    .agg(**aggregations)
    .reset_index()
    # Zero-fill NaN left by sparse groups (e.g. the std of a group that holds
    # a single transaction).
    .fillna(0)
    # Target variable: log1p of the monthly total per client and category.
    .assign(log_total_amount=lambda frame: np.log1p(frame["total_amount"]))
)

# Quick look at the result: first rows plus the final shape.
print(
    "✅ Dataset agrupado listo para entrenamiento:",
    agg_df.head(),
    f"\n📐 Shape final del dataset: {agg_df.shape}",
    sep="\n",
)




✅ Dataset agrupado listo para entrenamiento:
   client_id   mcc  transaction_year  transaction_month  total_amount  \
0          0  1711              2012                 10         62.99   
1          0  1711              2014                  2         56.04   
2          0  1711              2014                  4         51.65   
3          0  1711              2014                  7        119.79   
4          0  3000              2015                  6        309.57   

   num_transactions  avg_transaction  std_transaction  max_transaction  \
0                 1            62.99             0.00            62.99   
1                 1            56.04             0.00            56.04   
2                 1            51.65             0.00            51.65   
3                 2            59.89             7.16            64.96   
4                 1           309.57             0.00           309.57   

   min_transaction  ...  card_type_debit  current_age  gender_female  \

In [4]:
# Display the aggregated frame; the notebook renders a truncated head/tail preview.
agg_df

Unnamed: 0,client_id,mcc,transaction_year,transaction_month,total_amount,num_transactions,avg_transaction,std_transaction,max_transaction,min_transaction,...,card_type_debit,current_age,gender_female,gender_male,latitude,longitude,merchant_city_freq,merchant_state_freq,errors_freq,log_total_amount
0,0,1711,2012,10,62.99,1,62.99,0.00,62.99,62.99,...,1.00,33.00,0.00,1.00,43.59,-70.33,0.05,0.32,98.41,4.16
1,0,1711,2014,2,56.04,1,56.04,0.00,56.04,56.04,...,0.00,33.00,0.00,1.00,43.59,-70.33,0.05,0.32,98.41,4.04
2,0,1711,2014,4,51.65,1,51.65,0.00,51.65,51.65,...,0.00,33.00,0.00,1.00,43.59,-70.33,0.05,0.32,98.41,3.96
3,0,1711,2014,7,119.79,2,59.89,7.16,64.96,54.83,...,0.50,33.00,0.00,1.00,43.59,-70.33,0.05,0.32,98.41,4.79
4,0,3000,2015,6,309.57,1,309.57,0.00,309.57,309.57,...,1.00,33.00,0.00,1.00,43.59,-70.33,0.05,1.03,98.41,5.74
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2819015,1998,8043,2017,5,109.45,1,109.45,0.00,109.45,109.45,...,1.00,65.00,0.00,1.00,37.56,-122.37,0.02,10.73,98.41,4.70
2819016,1998,8049,2014,7,69.71,1,69.71,0.00,69.71,69.71,...,0.00,65.00,0.00,1.00,37.56,-122.37,0.02,10.73,98.41,4.26
2819017,1998,8099,2016,5,5.13,1,5.13,0.00,5.13,5.13,...,1.00,65.00,0.00,1.00,37.56,-122.37,0.02,10.73,98.41,1.81
2819018,1998,8099,2017,12,4.31,1,4.31,0.00,4.31,4.31,...,0.00,65.00,0.00,1.00,37.56,-122.37,0.02,10.73,98.41,1.67


In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Load the aggregated dataset. Prefer the CSV export when it is present;
# otherwise reuse the `agg_df` already built in memory. No visible cell of this
# notebook writes "dataSet_agrupado.csv", so an unconditional read fails on a
# fresh Restart & Run All.
from pathlib import Path

agg_csv_path = Path("dataSet_agrupado.csv")
if agg_csv_path.exists():
    agg_df = pd.read_csv(agg_csv_path)
elif "agg_df" not in globals():
    # Same exception type the unconditional read_csv would have raised.
    raise FileNotFoundError(
        f"{agg_csv_path} not found and no in-memory agg_df is available; "
        "run the aggregation cell first or export the grouped dataset."
    )

# Make sure no NaN reaches the estimator.
agg_df = agg_df.fillna(0)

# Split features and target.
# NOTE: this feature set is a leaky baseline. avg_transaction and
# num_transactions stay in X, and mean * count reproduces total_amount, so the
# near-perfect score printed below reflects target leakage, not predictive
# skill.
X = agg_df.drop(columns=[
    "total_amount", "log_total_amount", "client_id", "mcc", "transaction_year", "transaction_month"
])
y = agg_df["log_total_amount"]

# Train/test split (row-wise, 80/20, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Build and train the Random Forest model.
model = RandomForestRegressor(
    n_estimators=100,
    max_depth=None,
    n_jobs=-1,
    random_state=42
)

print("🚀 Entrenando modelo Random Forest...")
model.fit(X_train, y_train)

# Predict on the held-out rows (log scale).
y_pred = model.predict(X_test)

# MAE and RMSE are reported on the original scale (expm1 undoes the log1p
# target); R² is computed on the log scale the model was trained on.
y_true_original = np.expm1(y_test)
y_pred_original = np.expm1(y_pred)

mae = mean_absolute_error(y_true_original, y_pred_original)
rmse = np.sqrt(mean_squared_error(y_true_original, y_pred_original))
r2 = r2_score(y_test, y_pred)

print("\n📊 Evaluación del modelo:")
print(f"MAE (escala original): {mae:.2f}")
print(f"RMSE (escala original): {rmse:.2f}")
print(f"R² (log escala): {r2:.4f}")


🚀 Entrenando modelo Random Forest...

📊 Evaluación del modelo:
MAE (escala original): 0.27
RMSE (escala original): 5.86
R² (log escala): 1.0000


In [5]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Work on the aggregated frame already in scope (nothing is read from disk
# here) and zero-fill any NaN before modelling.
agg_df = agg_df.fillna(0)

# Columns kept out of the feature matrix: the target in both scales, the
# grouping keys, and the amount statistics computed from the same transactions
# as the target.
target_cols = ["total_amount", "log_total_amount"]
group_key_cols = ["client_id", "mcc", "transaction_year", "transaction_month"]
amount_stat_cols = [
    "avg_transaction", "num_transactions",
    "max_transaction", "min_transaction", "std_transaction",
]
cols_to_remove = target_cols + group_key_cols + amount_stat_cols

# NOTE(review): the remaining hour / weekday / merchant / error aggregates are
# computed from the same period's transactions as the target; confirm they
# are available at prediction time.
y = agg_df["log_total_amount"]
X = agg_df.drop(columns=cols_to_remove)

# Row-wise 80/20 split with a fixed seed.
# NOTE(review): rows of the same client can land on both sides; verify this
# matches the intended evaluation scenario.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Random Forest with fully grown trees, using every available core.
rf_params = {"n_estimators": 100, "max_depth": None, "n_jobs": -1, "random_state": 42}
model = RandomForestRegressor(**rf_params)

print("🚀 Entrenando modelo sin columnas que inducen sobreajuste...")
model.fit(X_train, y_train)

# Predictions on the held-out rows (log scale).
y_pred = model.predict(X_test)

# MAE / RMSE on the original scale (expm1 undoes the log1p target); R² stays
# on the log scale the model was trained on.
y_true_original, y_pred_original = np.expm1(y_test), np.expm1(y_pred)

mae = mean_absolute_error(y_true_original, y_pred_original)
rmse = np.sqrt(mean_squared_error(y_true_original, y_pred_original))
r2 = r2_score(y_test, y_pred)

print(
    "\n📊 Evaluación del modelo sin sobreajuste:",
    f"MAE (escala original): {mae:.2f}",
    f"RMSE (escala original): {rmse:.2f}",
    f"R² (log escala): {r2:.4f}",
    sep="\n",
)


🚀 Entrenando modelo sin columnas que inducen sobreajuste...

📊 Evaluación del modelo sin sobreajuste:
MAE (escala original): 80.73
RMSE (escala original): 173.93
R² (log escala): 0.7103
