# Feature engineering Random Forest

In [11]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

In [12]:
# Columns removed right after loading, when they are present in the frame.
cols_a_eliminar = ["Unnamed: 0", "user_id"]

# Columns that are passed through pd.to_datetime in the transformation step.
fechas = [
    "fecha_primer_producto",
    "fecha_segundo_producto",
    "mes_mas_compras.x",
    "mes_mayor_monto.x",
]

# Columns whose 1/0 values are mapped to booleans in the transformation step.
binarias_explicit = [
    "checking_account",
    "savings_account",
    "credit_card",
    "investment",
]

In [13]:
# Load the interim feature table and discard the bookkeeping columns listed
# in cols_a_eliminar; errors="ignore" skips any of them that are absent.
df = (
    pd.read_csv("../data/interim/df_features_general.csv")
    .drop(columns=cols_a_eliminar, errors="ignore")
)

## Transformaciones

In [14]:
# NOTE: every step below overwrites columns of df in place, so re-running this
# cell without reloading df applies the transformations a second time
# (e.g. log1p twice, label codes re-encoded from their string form).

# Parse the date columns; values that cannot be parsed become NaT
# (errors='coerce').
# NOTE(review): the "mes_*" column names suggest month values — if they hold
# plain month numbers, to_datetime would not read them as calendar months;
# confirm the raw format.
for col in fechas:
    if col in df.columns:
        df[col] = pd.to_datetime(df[col], errors='coerce')

# log1p transform of total_spend_fav.
if 'total_spend_fav' in df.columns:
    df['total_spend_fav'] = np.log1p(df['total_spend_fav'])

# Binary 1/0 flags to booleans.
# map() leaves anything that is not 1 or 0 (including missing values) as NaN,
# and a plain astype(bool) would silently turn those NaN into True. Keep the
# numpy bool dtype when every value was mapped, and fall back to the nullable
# 'boolean' dtype so that missing entries stay missing instead of becoming True.
for col in binarias_explicit:
    if col in df.columns:
        flags = df[col].map({1: True, 0: False})
        if flags.notna().all():
            df[col] = flags.astype(bool)
        else:
            df[col] = flags.astype('boolean')

# Categorical columns, each replaced by integer codes from its own
# LabelEncoder fitted on that column.
# NOTE(review): values are stringified first, so missing entries presumably
# end up as their own category rather than staying missing — confirm this is
# intended.
categoricas = [
    'income_range', 'risk_profile', 'occupation', 'age_range_sturges',
    'primer_producto', 'segundo_producto', 'combinacion_productos',
    'categoria_favorita_monto'
]
for col in categoricas:
    if col in df.columns:
        df[col] = LabelEncoder().fit_transform(df[col].astype(str))

## Data sets

In [15]:
# Split the frame: 'insurance' is the target, every other column is a feature.
y = df['insurance']
X = df.drop(columns='insurance')

# Write both pieces to the processed-data folder, leaving the index out.
X.to_csv(path_or_buf="../data/processed/X_rf_reduced.csv", index=False)
y.to_csv(path_or_buf="../data/processed/y_rf_reduced.csv", index=False)