# Fase 2: Ingeniería de Características (Feature Engineering)
# Marketing Campaign Response Prediction

---

## Objetivo

Este notebook implementa el pipeline completo de ingeniería de características:

1. **Limpieza de datos**: Manejo de nulos, outliers, inconsistencias
2. **Creación de features derivados**: Nuevas variables calculadas
3. **Transformación de variables**: Escalado y encoding
4. **Split de datos**: Train/Test estratificado
5. **Pipeline de preprocesamiento**: Listo para modelado

---

In [None]:
# Manipulación de datos
import pandas as pd
import numpy as np
import json

# Machine Learning - Scikit-learn
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, RobustScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

# Model persistence
import joblib

# Configuración
import warnings
# Silence library warnings so the notebook output stays readable.
# NOTE(review): 'ignore' hides every warning category, deprecations included.
warnings.filterwarnings('ignore')

# Show every column and render floats with two decimals when displaying frames.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.float_format', '{:.2f}'.format),
):
    pd.set_option(option_name, option_value)

print('‚úÖ Librer√≠as importadas correctamente')

## 2. Carga de Datos

In [None]:
# Load the project configuration.
# The encoding is pinned to UTF-8 so the file is decoded the same way on
# every platform instead of depending on the locale's default encoding
# (which would garble non-ASCII values on some systems).
with open('../../config.json', 'r', encoding='utf-8') as f:
    config = json.load(f)

print('Configuraci√≥n cargada')

In [None]:
# Load the dataset.
# Preferred source: the CSV with derived features produced by Phase 1.
# Fallback (file not found): the raw CSV named by config["data_path"],
# which is read as semicolon-separated.
try:
    df = pd.read_csv('../../data_with_features.csv')
except FileNotFoundError:
    data_path = f'../../{config["data_path"]}'
    df = pd.read_csv(data_path, sep=';')
    print('‚úÖ Dataset cargado desde Base_de_datos.csv')
else:
    print('‚úÖ Dataset con features derivados cargado desde data_with_features.csv')

n_rows, n_cols = df.shape
print(f'\nDimensiones: {n_rows} filas √ó {n_cols} columnas')
df.head()

---
## 3. LIMPIEZA DE DATOS

In [None]:
# Section banner for the data-cleaning stage, plus the starting shape.
separator = '=' * 80
print('\n' + separator)
print('LIMPIEZA DE DATOS')
print(separator)
print(f'\nDimensiones iniciales: {df.shape}')

In [None]:
# Drop columns that are not used for modelling; only the ones actually
# present in df are removed, so the cell works with either input file.
irrelevant_candidates = ('ID', 'Z_CostContact', 'Z_Revenue')
cols_to_drop = [name for name in irrelevant_candidates if name in df.columns]

if not cols_to_drop:
    # Nothing to remove: still work on a copy so df itself stays untouched.
    df_clean = df.copy()
    print('\n‚úÖ No hay variables irrelevantes para eliminar')
else:
    df_clean = df.drop(columns=cols_to_drop)
    print(f'\n‚úÖ Variables eliminadas: {cols_to_drop}')

print(f'Dimensiones despu√©s de limpieza: {df_clean.shape}')

In [None]:
# Convert data types.
if 'Dt_Customer' in df_clean.columns:
    raw_dates = df_clean['Dt_Customer']
    parsed_dates = pd.to_datetime(raw_dates, format='%Y-%m-%d', errors='coerce')
    # errors='coerce' turns every value that does not match the format into
    # NaT without raising. Count the values lost that way and report them,
    # so a format mismatch does not silently produce an empty date column
    # (and, downstream, an empty CustomerTenure feature).
    n_unparsed = int((parsed_dates.isna() & raw_dates.notna()).sum())
    df_clean['Dt_Customer'] = parsed_dates
    print('‚úÖ Dt_Customer convertido a datetime')
    if n_unparsed:
        print(f'ADVERTENCIA: {n_unparsed} valores de Dt_Customer no coinciden '
              f'con el formato %Y-%m-%d y quedaron como NaT')

# Store the categorical variables with the pandas 'category' dtype.
for categorical_col in ('Education', 'Marital_Status'):
    if categorical_col in df_clean.columns:
        df_clean[categorical_col] = df_clean[categorical_col].astype('category')

print('‚úÖ Tipos de datos convertidos')

In [None]:
# Handle missing values: fill Income gaps with the column median.
# NOTE(review): the median is computed on the full dataset, i.e. before the
# train/test split performed later in this notebook — confirm this is
# acceptable, since the preprocessing pipeline also imputes with the median.
if 'Income' in df_clean.columns:
    income = df_clean['Income']
    nulos_income = income.isnull().sum()
    if nulos_income == 0:
        print('\n‚úÖ Income no tiene valores nulos')
    else:
        pct_nulos = nulos_income / len(df_clean) * 100
        print(f'\n‚ö†Ô∏è Valores nulos en Income: {nulos_income} ({pct_nulos:.2f}%)')
        median_income = income.median()
        df_clean['Income'] = income.fillna(median_income)
        print(f'‚úÖ Imputados con mediana: {median_income:.2f}')

In [None]:
# Collapse the raw Education labels into four broader groups.
if 'Education' in df_clean.columns:
    education_mapping = {
        '2n Cycle': 'Undergraduate',
        'Basic': 'Basic',
        'Graduation': 'Graduate',
        'Master': 'Postgraduate',
        'PhD': 'Postgraduate',
    }
    # Labels missing from the mapping keep their current value instead of
    # becoming NaN. That makes the cell safe to re-run and safe on input that
    # already carries the unified labels ('Graduate', 'Postgraduate', ...).
    # The column is cast to object first so the lookup runs value by value,
    # independent of how pandas maps categorical columns.
    df_clean['Education'] = (
        df_clean['Education']
        .astype(object)
        .map(lambda label: education_mapping.get(label, label))
    )
    print('\n‚úÖ Education unificado')
    print(df_clean['Education'].value_counts())

In [None]:
# Collapse the raw Marital_Status labels into Single / Relationship / Other.
if 'Marital_Status' in df_clean.columns:
    marital_mapping = {
        'Single': 'Single',
        'Together': 'Relationship',
        'Married': 'Relationship',
        'Divorced': 'Single',
        'Widow': 'Single',
        'Alone': 'Single',
        'Absurd': 'Other',
        'YOLO': 'Other',
        # Identity entries for the unified labels: without them a second run
        # of this cell (or input that is already unified) would send every
        # 'Relationship' row to the 'Other' fallback below.
        'Relationship': 'Relationship',
        'Other': 'Other',
    }
    # Any label not listed above falls back to 'Other'.
    df_clean['Marital_Status'] = df_clean['Marital_Status'].map(
        lambda x: marital_mapping.get(x, 'Other')
    )
    print('\n‚úÖ Marital_Status unificado')
    print(df_clean['Marital_Status'].value_counts())

---
## 4. CREACIÓN DE FEATURES DERIVADOS

In [None]:
# Section banner for the derived-feature stage.
separator = '=' * 80
print('\n' + separator)
print('CREACI√ìN DE FEATURES DERIVADOS')
print(separator)

In [None]:
# Spending, purchase, campaign and household features.
# Each feature is created only when df_clean does not already have it, so
# input that already contains the derived columns is left as is.
available = set(df_clean.columns)

# Source columns for the aggregates, restricted to those present in df_clean.
gastos_cols = [
    c for c in ('MntWines', 'MntFruits', 'MntMeatProducts',
                'MntFishProducts', 'MntSweetProducts', 'MntGoldProds')
    if c in available
]
purchases_cols = [
    c for c in ('NumDealsPurchases', 'NumWebPurchases',
                'NumCatalogPurchases', 'NumStorePurchases')
    if c in available
]
campaigns_cols = [
    c for c in ('AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3',
                'AcceptedCmp4', 'AcceptedCmp5')
    if c in available
]

# Row-wise totals.
if gastos_cols and 'TotalSpent' not in df_clean.columns:
    df_clean['TotalSpent'] = df_clean[gastos_cols].sum(axis=1)
    print('‚úÖ TotalSpent creado')

if purchases_cols and 'TotalPurchases' not in df_clean.columns:
    df_clean['TotalPurchases'] = df_clean[purchases_cols].sum(axis=1)
    print('‚úÖ TotalPurchases creado')

if 'AvgPurchaseValue' not in df_clean.columns:
    # The +1 in the denominator avoids dividing by zero purchases.
    purchases_plus_one = df_clean['TotalPurchases'] + 1
    df_clean['AvgPurchaseValue'] = df_clean['TotalSpent'] / purchases_plus_one
    print('‚úÖ AvgPurchaseValue creado')

if campaigns_cols and 'TotalCampaignsAccepted' not in df_clean.columns:
    df_clean['TotalCampaignsAccepted'] = df_clean[campaigns_cols].sum(axis=1)
    print('‚úÖ TotalCampaignsAccepted creado')

# Household composition, derived from Kidhome + Teenhome.
if 'HasChildren' not in df_clean.columns:
    children_at_home = df_clean['Kidhome'] + df_clean['Teenhome']
    df_clean['HasChildren'] = (children_at_home > 0).astype(int)
    print('‚úÖ HasChildren creado')

if 'TotalChildren' not in df_clean.columns:
    df_clean['TotalChildren'] = df_clean['Kidhome'] + df_clean['Teenhome']
    print('‚úÖ TotalChildren creado')

In [None]:
# Temporal and ratio features.
# Each feature is created only when df_clean does not already have it.
reference_year = 2014  # year from which Year_Birth is subtracted to get Age

if 'Age' not in df_clean.columns and 'Year_Birth' in df_clean.columns:
    df_clean['Age'] = reference_year - df_clean['Year_Birth']
    print('‚úÖ Age creado')

if 'CustomerTenure' not in df_clean.columns and 'Dt_Customer' in df_clean.columns:
    # Tenure in days, measured against the most recent enrolment date.
    reference_date = df_clean['Dt_Customer'].max()
    df_clean['CustomerTenure'] = (reference_date - df_clean['Dt_Customer']).dt.days
    print('‚úÖ CustomerTenure creado')

if 'WebEngagement' not in df_clean.columns:
    # The +1 in the denominator avoids dividing by zero visits.
    df_clean['WebEngagement'] = df_clean['NumWebPurchases'] / (df_clean['NumWebVisitsMonth'] + 1)
    print('‚úÖ WebEngagement creado')

if 'IncomePerPerson' not in df_clean.columns:
    df_clean['IncomePerPerson'] = df_clean['Income'] / (1 + df_clean['TotalChildren'])
    print('‚úÖ IncomePerPerson creado')

if 'SpendingRatio' not in df_clean.columns:
    # The +1 in the denominator avoids dividing by a zero income.
    df_clean['SpendingRatio'] = df_clean['TotalSpent'] / (df_clean['Income'] + 1)
    print('‚úÖ SpendingRatio creado')

# Report how many derived features df_clean really contains, not a fixed
# number: the notebook defines eleven, and some may be absent depending on
# which source columns the input has.
derived_feature_names = [
    'TotalSpent', 'TotalPurchases', 'AvgPurchaseValue',
    'TotalCampaignsAccepted', 'HasChildren', 'TotalChildren',
    'Age', 'CustomerTenure', 'WebEngagement',
    'IncomePerPerson', 'SpendingRatio',
]
n_derived = sum(name in df_clean.columns for name in derived_feature_names)

print(f'\n‚úÖ Total de features derivados: {n_derived}')
print(f'Dimensiones finales: {df_clean.shape}')

---
## 5. PREPARACIÓN PARA MODELADO

In [None]:
# Split the cleaned frame into the target vector y and the feature matrix X.
target_col = 'Response'
y = df_clean[target_col]

# Columns left out of X besides the target; only those present are dropped.
cols_to_drop_model = [
    name for name in ('Dt_Customer', 'Year_Birth')
    if name in df_clean.columns and name != target_col
]
X = df_clean.drop(columns=[target_col] + cols_to_drop_model)

print(f'Dimensiones de X: {X.shape}')
print(f'Dimensiones de y: {y.shape}')

In [None]:
# Identify variable types.
# 'number' selects every numeric dtype (int32, float32, uint8, nullable
# integers, ...), not only int64/float64/int8, so a numeric column with a
# different width is not left out of the numeric list by accident.
numeric_features = X.select_dtypes(include='number').columns.tolist()
categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()

# Columns in neither list are not handed to any transformer; list them
# explicitly so they do not vanish from the model input unnoticed.
assigned_features = set(numeric_features) | set(categorical_features)
unassigned_features = [col for col in X.columns if col not in assigned_features]
if unassigned_features:
    print(f'ADVERTENCIA: columnas fuera del preprocessor: {unassigned_features}')

print(f'Variables num√©ricas: {len(numeric_features)}')
print(f'Variables categ√≥ricas: {len(categorical_features)}')
print(f'\nCateg√≥ricas: {categorical_features}')

---
## 6. CREACIÓN DE PIPELINE DE PREPROCESAMIENTO

In [None]:
# Numeric pipeline: median imputation followed by scaling.
# use_robust_scaler switches between RobustScaler and StandardScaler.
use_robust_scaler = True
scaler_cls = RobustScaler if use_robust_scaler else StandardScaler

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', scaler_cls()),
])

print('‚úÖ Pipeline num√©rico creado')
print(f'   Scaler: {scaler_cls.__name__}')

In [None]:
# Categorical pipeline: fill gaps with the literal 'missing', then one-hot
# encode into a dense array, dropping the first level of each variable.
# NOTE(review): with drop='first' plus handle_unknown='ignore', a category
# unseen during fit is presumably encoded as all zeros, i.e. the same as the
# dropped level — confirm against the scikit-learn OneHotEncoder docs.
categorical_imputer = SimpleImputer(strategy='constant', fill_value='missing')
one_hot_encoder = OneHotEncoder(
    drop='first',
    sparse_output=False,
    handle_unknown='ignore',
)
categorical_transformer = Pipeline(steps=[
    ('imputer', categorical_imputer),
    ('onehot', one_hot_encoder),
])

print('‚úÖ Pipeline categ√≥rico creado')

In [None]:
# Combine both pipelines: each one is applied to its own list of columns,
# and columns in neither list are dropped (remainder='drop').
column_pipelines = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_pipelines, remainder='drop')

print('‚úÖ Preprocessor creado')

---
## 7. SPLIT DE DATOS (TRAIN/TEST)

In [None]:
# Split into train and test sets (before transforming), stratified on y so
# both parts keep the class proportions.
test_size = 0.2
random_state = 42

split_options = dict(test_size=test_size, random_state=random_state, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f'Test size: {test_size*100:.0f}%')
print('\nDimensiones:')
# The trailing space in the shorter labels keeps the shapes aligned.
for part_label, part in (
    ('X_train:', X_train),
    ('X_test: ', X_test),
    ('y_train:', y_train),
    ('y_test: ', y_test),
):
    print(f'  {part_label} {part.shape}')

In [None]:
# Show the class counts of the target in each split.
for header, labels in (
    ('Distribuci√≥n de clases en Train:', y_train),
    ('\nDistribuci√≥n de clases en Test:', y_test),
):
    print(header)
    print(labels.value_counts())

In [None]:
# Fit the preprocessor on the training split only, then apply the fitted
# transformation to the test split.
print('Transformando datos de entrenamiento...')
X_train_transformed = preprocessor.fit_transform(X_train)

print('Transformando datos de prueba...')
X_test_transformed = preprocessor.transform(X_test)

print('\n‚úÖ Datos transformados')
# The trailing space in the shorter label keeps the shapes aligned.
for matrix_label, matrix in (
    ('X_train_transformed:', X_train_transformed),
    ('X_test_transformed: ', X_test_transformed),
):
    print(f'   {matrix_label} {matrix.shape}')

In [None]:
# Get the feature names after transformation.
# The names come from the fitted preprocessor rather than being rebuilt by
# hand, so they always match the columns it actually outputs (for example
# when a transformer drops or expands columns during fit).
# ColumnTransformer prefixes each name with '<transformer>__' ('num__',
# 'cat__'); the prefix is removed to keep the plain column names.
feature_names = [
    prefixed_name.split('__', 1)[1]
    for prefixed_name in preprocessor.get_feature_names_out()
]

print(f'Total de features: {len(feature_names)}')

---
## 8. GUARDAR RESULTADOS

In [None]:
# Persist the fitted preprocessor with joblib.
preprocessor_path = '../../preprocessor.pkl'
joblib.dump(preprocessor, preprocessor_path)
print('‚úÖ Preprocessor guardado en: ' + preprocessor_path)

In [None]:
# Write the cleaned dataset (with derived features) to CSV, without the index.
df_processed_path = '../../data_processed.csv'
df_clean.to_csv(df_processed_path, index=False)
print('‚úÖ Dataset procesado guardado en: ' + df_processed_path)

In [None]:
# Save the transformed matrices and the targets (optional).
# The arrays are wrapped in DataFrames so the CSVs carry the feature names.
X_train_df = pd.DataFrame(X_train_transformed, columns=feature_names)
X_test_df = pd.DataFrame(X_test_transformed, columns=feature_names)

csv_outputs = {
    '../../X_train_transformed.csv': X_train_df,
    '../../X_test_transformed.csv': X_test_df,
    '../../y_train.csv': y_train,
    '../../y_test.csv': y_test,
}
for csv_path, table in csv_outputs.items():
    table.to_csv(csv_path, index=False)

print('‚úÖ Datos transformados guardados')

---
## 9. RESUMEN FINAL

In [None]:
# Final summary of Phase 2.
separator = '=' * 80

# Count the derived features that df_clean really contains instead of
# printing a fixed number.
derived_feature_names = [
    'TotalSpent', 'TotalPurchases', 'AvgPurchaseValue',
    'TotalCampaignsAccepted', 'HasChildren', 'TotalChildren',
    'Age', 'CustomerTenure', 'WebEngagement',
    'IncomePerPerson', 'SpendingRatio',
]
n_derived = sum(name in df_clean.columns for name in derived_feature_names)

print('\n' + separator)
print('RESUMEN FINAL - FASE 2')
print(separator)

print('\n‚úÖ FASE 2 COMPLETADA EXITOSAMENTE')
print(f'\nüìä Resumen:')
print(f'  1. Dataset procesado: {df_clean.shape[0]} registros, {df_clean.shape[1]} variables')
print(f'  2. Features derivados: {n_derived}')
print(f'  3. Variables num√©ricas: {len(numeric_features)}')
print(f'  4. Variables categ√≥ricas: {len(categorical_features)}')
print(f'  5. Features despu√©s de transformaci√≥n: {len(feature_names)}')
print(f'  6. Train set: {X_train_transformed.shape}')
print(f'  7. Test set: {X_test_transformed.shape}')

# Files written by the save cells of this notebook.
print('\nüìÅ Archivos generados:')
for generated_file in (
    'preprocessor.pkl',
    'data_processed.csv',
    'X_train_transformed.csv',
    'X_test_transformed.csv',
    'y_train.csv',
    'y_test.csv',
):
    print(f'  - {generated_file}')

print('\n' + separator)
print('üéâ FASE 2 COMPLETADA - LISTO PARA FASE 3')
print(separator)