# Análisis Exploratorio de Datos (EDA)
# Marketing Campaign Response Prediction

---

## Contexto del Negocio

Este análisis exploratorio tiene como objetivo entender el comportamiento de los clientes frente a campañas de marketing.

**Variable Objetivo**: `Response` (1 = acepta la oferta, 0 = rechaza)

---

In [None]:
# Standard library
import json
import warnings

# Data manipulation
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Statistics
from scipy import stats
from scipy.stats import chi2_contingency

# Plotting and display defaults applied to everything that runs afterwards
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', '{:.2f}'.format)
plt.rcParams['figure.figsize'] = (12, 6)

# NOTE: this silences every warning category from here on, not just a
# specific noisy one.
warnings.filterwarnings('ignore')

print('✅ Librerías importadas correctamente')

In [None]:
# Load the configuration. The encoding is explicit so that reading the JSON
# file does not depend on the platform's default locale.
with open('../../config.json', 'r', encoding='utf-8') as f:
    config = json.load(f)

# Load the dataset: its location comes from the "data_path" config entry,
# resolved against the same ../../ prefix used for config.json.
data_path = f'../../{config["data_path"]}'
df = pd.read_csv(data_path, sep=';')

print(f'Dataset cargado: {df.shape[0]} filas × {df.shape[1]} columnas')
# Last expression of the cell, so the notebook renders the first rows
df.head()

## 1. EXPLORACIÓN INICIAL

In [None]:
# Dataset information: df.info() prints its report directly
print('\n📋 Información del Dataset:\n')
df.info()

# Descriptive statistics; describe() is the last expression of the cell so
# the notebook renders the resulting table.
print('\n📊 Estadísticas Descriptivas:\n')
df.describe()

In [None]:
# Missing-value analysis: null count and percentage of rows per column
nulos = df.isnull().sum()
nulos_pct = (nulos / len(df) * 100).round(2)

nulos_df = pd.DataFrame({
    'Variable': nulos.index,
    'Nulos': nulos.values,
    '% Nulos': nulos_pct.values,
}).sort_values('Nulos', ascending=False)

# Keep only the columns that actually contain nulls
nulos_df = nulos_df[nulos_df['Nulos'] > 0]

if nulos_df.empty:
    print('✅ No hay valores nulos')
else:
    print('⚠️ VARIABLES CON VALORES NULOS:\n')
    print(nulos_df.to_string(index=False))

## 2. ANÁLISIS DE LA VARIABLE OBJETIVO

In [None]:
# Class balance of the target variable Response
print('🎯 ANÁLISIS DE RESPONSE\n')

# value_counts() orders by frequency, so the index is pinned to [0, 1]: the
# positional labels and colours below then always describe the right class,
# and a class that is absent from the data is reported as 0 instead of
# raising a KeyError.
response_counts = df['Response'].value_counts().reindex([0, 1], fill_value=0)
response_pct = response_counts / response_counts.sum() * 100

print(f'No acepta (0): {response_counts.loc[0]} ({response_pct.loc[0]:.2f}%)')
print(f'Acepta (1):    {response_counts.loc[1]} ({response_pct.loc[1]:.2f}%)')

# Shared by both charts; position i corresponds to class i
class_labels = ['No acepta (0)', 'Acepta (1)']
class_colors = ['#FF6B6B', '#4ECDC4']

# Visualization: absolute counts (bar) next to proportions (pie)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
response_counts.plot(kind='bar', ax=axes[0], color=class_colors)
axes[0].set_title('Distribución de Response')
axes[0].set_xticklabels(class_labels, rotation=0)

axes[1].pie(response_counts, labels=class_labels,
            autopct='%1.1f%%', colors=class_colors)
axes[1].set_title('Proporción de Response')
plt.tight_layout()
plt.show()

## 3. ANÁLISIS UNIVARIABLE

In [None]:
# Spending variables
variables_gastos = ['MntWines', 'MntFruits', 'MntMeatProducts',
                    'MntFishProducts', 'MntSweetProducts', 'MntGoldProds']

# Spending distributions: one histogram per variable on a 2x3 grid, with a
# dashed line marking the mean
fig, axes = plt.subplots(2, 3, figsize=(18, 10))
axes = axes.ravel()

for ax, var in zip(axes, variables_gastos):
    media = df[var].mean()  # computed once, reused for the line and its label
    ax.hist(df[var], bins=30, color='purple', alpha=0.7)
    ax.axvline(media, color='red', linestyle='--',
               label=f'Media: {media:.0f}')
    ax.set_title(f'Distribución de {var}')
    ax.legend()

plt.tight_layout()
plt.show()

## 4. ANÁLISIS BIVARIABLE

In [None]:
# Response vs previous campaigns
campanas = ['AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3',
            'AcceptedCmp4', 'AcceptedCmp5']

# Row-wise sum of the five acceptance columns, stored as a new column of df
df['TotalCampaignsAccepted'] = df[campanas].sum(axis=1)

# Mean of Response within each group, expressed as a percentage
response_by_campaigns = df.groupby('TotalCampaignsAccepted')['Response'].mean() * 100

fig, ax = plt.subplots(figsize=(12, 6))
response_by_campaigns.plot(kind='bar', ax=ax, color='steelblue')
ax.set_title('Tasa de Response según Campañas Anteriores Aceptadas')
ax.set_xlabel('Total de Campañas Aceptadas')
ax.set_ylabel('% Response = 1')
plt.tight_layout()
plt.show()

In [None]:
# Response vs Income
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Left panel: Income box plot split by Response
df.boxplot(column='Income', by='Response', ax=axes[0])
axes[0].set_title('Income por Response')

# Right panel: overlaid histograms. Both groups share one set of bin edges
# computed over all non-null incomes; with an independent bins=30 per group
# each histogram gets its own bin width and the bars are not comparable.
income_bins = np.histogram_bin_edges(df['Income'].dropna(), bins=30)

df.loc[df['Response'] == 0, 'Income'].hist(bins=income_bins, alpha=0.5,
                                            label='No acepta', ax=axes[1], color='red')
df.loc[df['Response'] == 1, 'Income'].hist(bins=income_bins, alpha=0.5,
                                            label='Acepta', ax=axes[1], color='green')
axes[1].set_title('Distribución de Income por Response')
axes[1].legend()

plt.tight_layout()
plt.show()

## 5. ANÁLISIS MULTIVARIABLE

In [None]:
# Correlation matrix over every numeric column of df
numeric_cols = df.select_dtypes(include=[np.number]).columns
correlation_matrix = df[numeric_cols].corr()

fig, ax = plt.subplots(figsize=(16, 14))
sns.heatmap(correlation_matrix, annot=True, fmt='.2f',
            cmap='coolwarm', center=0, ax=ax)
ax.set_title('Matriz de Correlación')
plt.tight_layout()
plt.show()

# Correlations with Response (the target's correlation with itself is dropped)
print('\n🔗 Top 10 Correlaciones con Response:\n')
response_corr = correlation_matrix['Response'].drop('Response').sort_values(ascending=False)

# Rank by magnitude so that strong negative correlations are not pushed out
# of the top 10; the printed values keep their sign.
top_by_strength = response_corr.abs().sort_values(ascending=False).head(10).index
print(response_corr.loc[top_by_strength])

## 6. FEATURES DERIVADOS

In [None]:
# Create derived features

# Year against which Age is computed
REFERENCE_YEAR = 2014

purchase_cols = ['NumDealsPurchases', 'NumWebPurchases',
                 'NumCatalogPurchases', 'NumStorePurchases']

df['TotalSpent'] = df[variables_gastos].sum(axis=1)
df['TotalPurchases'] = df[purchase_cols].sum(axis=1)
# The +1 keeps the denominator non-zero when TotalPurchases is 0
# (presumably its purpose); note it also lowers the ratio for every row.
df['AvgPurchaseValue'] = df['TotalSpent'] / (df['TotalPurchases'] + 1)
df['HasChildren'] = ((df['Kidhome'] + df['Teenhome']) > 0).astype(int)
df['Age'] = REFERENCE_YEAR - df['Year_Birth']

# Single source of truth for both the printed summary and the correlation
# below. TotalCampaignsAccepted is not created here: it must already be a
# column of df.
derived_features = ['TotalSpent', 'TotalPurchases', 'AvgPurchaseValue',
                    'HasChildren', 'Age', 'TotalCampaignsAccepted']

print('✅ Features derivados creados:')
for feature in derived_features:
    print(f'  - {feature}')

# Correlation of the derived features with Response
derived_corr = df[derived_features + ['Response']].corr()['Response'].drop('Response')
print('\nCorrelación con Response:')
print(derived_corr.sort_values(ascending=False))

## 7. CONCLUSIONES

### Hallazgos Clave:

1. **Dataset desbalanceado**: ~15% acepta vs ~85% no acepta
2. **Variables más predictivas**:
   - TotalCampaignsAccepted (campañas anteriores)
   - Income (ingresos)
   - MntWines (gasto en vinos)
   - Recency (días desde última compra)

3. **Perfil del cliente que acepta**:
   - Ingresos más altos
   - Mayor gasto total
   - Ha aceptado campañas anteriores
   - Compras recientes

4. **Transformaciones necesarias**:
   - Imputación de nulos en Income
   - Escalado de variables numéricas
   - Encoding de variables categóricas
   - Estratificación por desbalance

### Próximos Pasos:
- ✅ Fase 1: EDA completado
- ⏭️ Fase 2: Feature Engineering
- ⏭️ Fase 3: Entrenamiento de Modelos

---

**Autor**: Alejandro Pineda Alvarez  
**Proyecto**: Marketing Campaign Response Prediction

In [None]:
# Save the dataset together with the derived feature columns.
# NOTE: to_csv() is called without `sep`, so the file is written with its
# default separator rather than the ';' used when the input was read.
output_path = '../../data_with_features.csv'
df.to_csv(output_path, index=False)
print(f'✅ Dataset guardado en: {output_path}')
print(f'Dimensiones: {df.shape}')