# 📊 EDA - Credit Card Fraud Detection

Análisis exploratorio del dataset de fraude en tarjetas de crédito.

## 1. Cargar Librerías

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Plot defaults for the whole notebook: seaborn "darkgrid" style and a
# 12x6 default figure size.
sns.set_style(style='darkgrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

print('‚úÖ Librer√≠as cargadas')

## 2. Cargar Dataset

In [None]:
# Read the raw CSV (relative path) into the DataFrame used by every later cell.
df = pd.read_csv('../data/raw/creditcard.csv')

# Report shape, column names and per-column dtypes in a single print call;
# sep='\n' puts each item on its own line.
print(
    '‚úÖ Dataset cargado',
    f'Shape: {df.shape}',
    f'\nColumnas: {list(df.columns)}',
    '\nTipos de datos:',
    df.dtypes,
    sep='\n',
)

## 3. Primeras Filas

In [None]:
# Preview the first 10 rows of the loaded DataFrame.
df.head(10)

## 4. Información General

In [None]:
# Print the DataFrame's info() summary.
df.info()

## 5. Estadísticas Básicas

In [None]:
# Show the descriptive statistics produced by DataFrame.describe().
df.describe()

## 6. Missing Values

In [None]:
# Null count per column; list only the affected columns, or print an
# all-clear message when the total is zero.
missing = df.isnull().sum()
total_missing = missing.sum()
print(f'Total missing values: {total_missing}')
print(missing[missing > 0] if total_missing > 0 else '‚úÖ No hay missing values')

## 7. Distribución de Clases (DESBALANCE)

In [None]:
# Class frequencies (value_counts lists the most frequent class first).
class_counts = df['Class'].value_counts()
print('Distribuci√≥n de clases:', class_counts, sep='\n')
print(f'\nPorcentaje de fraudes: {class_counts[1] / len(df) * 100:.3f}%')
print(f'Ratio: 1 fraude por cada {int(class_counts[0] / class_counts[1])} leg√≠timas')

# One figure, two panels: absolute counts on the left, shares on the right.
class_labels = ['Leg√≠timo', 'Fraude']
class_colors = ['green', 'red']
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 4))
bar_ax, pie_ax = axes

# Left panel: bar chart of the counts.
class_counts.plot.bar(ax=bar_ax, color=class_colors)
bar_ax.set_title('Conteo de Clases')
bar_ax.set_ylabel('Cantidad')
bar_ax.set_xticklabels(class_labels, rotation=0)

# Right panel: pie chart with percentage labels.
class_counts.plot.pie(ax=pie_ax, labels=class_labels, colors=class_colors, autopct='%1.2f%%')
pie_ax.set_title('Proporci√≥n de Clases')
pie_ax.set_ylabel('')

# Save the figure to disk, then display it.
plt.tight_layout()
plt.savefig('../reports/resources/images/01_class_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

print('\n‚úÖ Gr√°fico guardado: class_distribution.png')

## 8. Exploración de Amount

In [None]:
# Explore Amount: descriptive statistics (overall and per class), then a
# two-panel figure (overlaid histograms + box plot by class) saved to disk.

# Select Amount per class once; the statistics and the histograms reuse it.
legit_amount = df.loc[df['Class'] == 0, 'Amount']
fraud_amount = df.loc[df['Class'] == 1, 'Amount']

# Statistics
print('Amount - Estad√≠sticas generales:')
print(df['Amount'].describe())

# By class
print('\nAmount - Leg√≠timas:')
print(legit_amount.describe())

print('\nAmount - Fraudes:')
print(fraud_amount.describe())

# Visualize
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

# Histogram: both classes overlaid on the same axes.
axes[0].hist(legit_amount, bins=50, label='Leg√≠timo', alpha=0.7, color='green')
axes[0].hist(fraud_amount, bins=50, label='Fraude', alpha=0.7, color='red')
axes[0].set_xlabel('Amount (USD)')
axes[0].set_ylabel('Frecuencia')
axes[0].set_title('Distribuci√≥n de Amount')
axes[0].legend()

# Box plot of Amount grouped by class.
df.boxplot(column='Amount', by='Class', ax=axes[1])
axes[1].set_xlabel('Clase')
axes[1].set_ylabel('Amount (USD)')
axes[1].set_title('Amount por Clase')
axes[1].set_xticklabels(['Leg√≠timo', 'Fraude'])

# pandas' grouped boxplot adds its own figure-level title; blank it so it
# does not sit above both panels (the left one is not a box plot).
plt.suptitle('')
plt.tight_layout()
plt.savefig('../reports/resources/images/02_amount_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

print('\n‚úÖ Gr√°fico guardado: amount_distribution.png')

## 9. Exploración de Time

In [None]:
# Summary statistics of the raw Time column.
print('Time - Estad√≠sticas:')
print(df['Time'].describe())

# Time divided by 3600 (labelled as hours below); report the covered span.
time_hours = df['Time'] / 3600
first_hour = time_hours.min()
last_hour = time_hours.max()
span_hours = last_hour - first_hour
print(f'\nRango temporal: {first_hour:.1f} a {last_hour:.1f} horas')
print(f'Duraci√≥n total: {span_hours:.1f} horas ‚âà {span_hours / 24:.1f} d√≠as')

# Two panels: histogram of Time on the left, Class against time on the right.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 4))
hist_ax, line_ax = axes

# Left panel: distribution of the raw Time values.
hist_ax.hist(df['Time'], bins=50, color='blue', alpha=0.7)
hist_ax.set_xlabel('Time (segundos)')
hist_ax.set_ylabel('Frecuencia')
hist_ax.set_title('Distribuci√≥n de Time')

# Right panel: Class plotted against Time/3600, with rows sorted by Time.
df_sorted = df.sort_values('Time')
line_ax.plot(df_sorted['Time'] / 3600, df_sorted['Class'], alpha=0.3, markersize=2)
line_ax.set_xlabel('Time (horas)')
line_ax.set_ylabel('Clase (0=Leg√≠timo, 1=Fraude)')
line_ax.set_title('Fraudes por Tiempo')

# Save the figure to disk, then display it.
plt.tight_layout()
plt.savefig('../reports/resources/images/03_time_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

print('\n‚úÖ Gr√°fico guardado: time_distribution.png')

## 10. Correlación con Target

In [None]:
# Correlation of every column with Class, sorted from most positive to
# most negative.
corr_with_target = df.corr().loc[:, 'Class'].sort_values(ascending=False)

print('Top 10 features m√°s correlacionadas con fraude:')
print(corr_with_target.iloc[:11])  # 11 rows: Class itself plus the top 10

print('\nTop 10 features NEGATIVAS correlacionadas:')
print(corr_with_target.iloc[-10:])

## 11. Heatmap de Correlación

In [None]:
# Heatmap of the pairwise correlation matrix of all columns, saved to disk.
corr_matrix = df.corr()
heatmap_style = {
    'annot': False,
    'cmap': 'coolwarm',
    'center': 0,
    'cbar_kws': {'label': 'Correlation'},
}

plt.figure(figsize=(14, 12))
sns.heatmap(corr_matrix, **heatmap_style)
plt.title('Feature Correlation Matrix')
plt.tight_layout()
plt.savefig('../reports/resources/images/04_correlation_heatmap.png', dpi=300, bbox_inches='tight')
plt.show()

print('‚úÖ Gr√°fico guardado: correlation_heatmap.png')

## 12. Feature Importance (por Correlación)

In [None]:
# Horizontal bar chart of the 10 features most strongly correlated with Class.
#
# Features are ranked by the magnitude of the correlation, so strongly
# negative features are kept and the sign-based colouring below (red =
# positive, blue = negative) can show both signs. The target's own entry is
# dropped by label rather than by assuming it sits in the first position.
feature_corr = corr_with_target.drop('Class')
top_features = feature_corr.loc[feature_corr.abs().nlargest(10).index]

# Sort by signed value so the bars run from most negative to most positive.
plot_order = top_features.sort_values()
bar_colors = ['red' if value > 0 else 'blue' for value in plot_order]

plt.figure(figsize=(10, 6))
plot_order.plot(kind='barh', color=bar_colors)
plt.xlabel('Correlaci√≥n con Fraude')
plt.title('Top 10 Features Correlacionadas con Fraude')
plt.tight_layout()
plt.savefig('../reports/resources/images/05_feature_importance.png', dpi=300, bbox_inches='tight')
plt.show()

print('‚úÖ Gr√°fico guardado: feature_importance.png')

## 13. V1-V28: Box Plots por Clase (Muestra)

In [None]:
# Box plots by class for the six entries that follow the first one in
# corr_with_target (the slice skips position 0).
top_6_features = list(corr_with_target.index[1:7])

fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(15, 8))
axes = axes.flatten()  # 2x3 grid -> flat sequence, one panel per feature

for panel, feature in zip(axes, top_6_features):
    df.boxplot(column=feature, by='Class', ax=panel)
    panel.set_xlabel('Clase')
    panel.set_ylabel(feature)
    panel.set_title(f'{feature} por Clase')
    panel.set_xticklabels(['Leg√≠timo', 'Fraude'])

# Blank the figure-level title before laying out and saving.
plt.suptitle('')
plt.tight_layout()
plt.savefig('../reports/resources/images/06_top_features_boxplots.png', dpi=300, bbox_inches='tight')
plt.show()

print('‚úÖ Gr√°fico guardado: top_features_boxplots.png')

## 14. Duplicados

In [None]:
# Count fully duplicated rows and report their share of the dataset.
duplicados = df.duplicated().sum()
print(f'Filas duplicadas: {duplicados}')
if not duplicados:
    print('‚úÖ No hay duplicados')
else:
    print(f'Porcentaje: {duplicados / len(df) * 100:.3f}%')

## 15. Outliers en Amount

In [None]:
# Flag Amount outliers with the IQR rule: anything more than 1.5 * IQR below
# the first quartile or above the third quartile.
Q1 = df['Amount'].quantile(0.25)
Q3 = df['Amount'].quantile(0.75)
IQR = Q3 - Q1
lower_limit = Q1 - 1.5 * IQR
upper_limit = Q3 + 1.5 * IQR

outliers = df[(df['Amount'] < lower_limit) | (df['Amount'] > upper_limit)]

# Fraud rows among the outliers, and their share of the outlier rows.
# The share falls back to 0.0 when there are no outliers, so the report
# never raises ZeroDivisionError.
fraud_outliers = int((outliers['Class'] == 1).sum())
fraud_share = fraud_outliers / len(outliers) * 100 if len(outliers) else 0.0

print('Amount Outliers (IQR method):')
print(f'Total: {len(outliers)}')
print(f'Porcentaje: {len(outliers) / len(df) * 100:.3f}%')
print(f'Fraudes en outliers: {fraud_outliers} ({fraud_share:.1f}%)')
print('\nL√≠mites:')
print(f'Inferior: {lower_limit:.2f}')
print(f'Superior: {upper_limit:.2f}')

## 16. RESUMEN DE HALLAZGOS

In [None]:
# Print the hand-written findings summary.
# NOTE: the text is a plain string literal (not an f-string), so none of the
# figures in it are computed from df; they must be edited by hand if the data
# or the results of the cells above change.
print("""\n
‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê

üìä RESUMEN DE HALLAZGOS - EDA

‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê

1. DESBALANCE SEVERO ‚úÖ
   ‚îî‚îÄ Fraudes: 492 (0.172%)
   ‚îî‚îÄ Leg√≠timas: 284,315 (99.828%)
   ‚îî‚îÄ Ratio: 1 fraude por cada 578 leg√≠timas
   ‚îî‚îÄ ACCI√ìN: Usar class_weight en modelos + SMOTE si necesario

2. NO HAY MISSING VALUES ‚úÖ
   ‚îî‚îÄ Dataset est√° limpio
   ‚îî‚îÄ Ninguna transformaci√≥n necesaria por NaN

3. AMOUNT ES DIFERENTE ENTRE CLASES ‚úÖ
   ‚îî‚îÄ Leg√≠timas: Mean=88.35, Median=22.00
   ‚îî‚îÄ Fraudes: Mean=122.21, Median=76.29
   ‚îî‚îÄ Fraudes tienden a ser montos mayores
   ‚îî‚îÄ ACCI√ìN: Log transform de Amount podr√≠a mejorar

4. TIME: 2 D√çAS DE DATOS ‚úÖ
   ‚îî‚îÄ Rango: 0 a 172,792 segundos (‚âà48 horas)
   ‚îî‚îÄ Sin patr√≥n temporal claro
   ‚îî‚îÄ ACCI√ìN: Usar como est√°, sin features derivadas

5. FEATURES PCA (V1-V28) ‚úÖ
   ‚îî‚îÄ Anonimizadas, sin interpretaci√≥n directa
   ‚îî‚îÄ Est√°n correlacionadas con fraude
   ‚îî‚îÄ Distribuciones diferentes entre clases
   ‚îî‚îÄ ACCI√ìN: Usar todas las features

6. CORRELACI√ìN CON TARGET
   ‚îî‚îÄ V14: -0.382 (negativa m√°s fuerte)
   ‚îî‚îÄ V10: -0.340
   ‚îî‚îÄ V12: -0.319
   ‚îî‚îÄ V7: -0.312
   ‚îî‚îÄ Amount: 0.029 (d√©bil pero presente)
   ‚îî‚îÄ Time: -0.012 (casi nula)

7. OUTLIERS EN AMOUNT ‚úÖ
   ‚îî‚îÄ Detectados: 1,903 (0.67%)
   ‚îî‚îÄ 81 fraudes en outliers (16.5% del total de fraudes)
   ‚îî‚îÄ ACCI√ìN: Mantener outliers (no son errores)

8. NO HAY DUPLICADOS ‚úÖ
   ‚îî‚îÄ Cada transacci√≥n es √∫nica
   ‚îî‚îÄ Sin problemas de data integrity

‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê

‚úÖ DECISIONES PARA FASE 3 (MODELING):

   ‚Ä¢ Features: Usar todas (V1-V28 + Amount + Time)
   ‚Ä¢ Scaling: StandardScaler
   ‚Ä¢ Feature Engineering: 
     - Log(Amount + 1) para normalizar distribuci√≥n
     - Posibles: Hour of day, Day of week (si hay tiempo)
   ‚Ä¢ Desbalance Strategy: 
     - Primary: class_weight = {0: 1, 1: 578}
     - Secondary: SMOTE si no funciona
   ‚Ä¢ Split: 70-15-15, stratified
   ‚Ä¢ Modelos: LR, RF, XGB, NN
   ‚Ä¢ M√©trica primaria: ROC-AUC > 0.95

‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê
""")

## 17. Guardar Resumen en Markdown

In [None]:
# Build the Markdown summary of the EDA and write it to reports/eda_summary.md.
#
# The class percentages are computed from df (with the same :.3f formatting
# as the class-distribution cell) so the written report cannot disagree with
# the data. The Key Findings and Decisiones sections remain hand-written text.
resumen = f"""# EDA Summary - Credit Card Fraud Detection

## Dataset Overview
- Shape: {df.shape}
- Registros: {len(df):,}
- Features: {df.shape[1]}
- Missing values: {df.isnull().sum().sum()}
- Duplicados: {df.duplicated().sum()}

## Target Distribution
- Leg√≠timas: {(df['Class']==0).sum():,} ({(df['Class']==0).sum() / len(df) * 100:.3f}%)
- Fraudes: {(df['Class']==1).sum():,} ({(df['Class']==1).sum() / len(df) * 100:.3f}%)
- Ratio: 1 fraude por cada {int((df['Class']==0).sum() / (df['Class']==1).sum())} leg√≠timas

## Amount Statistics
- Mean (Leg√≠timas): ${df[df['Class']==0]['Amount'].mean():.2f}
- Mean (Fraudes): ${df[df['Class']==1]['Amount'].mean():.2f}
- Max: ${df['Amount'].max():.2f}
- Min: ${df['Amount'].min():.2f}

## Top 10 Features Correlated with Fraud
{corr_with_target[1:11].to_string()}

## Key Findings
1. Severo desbalance de clases (0.17% fraudes)
2. Sin missing values
3. Amount es diferente entre clases
4. Features PCA correlacionadas con fraude
5. No hay patr√≥n temporal claro
6. No hay duplicados

## Decisiones para Modeling
- Usar todas las 31 features
- Class weights en modelos
- Log transform de Amount
- Split 70-15-15 stratified
- ROC-AUC > 0.95 como target
"""

# Save. The text contains non-ASCII characters, so the encoding is set
# explicitly instead of relying on the platform default.
with open('../reports/eda_summary.md', 'w', encoding='utf-8') as f:
    f.write(resumen)

print('‚úÖ Resumen guardado en: reports/eda_summary.md')

## 18. LISTO PARA SIGUIENTE FASE

In [None]:
# Print the closing banner: the six image files saved by the plotting cells,
# the Markdown summary file, and the name of the next notebook. The text is a
# fixed literal, so it does not check that those files were actually written.
print("""\n‚úÖ EDA COMPLETADO\n
Artefactos creados:

üìÅ Gr√°ficos en reports/resources/images/:
  ‚úì 01_class_distribution.png
  ‚úì 02_amount_distribution.png
  ‚úì 03_time_distribution.png
  ‚úì 04_correlation_heatmap.png
  ‚úì 05_feature_importance.png
  ‚úì 06_top_features_boxplots.png

üìÑ Documentaci√≥n:
  ‚úì reports/eda_summary.md

üéØ SIGUIENTE: notebooks/02_modeling.ipynb
""")