In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json
from pathlib import Path
import os  # Added for directory safety

# Read the cleaned dataset that the rest of this cell analyses.
df = pd.read_csv('data/processed/clean.csv')

# Create the output locations up front so later writes cannot fail on a
# missing directory: data/ holds eda_summary.json, data/eda_charts/ holds
# the PNG figures. exist_ok=True makes re-running the cell harmless.
for out_dir in ('data', 'data/eda_charts'):
    Path(out_dir).mkdir(parents=True, exist_ok=True)

# Bar chart of how many rows fall into each value of the 'Class' column,
# written to data/eda_charts/class_imbalance.png.
class_counts = df['Class'].value_counts()
fig, ax = plt.subplots(figsize=(6, 4))
class_counts.plot(kind='bar', ax=ax)
ax.set_title('Class Imbalance')
ax.set_ylabel('Count')
fig.savefig('data/eda_charts/class_imbalance.png')
# Release the figure so it does not leak into the next plot.
plt.close(fig)

# Distribution overview: a 2x2 grid of panels saved as one image,
# data/eda_charts/distributions.png.
fig, axes = plt.subplots(2, 2, figsize=(12, 8))
(ax_amount, ax_fraud), (ax_time, ax_pca) = axes

# Top-left: 'Amount' over every row.
df['Amount'].hist(ax=ax_amount, bins=50)
ax_amount.set_title('Transaction Amount Distribution')

# Top-right: 'Amount' restricted to rows where Class == 1.
fraud_amounts = df.loc[df['Class'] == 1, 'Amount']
fraud_amounts.hist(ax=ax_fraud, bins=20, color='red')
ax_fraud.set_title('Fraud Amount Distribution')

# Bottom-left: 'Time' over every row.
df['Time'].hist(ax=ax_time, bins=50)
ax_time.set_title('Transaction Time Distribution')

# Bottom-right: V1 against V2, points coloured by 'Class'
# (used here as a sample view of the PCA components).
df.plot.scatter(x='V1', y='V2', c='Class', colormap='coolwarm', ax=ax_pca, alpha=0.5)
ax_pca.set_title('PCA Components V1 vs V2')

fig.tight_layout()
fig.savefig('data/eda_charts/distributions.png')
plt.close(fig)

# Fraud rate by hour of day, saved to data/eda_charts/fraud_by_time.png.
#
# 'Time' is treated as a number of seconds (hence the division by 3600);
# the hour-of-day proxy is floor(Time / 3600) % 24.
# NOTE(review): this assumes the time origin lines up with midnight -- confirm
# against the dataset description.
#
# The floor matters: without it the 'Hour' values are fractional, so the
# groupby below would create one group per distinct timestamp instead of
# 24 hourly buckets. The column is cast back to float so its dtype matches
# what true division produced before this fix.
df['Hour'] = ((df['Time'] // 3600) % 24).astype(float)

# 'Class' is summarised by its mean per hour; for a 0/1 label that mean is
# the fraction of rows in that hour with Class == 1.
fraud_by_hour = df.groupby('Hour')['Class'].mean()

# Draw on an explicitly created figure rather than whatever pyplot considers
# "current" after the previous close().
fig, ax = plt.subplots()
fraud_by_hour.plot(ax=ax)
ax.set_title('Fraud Rate by Hour')
fig.savefig('data/eda_charts/fraud_by_time.png')
plt.close(fig)

# Correlation heatmap (sample on PCA + Amount/Time/Class; subsample for speed if needed)
corr_cols = ['Time', 'Amount']

TypeError: Object of type int64 is not JSON serializable