# EDA — Credit Card Fraud Detection

This notebook performs basic exploratory data analysis on `data/creditcard.csv`: shape, missing values, class imbalance, `Amount` distribution, and a few visualizations.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Relative path of the CSV this notebook analyses (resolved against the
# kernel's working directory).
DATA_PATH = 'data/creditcard.csv'

# Read the whole file into a single DataFrame that the later cells use as `df`.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)

# Cell output: preview of the first five rows.
df.head(n=5)

In [None]:
# Basic info: dimensions of the loaded frame as (rows, columns)
print('shape:', df.shape)

# Missing values: count NaNs per column once, then show only the columns that
# actually contain some (prints an empty Series when nothing is missing).
missing_counts = df.isna().sum()
print('\nMissing values per column:')
print(missing_counts[missing_counts > 0])

# Class distribution: row count per distinct value of the `Class` column,
# first as a table and then as a bar chart.
print('\nClass distribution:')
print(df['Class'].value_counts())
# Trailing semicolon suppresses the Axes repr in the cell output.
sns.countplot(x='Class', data=df);

In [None]:
# Histogram of `Amount` in 100 bins; the count axis (y) is logarithmic.
fig, ax = plt.subplots(figsize=(10, 4))
sns.histplot(df['Amount'], bins=100, kde=False, ax=ax)
ax.set_yscale('log')
ax.set_title('Transaction Amount distribution (log y)')
plt.show()

# Box plot of `Amount` for each value of `Class`; the amount axis (y) is
# logarithmic.
fig, ax = plt.subplots(figsize=(10, 4))
sns.boxplot(x='Class', y='Amount', data=df, ax=ax)
ax.set_yscale('log')
ax.set_title('Transaction Amount by Class (log y)')
plt.show()

In [None]:
# Correlation heatmap over `Amount` plus whichever of V1..V10 are present in df.
candidate_v = (f'V{i}' for i in range(1, 11))
cols = ['Amount', *(name for name in candidate_v if name in df.columns)]

# Correlate on a random sample (at most 20000 rows, fixed seed) to speed up.
sample_size = min(20000, len(df))
corr = df[cols].sample(n=sample_size, random_state=42).corr()

# Annotated heatmap; each cell shows the coefficient to two decimals.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr, annot=True, fmt='.2f', cmap='coolwarm', ax=ax)
ax.set_title('Correlation (sample)')
plt.show()

## Next steps
- Train models and record metrics (already implemented in `src/train.py`)
- Try oversampling (SMOTE) or class-weighted models and compare performance
- Explore feature importance from the trained RandomForest model