# 01 - Data Loading and EDA

Purpose: load the raw credit card fraud dataset and perform initial exploratory analysis without any preprocessing. Wherever randomness is involved, it is controlled with `random_state=42`; the checks below are deterministic.

Key checks:
- Dataset shape and column types
- Missing values
- Class distribution (fraud vs non-fraud)
- Transaction amount and time analysis

**Note:** To avoid data leakage, do not modify the data in this notebook; it is for inspection only.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Use the 'seaborn-v0_8' matplotlib style sheet for all plots in this notebook.
plt.style.use('seaborn-v0_8')

# Location of the raw dataset, relative to the current working directory.
RAW_PATH = Path('data/raw/credit_card_fraud_dataset.csv')

# Fail fast with an explicit exception instead of `assert`: assert statements
# are removed when Python runs with -O, which would silently drop this check.
if not RAW_PATH.exists():
    raise FileNotFoundError(f"Missing dataset at {RAW_PATH}")

# Read the CSV exactly as stored; no cleaning or transformation happens here.
data = pd.read_csv(RAW_PATH)
data.head()  # preview the first rows

In [None]:
# Dataset dimensions (rows, columns) followed by the dtype of every column
(
    data.shape,
    data.dtypes,
)

In [None]:
# Count of missing entries per column, columns with the most gaps listed first
(
    data
    .isna()
    .sum()
    .sort_values(ascending=False)
)

In [None]:
# Number of rows per class, with the 0/1 labels replaced by readable names
class_counts = (
    data['Class']
    .value_counts()
    .rename(index={0: 'Non-Fraud', 1: 'Fraud'})
)
class_counts

In [None]:
# Bar chart comparing how many transactions fall into each class
fig, ax = plt.subplots(figsize=(6, 4))
sns.barplot(
    x=class_counts.index,
    y=class_counts.values,
    ax=ax,
)
ax.set_title('Class Distribution')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Histogram of transaction amounts, coloured by class.
# log_scale=(False, True) keeps the x-axis linear and puts the y-axis on a log scale.
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(
    data=data,
    x='Amount',
    hue='Class',
    bins=50,
    log_scale=(False, True),
    ax=ax,
)
ax.set_title('Transaction Amount Distribution by Class')
plt.show()

In [None]:
# Histogram of the Time column, coloured by class, using 50 bins
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(
    data=data,
    x='Time',
    hue='Class',
    bins=50,
    ax=ax,
)
ax.set_title('Transaction Time Distribution by Class')
plt.show()

Next: proceed to `02_data_preprocessing.ipynb` for scaling and dataset splitting (with strict separation to avoid leakage).