# Exploratory Data Analysis — Fraud Detection

A quick look at the synthetic transaction data to understand its feature distributions and class balance before building the GNN.

In [None]:
# Prepend the parent directory to the import path so that the `src` package
# imported below can be found.
# NOTE(review): assumes the notebook is run from a directory one level below
# the project root — confirm.
import sys
sys.path.insert(0, '..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from src.data.dataset import create_synthetic_fraud_data

# Plot configuration: seaborn's 'whitegrid' style, with matplotlib figures
# rendered inline in the notebook output.
sns.set_style('whitegrid')
%matplotlib inline

In [None]:
# Parameters for the synthetic dataset, collected in one place so they are
# easy to tweak before regenerating the data.
synthetic_data_params = {
    'num_users': 1000,
    'num_merchants': 200,
    'num_transactions': 10000,
    'fraud_rate': 0.05,
}

# Generate the synthetic transaction frame used by every cell below.
df = create_synthetic_fraud_data(**synthetic_data_params)

print(f'Shape: {df.shape}')
# Last expression of the cell: rich display of the first few rows.
df.head()

## Class Distribution

The data is generated with a 5% fraud rate, so the classes are heavily imbalanced; training will need focal loss or class weighting.

In [None]:
# Count each class once and order by label (0 = legit, 1 = fraud).
# `value_counts()` alone orders by frequency, which would tie the bar order
# (and therefore the colours) to whichever class happens to be larger.
class_counts = df['is_fraud'].value_counts().sort_index()
print(class_counts)
print(f'\nFraud rate: {df["is_fraud"].mean():.2%}')

# Derive each bar's colour from its label so fraud is always red and legit
# always blue, matching the colour convention used in the other plots.
bar_colors = ['indianred' if label == 1 else 'steelblue' for label in class_counts.index]

fig, ax = plt.subplots(figsize=(6, 4))
class_counts.plot(kind='bar', ax=ax, color=bar_colors)
ax.set_xlabel('Is Fraud')
ax.set_ylabel('Count')
ax.set_title('Class Distribution')
plt.tight_layout()

## Amount Distributions

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

# Left panel: overall distribution of raw transaction amounts.
axes[0].hist(df['amount'], bins=50, color='steelblue', alpha=0.7)
axes[0].set_xlabel('Amount')
axes[0].set_ylabel('Count')
axes[0].set_title('Transaction Amount Distribution')

# Right panel: log1p(amount) split by fraud label.
# Compute ONE set of bin edges from the pooled data and reuse it for both
# classes. Passing an integer `bins` per group would give each class its own
# range and bin width, making the overlaid histograms not comparable.
shared_bin_edges = np.histogram_bin_edges(np.log1p(df['amount']), bins=40)

for label, group in df.groupby('is_fraud'):
    color = 'indianred' if label == 1 else 'steelblue'
    name = 'Fraud' if label == 1 else 'Legit'
    axes[1].hist(
        np.log1p(group['amount']),
        bins=shared_bin_edges,
        alpha=0.6,
        label=name,
        color=color,
    )

axes[1].set_xlabel('Log Amount')
axes[1].set_ylabel('Count')
axes[1].set_title('Log Amount by Class')
axes[1].legend()

plt.tight_layout()

Fraud transactions have higher amounts on average (by design in the synthetic data, but this pattern is realistic).

## Temporal Patterns

In [None]:
# Parse timestamps and derive the hour of day. These columns are written back
# onto `df` (as in the original cell) because later cells receive the same frame.
df['timestamp'] = pd.to_datetime(df['timestamp'])
df['hour'] = df['timestamp'].dt.hour

fig, ax = plt.subplots(figsize=(10, 4))

# Count transactions per hour for each class, then reindex onto all 24 hours.
# `value_counts()` only returns hours that actually occur; without the reindex
# an hour with no transactions (likely for the sparse fraud class) would be
# missing and the line plot would interpolate straight across it.
all_hours = np.arange(24)
fraud_hours = df[df['is_fraud'] == 1]['hour'].value_counts().reindex(all_hours, fill_value=0)
legit_hours = df[df['is_fraud'] == 0]['hour'].value_counts().reindex(all_hours, fill_value=0)

# Normalize to show proportions, so the two classes are comparable despite
# their very different sizes.
fraud_pct = fraud_hours / fraud_hours.sum()
legit_pct = legit_hours / legit_hours.sum()

ax.plot(fraud_pct.index, fraud_pct.values, 'o-', color='indianred', label='Fraud')
ax.plot(legit_pct.index, legit_pct.values, 'o-', color='steelblue', label='Legit')

ax.set_xlabel('Hour of Day')
ax.set_ylabel('Proportion')
ax.set_title('Transaction Distribution by Hour')
ax.legend()
plt.tight_layout()

Fraud is concentrated in the nighttime hours (midnight to 6am) — this makes sense and should be a strong signal for the model.

## Graph Statistics

In [None]:
from src.data.graph_builder import TransactionGraphBuilder

# Build the transaction graph from the DataFrame prepared above.
builder = TransactionGraphBuilder()
graph = builder.build_graph(df)

# Size of the graph. The edge count is read from the second dimension of
# `edge_index` and reused for the average-degree line at the end.
print(f'Nodes: {graph.num_nodes}')
edge_count = graph.edge_index.shape[1]
print(f'Edges: {edge_count}')

# Feature widths: second dimension of the node and edge feature matrices.
node_feature_dim = graph.x.shape[1]
print(f'Node features: {node_feature_dim}')
edge_feature_dim = graph.edge_attr.shape[1]
print(f'Edge features: {edge_feature_dim}')

# Node counts by type, then edges per node.
print(f'\nUsers: {graph.num_users}')
print(f'Merchants: {graph.num_merchants}')
avg_degree = edge_count / graph.num_nodes
print(f'Avg degree: {avg_degree:.1f}')

## User Transaction Frequency

In [None]:
# Number of transactions per user, and the mean used for the reference line.
user_counts = df['user_id'].value_counts()
mean_user_txns = user_counts.mean()

fig, ax = plt.subplots(figsize=(8, 4))
ax.hist(user_counts.values, bins=30, color='steelblue', alpha=0.7)

# Dashed vertical line marking the mean number of transactions per user.
ax.axvline(
    mean_user_txns,
    color='red',
    linestyle='--',
    label=f'Mean: {mean_user_txns:.1f}',
)

ax.set(
    xlabel='Number of Transactions',
    ylabel='Number of Users',
    title='User Activity Distribution',
)
ax.legend()
plt.tight_layout()

## Next Steps

- Build the GNN models (GraphSAGE first, then GAT/GIN)
- Handle class imbalance with focal loss
- Try heterogeneous graph with device nodes
- Add GNNExplainer for interpretability