# Graph-Based Fraud Detection Demo

Educational example of fraud detection using graph analysis techniques.

Based on concepts from: [Neo4j Fraud Detection](https://neo4j.com/blog/developer/exploring-fraud-detection-neo4j-graph-data-science-summary/)

In [None]:
import sys
sys.path.append('..')

import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx

from src.data.generate_dataset import FraudDatasetGenerator
from src.models.graph_builder import FraudGraph
from src.models.fraud_detector import FraudDetector
from src.features.graph_rag import GraphRAG

## 1. Generate Synthetic Dataset

Create a toy P2P payment dataset with fraud patterns:
- Fraud rings (users sharing devices)
- Money mule patterns
- Suspicious transaction patterns

In [None]:
# Build the synthetic dataset with a seeded generator (seed=42)
generator = FraudDatasetGenerator(seed=42)
dataset = generator.generate_dataset(
    n_users=200,
    n_transactions=1000,
)

# List every table in the dataset together with its record count
print("Dataset generated:")
for table_name, table in dataset.items():
    record_count = len(table)
    print(f"  {table_name}: {record_count} records")

In [None]:
# Preview the first rows of the users and transactions tables
print("\nUsers:")
users_df = dataset['users']
print(users_df.head())

print("\nTransactions:")
transactions_df = dataset['transactions']
print(transactions_df.head())

# Counts derived from the dataset's fraud label columns
print("\nFraud Statistics:")
fraudster_total = users_df['is_fraudster'].sum()
print(f"  Fraudsters: {fraudster_total}")
ring_total = len(dataset['fraud_rings'])
print(f"  Fraud rings: {ring_total}")
fraud_tx_total = transactions_df['is_fraudulent'].sum()
print(f"  Fraudulent transactions: {fraud_tx_total}")

## 2. Build Graph Structure

Construct a graph from the dataset's entities and relationships.

In [None]:
# Create the graph wrapper and populate it from the generated dataset
fraud_graph = FraudGraph()
fraud_graph.build_from_dataset(dataset)

# Print each summary statistic the graph reports about itself
stats = fraud_graph.get_statistics()
print("Graph Statistics:")
for stat_name, stat_value in stats.items():
    print(f"  {stat_name}: {stat_value}")

In [None]:
# Draw the depth-2 subgraph around the first user in the dataset
sample_user = dataset['users'].iloc[0]['user_id']
subgraph = fraud_graph.get_user_subgraph(sample_user, depth=2)

plt.figure(figsize=(12, 8))
pos = nx.spring_layout(subgraph, k=0.5, iterations=50)


def _node_color(attrs):
    """Return the display color for a node given its attribute dict."""
    # Device nodes take precedence over the fraudster flag
    if attrs.get('node_type') == 'device':
        return 'lightblue'
    if attrs.get('is_fraudster', False):
        return 'red'
    return 'lightgreen'


# One color per node, in the same order nx.draw iterates the nodes
colors = [_node_color(subgraph.nodes[node]) for node in subgraph.nodes()]

nx.draw(
    subgraph,
    pos,
    node_color=colors,
    with_labels=True,
    node_size=500,
    font_size=8,
    arrows=True,
)
plt.title(f"Network around {sample_user}\n(Red=Fraudster, Green=Normal, Blue=Device)")
plt.show()

## 3. Fraud Detection Algorithms

Apply graph-based detection methods: community detection, centrality scoring, risk scoring, and shared-device analysis.

In [None]:
# Wrap the graph in a detector and run the full report over the transactions
detector = FraudDetector(fraud_graph)
report = detector.generate_fraud_report(dataset['transactions'])

print("Fraud Detection Report:\n")

# Distinct community labels among the values of the communities mapping
community_labels = set(report['communities'].values())
print(f"Communities detected: {len(community_labels)}")

high_risk_total = len(report['high_risk_users'])
print(f"High-risk users identified: {high_risk_total}")

shared_total = len(report['shared_resources'])
print(f"Shared devices found: {shared_total}")

In [None]:
# Rank the centrality table by PageRank, highest first
pagerank_columns = ['user_id', 'pagerank', 'is_fraudster']
centrality_df = report['centrality_scores'].sort_values(by='pagerank', ascending=False)

print("\nTop 10 by PageRank:")
top_by_pagerank = centrality_df[pagerank_columns].head(10)
print(top_by_pagerank)

In [None]:
# Risk scores
risk_df = report['risk_scores']

# Sort explicitly so the rows shown really are the ten highest-scoring users,
# whatever order the report returns them in. This mirrors the PageRank cell.
# risk_df itself is left unsorted so later cells see the original frame.
top_risk = risk_df.sort_values('risk_score', ascending=False).head(10)

print("\nTop 10 High-Risk Users:")
print(top_risk[['user_id', 'risk_score', 'device_risk', 'is_fraudster']])

In [None]:
# Summarize the first five shared-device entries from the report
print("\nShared Devices (Fraud Ring Indicator):")
for device in report['shared_resources'][:5]:
    summary = (
        f"  {device['device_id']}: shared by {device['shared_by_count']} users, "
        f"{device['fraudster_count']} fraudsters (risk: {device['risk_score']:.2f})"
    )
    print(summary)

In [None]:
# Visualize risk score and PageRank distributions, split by fraud label
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Risk score by fraud status
risk_df.boxplot(column='risk_score', by='is_fraudster', ax=axes[0])
axes[0].set_title('Risk Score Distribution')
axes[0].set_xlabel('Is Fraudster')
axes[0].set_ylabel('Risk Score')

# PageRank distribution
centrality_df.boxplot(column='pagerank', by='is_fraudster', ax=axes[1])
axes[1].set_title('PageRank Distribution')
axes[1].set_xlabel('Is Fraudster')
axes[1].set_ylabel('PageRank')

# DataFrame.boxplot(by=...) adds a figure-level "Boxplot grouped by ..."
# suptitle. Clear it so it does not crowd the per-axis titles set above.
fig.suptitle('')

plt.tight_layout()
plt.show()

## 4. Graph RAG - Query Interface

Interactive queries for fraud investigation

In [None]:
# Create the query interface from the graph, the detector and the raw dataset
graph_rag = GraphRAG(
    fraud_graph,
    detector,
    dataset,
)

In [None]:
# Query user profile
# Prefer the first high-risk user from the report. If the report flagged none,
# fall back to the first user actually present in the dataset rather than a
# hard-coded ID that may not exist. The visualization cell picks its sample
# user the same way.
high_risk_users = report['high_risk_users']
if high_risk_users:
    user_id = high_risk_users[0]
else:
    user_id = dataset['users'].iloc[0]['user_id']

profile = graph_rag.query('user_profile', user_id=user_id)

print(f"User Profile: {user_id}")
for key, value in profile.items():
    print(f"  {key}: {value}")

In [None]:
# Ask the query interface for the fraud-risk breakdown of the selected user
risk_info = graph_rag.query('fraud_risk', user_id=user_id)

print(f"\nFraud Risk Analysis: {user_id}")
for field_name, field_value in risk_info.items():
    print(f"  {field_name}: {field_value}")

In [None]:
# Fetch the selected user's connections with depth=2
connections = graph_rag.query('user_connections', user_id=user_id, depth=2)

print(f"\nNetwork Connections: {user_id}")
for field_name, field_value in connections.items():
    # The 'subgraph_nodes' entry is left out of the printed summary
    if field_name == 'subgraph_nodes':
        continue
    print(f"  {field_name}: {field_value}")

In [None]:
# Retrieve the shared-device summary from the query interface
shared_devices = graph_rag.query('shared_devices')

print("\nShared Device Analysis:")
total_shared = shared_devices['total_shared_devices']
print(f"  Total shared devices: {total_shared}")
high_risk_device_total = len(shared_devices['high_risk_devices'])
print(f"  High-risk devices: {high_risk_device_total}")

In [None]:
# Request the suspicious-pattern summary with top_n=10
patterns = graph_rag.query('suspicious_patterns', top_n=10)

print("\nSuspicious Patterns:")
print("\nDetection Performance:")
for metric_name, metric_value in patterns['detection_accuracy'].items():
    print(f"  {metric_name}: {metric_value}")

flagged_user_total = len(patterns['high_risk_users'])
print(f"\nHigh-risk users identified: {flagged_user_total}")
cluster_total = len(patterns['shared_device_clusters'])
print(f"Device clusters: {cluster_total}")

## 5. Community Analysis

Detect fraud rings through community detection

In [None]:
# Query community information without a specific user
community_info = graph_rag.query('community_info')

print("Community Statistics:")
for field_name, field_value in community_info.items():
    print(f"  {field_name}: {field_value}")

In [None]:
# Community details for the first high-risk user, if the report flagged any
flagged_users = report['high_risk_users']
if flagged_users:
    focus_user = flagged_users[0]
    user_community = graph_rag.query('community_info', user_id=focus_user)

    print(f"\nCommunity Analysis for {focus_user}:")
    for field_name, field_value in user_community.items():
        # The 'community_members' entry is left out of the printed summary
        if field_name == 'community_members':
            continue
        print(f"  {field_name}: {field_value}")

## Summary

This notebook demonstrates:
1. **Synthetic data generation** with realistic fraud patterns
2. **Graph construction** from transaction data
3. **Graph algorithms** for fraud detection:
   - Community detection (Louvain)
   - Centrality measures (PageRank, Betweenness)
   - Shared resource detection
4. **Graph RAG** for interactive fraud investigation

Key insights:
- Graph methods reveal hidden patterns in connected data
- Fraud rings can be detected through shared resources
- Centrality metrics identify key players in fraud networks
- Community detection helps isolate suspicious groups