In [31]:
import os

import pandas as pd
from py2neo import Graph, Node, Relationship

# Reaching this line means every import above resolved without raising.
print("Libraries imported successfully.")

Libraries imported successfully.


Connect to Neo4j Database

In [32]:
# IMPORTANT: Make sure your Neo4j database is running before executing this cell.
# You can start it with: docker-compose up -d
#
# Connection settings are read from the environment so real credentials never
# have to be saved inside the notebook. Each fallback is the value that used to
# be hard-coded in this cell, so an unconfigured environment behaves as before.
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "password")

try:
    graph = Graph(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))
    # Test the connection with a trivial query before declaring success
    graph.run("RETURN 1")
    print("Successfully connected to Neo4j database.")
except Exception as e:
    # Deliberately best-effort: report the problem and leave `graph` as None so
    # the `if graph is not None` guards in the following cells skip their work.
    print("Failed to connect to Neo4j. Please ensure the database is running.")
    print(f"Error: {e}")
    graph = None


Successfully connected to Neo4j database.


Load and Sample Data for Ingestion

In [33]:
# Bind df_sample up front so it is always defined, even when the connection
# failed (load skipped) or the CSV is missing.
df_sample = None

if graph is not None:
    try:
        df = pd.read_csv('../data/raw/PS_20174392719_1491204439457_log.csv')
        # Filter for transaction types that are part of the graph
        df_graph = df[df['type'].isin(['CASH_OUT', 'TRANSFER'])].copy()

        # Take a smaller sample for faster ingestion in this notebook
        # We'll use fraudulent transactions and some legitimate ones for context
        df_fraud = df_graph[df_graph['isFraud'] == 1]
        df_normal = df_graph[df_graph['isFraud'] == 0]
        # DataFrame.sample raises ValueError when n exceeds the population
        # (replace=False), so clamp the request to the rows actually available.
        n_normal = min(20000, len(df_normal))
        df_normal_sample = df_normal.sample(n=n_normal, random_state=42)
        df_sample = pd.concat([df_fraud, df_normal_sample]).reset_index(drop=True)

        print("Dataset loaded and sampled.")
        print(f"Sampled dataset shape: {df_sample.shape}")
    except FileNotFoundError:
        print("Error: Dataset file not found.")
        df_sample = None


Dataset loaded and sampled.
Sampled dataset shape: (28213, 11)


Clear Existing Graph Data (Optional)

In [34]:
# Optional reset: removes every node together with its relationships so the
# ingestion cell below starts from an empty graph. Skipped without a connection.
if graph is not None:
    wipe_all_query = "MATCH (n) DETACH DELETE n"
    print("Clearing any existing data from the graph...")
    graph.run(wipe_all_query)
    print("Graph cleared.")


Clearing any existing data from the graph...
Graph cleared.


Create Constraints for Performance

In [35]:
# Ensure Account.id is unique. Two statements are kept because the constraint
# syntax differs between Neo4j generations and the server version is not known here.
if graph is not None:
    print("Creating unique constraint on Account ID...")

    # Current syntax (Neo4j 4.4+, and the only form Neo4j 5 accepts).
    # IF NOT EXISTS turns a re-run of this cell into a no-op instead of an error.
    modern_constraint = (
        "CREATE CONSTRAINT account_id_unique IF NOT EXISTS "
        "FOR (a:Account) REQUIRE a.id IS UNIQUE"
    )
    # Legacy syntax, kept as a fallback for servers that reject the form above.
    legacy_constraint = "CREATE CONSTRAINT ON (a:Account) ASSERT a.id IS UNIQUE"

    try:
        try:
            graph.run(modern_constraint)
        except Exception:
            # Most likely a syntax error from an older server; retry the old form.
            graph.run(legacy_constraint)
        print("Constraint created successfully.")
    except Exception as e:
        # Reached only when both forms failed (e.g. the legacy statement hit an
        # existing constraint, or the server is unreachable).
        print(f"Constraint might already exist: {e}")

Creating unique constraint on Account ID...
Constraint created successfully.


Ingest Data into Neo4j

In [None]:
# Ingest the sampled transactions: one Account node per distinct sender/receiver
# and one relationship per transaction, typed by the transaction's `type` value.
#
# Rows are sent in batches through parameterised UNWIND statements rather than
# issuing two merges and one create per row, so the number of statements grows
# with len(df_sample) / BATCH_SIZE instead of 3 * len(df_sample).
if graph is not None and df_sample is not None:
    print("Starting data ingestion into Neo4j...")

    BATCH_SIZE = 5000  # rows carried by a single UNWIND statement
    ingest_columns = ['nameOrig', 'nameDest', 'amount', 'step', 'isFraud']

    # Using a transaction for atomicity: either every batch lands or none does
    tx = graph.begin()
    try:
        for rel_type, group in df_sample.groupby('type', sort=False):
            # Cypher cannot take a relationship type as a query parameter, so the
            # type is spliced into the statement as a backtick-quoted identifier
            # (embedded backticks are doubled, which is Cypher's escape for them).
            quoted_type = "`" + str(rel_type).replace("`", "``") + "`"
            ingest_query = (
                "UNWIND $rows AS row "
                # MERGE to avoid creating duplicate Account nodes
                "MERGE (s:Account {id: row.sender}) "
                "MERGE (r:Account {id: row.receiver}) "
                f"CREATE (s)-[:{quoted_type} "
                "{amount: row.amount, step: row.step, isFraud: row.isFraud}]->(r)"
            )

            # astype(object) boxes NumPy scalars as plain Python values, so the
            # driver is handed ordinary str/int/float objects.
            boxed = group[ingest_columns].astype(object)
            rows = [
                {
                    "sender": sender,
                    "receiver": receiver,
                    "amount": amount,
                    "step": step,
                    "isFraud": bool(is_fraud),
                }
                for sender, receiver, amount, step, is_fraud
                in boxed.itertuples(index=False, name=None)
            ]

            for start in range(0, len(rows), BATCH_SIZE):
                tx.run(ingest_query, rows=rows[start:start + BATCH_SIZE])

        tx.commit()
    except Exception:
        # Do not leave a half-filled transaction open on the server; undo it and
        # let the original error surface in the notebook.
        tx.rollback()
        raise

    print(f"Successfully ingested {len(df_sample)} transactions into Neo4j.")


Starting data ingestion into Neo4j...


Basic Graph Queries for Verification

In [None]:
# Sanity-check the ingestion by reporting how many Account nodes and how many
# relationships (of any type) the graph now holds.
if graph is not None:
    # Account nodes
    account_rows = graph.run("MATCH (n:Account) RETURN count(n) AS count").data()
    node_count = account_rows[0]['count']
    print(f"\nTotal number of Account nodes in the graph: {node_count}")

    # Directed relationships between any two nodes
    relationship_rows = graph.run("MATCH ()-[r]->() RETURN count(r) AS count").data()
    rel_count = relationship_rows[0]['count']
    print(f"Total number of relationships (transactions) in the graph: {rel_count}")


Pattern Detection - Finding Circular Transactions (Cycles)

In [None]:
# Look for money flowing in a loop: a directed chain of transactions that leaves
# an account and eventually returns to it.
if graph is not None:
    print("\nSearching for circular transaction patterns (money laundering cycles)...")

    # Matches directed paths of 3 to 5 hops that start and end at the same account.
    # Without the WHERE clause every cycle would be returned once per member
    # account (each rotation is a separate match), inflating the count below.
    # Keeping only the rotation whose start account has the smallest `id`
    # reports each simple cycle once.
    # LIMIT 10 caps the result, so the count printed below never exceeds 10.
    cycle_query = """
    MATCH p=(a:Account)-[*3..5]->(a)
    WHERE all(n IN nodes(p) WHERE a.id <= n.id)
    RETURN p, length(p) as pathLength
    LIMIT 10
    """

    results = graph.run(cycle_query).data()

    if not results:
        print("No circular transaction paths of length 3-5 found in the sampled data.")
    else:
        print(f"Found {len(results)} potential money laundering cycles:")
        for record in results:
            path = record['p']
            print(f"  - Cycle of length {record['pathLength']} involving account {path.start_node['id']}")


Pattern Detection - Identifying "Smurfing" (Fan-Out)

In [None]:
# Look for "smurfing": a single account spreading TRANSFERs across many
# different recipient accounts.
if graph is not None:
    print("\nSearching for 'fan-out' patterns (one account sending to many)...")

    # An account qualifies when it has sent TRANSFERs to MORE than this many
    # distinct recipient accounts.
    min_recipients = 10

    # Fan-out is about the number of distinct recipients, so the filter uses
    # count(DISTINCT b); count(r) alone would also flag an account that merely
    # sent many transfers to the same recipient. The raw transfer count is still
    # returned for context.
    fan_out_query = """
    MATCH (a:Account)-[r:TRANSFER]->(b:Account)
    WITH a, count(DISTINCT b) AS recipients, count(r) AS transactions
    WHERE recipients > $min_recipients
    RETURN a.id AS accountId, recipients, transactions
    ORDER BY recipients DESC, transactions DESC
    LIMIT 10
    """
    results = graph.run(fan_out_query, min_recipients=min_recipients).data()

    if not results:
        print("No significant fan-out patterns found.")
    else:
        print(f"Found {len(results)} accounts with potential 'smurfing' activity:")
        for record in results:
            print(
                f"  - Account {record['accountId']} made {record['transactions']} "
                f"outgoing transfers to {record['recipients']} distinct accounts."
            )
