# Transaction Data Analysis

This notebook analyzes transaction patterns and creates a graph structure for money laundering detection.

In [16]:
!pip install polars
!pip install networkx



## Setup

Install the required libraries (polars and networkx).

In [17]:
import polars as pl
import networkx as nx

# --- Configuration -----------------------------------------------------------
DATA_PATH = 'data/HI-Small_Trans.csv'
# Fraction of rows kept for memory management; set to None to use the full data.
SAMPLE_FRACTION = 0.10
# Fixed seed so the sampled subset is the same on every run.
SAMPLE_SEED = 42

# Read eagerly: the sampling step below operates on the in-memory DataFrame.
df = pl.read_csv(DATA_PATH)

if SAMPLE_FRACTION is not None:
    df = df.sample(fraction=SAMPLE_FRACTION, with_replacement=False, seed=SAMPLE_SEED)

# Convert to a LazyFrame whether or not sampling ran: the later cells call
# .collect() on `df`, so the conversion must not depend on the sampling step.
df = df.lazy()

# Empty directed graphs. G is referenced by the (currently commented-out)
# node/edge insertion cells; SG is not used in the cells visible here.
G = nx.DiGraph()
SG = nx.DiGraph()

## Load Data

Read transaction data from CSV file.

In [18]:
# Materialize the lazy frame and show it as the cell output (raw, string timestamps).
df.collect()

Timestamp,From Bank,Account,To Bank,Account_duplicated_0,Amount Received,Receiving Currency,Amount Paid,Payment Currency,Payment Format,Is Laundering
str,i64,str,i64,str,f64,str,f64,str,str,i64
"""2022/09/01 14:51""",122332,"""808376670""",220504,"""808763050""",165.58,"""UK Pound""",165.58,"""UK Pound""","""Cash""",0
"""2022/09/02 12:05""",70,"""100428660""",2843,"""800C08F10""",1000.0,"""US Dollar""",1000.0,"""US Dollar""","""Cheque""",0
"""2022/09/07 22:23""",14,"""8035A5A50""",3,"""8035CB140""",4449.0,"""Yuan""",4449.0,"""Yuan""","""Credit Card""",0
"""2022/09/02 15:44""",14290,"""8028B20E0""",214100,"""80B5A8EF0""",24022.22,"""Euro""",24022.22,"""Euro""","""ACH""",0
"""2022/09/01 00:28""",12735,"""803109B90""",226951,"""80A633740""",2777.01,"""US Dollar""",2777.01,"""US Dollar""","""Cheque""",0
…,…,…,…,…,…,…,…,…,…,…
"""2022/09/06 20:48""",124,"""813B87B71""",124,"""813B87B71""",0.026602,"""Bitcoin""",0.026602,"""Bitcoin""","""Bitcoin""",0
"""2022/09/02 01:05""",70,"""1004286A8""",111141,"""8144DAE90""",1614.02,"""Euro""",1614.02,"""Euro""","""Cash""",0
"""2022/09/09 12:46""",12381,"""8085123D0""",216645,"""80F2F2D50""",320.56,"""US Dollar""",320.56,"""US Dollar""","""Credit Card""",0
"""2022/09/08 16:02""",220,"""8006B0F10""",214615,"""805EE4920""",1648.87,"""US Dollar""",1648.87,"""US Dollar""","""Cheque""",0


In [19]:
# Overwrite the string "Timestamp" column with its parsed Datetime value.
# The raw values look like "2022/09/01 14:51", hence the explicit format.
df = df.with_columns(
    Timestamp=pl.col("Timestamp").str.strptime(pl.Datetime, format="%Y/%m/%d %H:%M"),
)

## Data Preparation

Convert timestamp column to datetime format.

In [20]:
# Materialize and display the frame to check that "Timestamp" is now a datetime column.
df.collect()

Timestamp,From Bank,Account,To Bank,Account_duplicated_0,Amount Received,Receiving Currency,Amount Paid,Payment Currency,Payment Format,Is Laundering
datetime[μs],i64,str,i64,str,f64,str,f64,str,str,i64
2022-09-01 14:51:00,122332,"""808376670""",220504,"""808763050""",165.58,"""UK Pound""",165.58,"""UK Pound""","""Cash""",0
2022-09-02 12:05:00,70,"""100428660""",2843,"""800C08F10""",1000.0,"""US Dollar""",1000.0,"""US Dollar""","""Cheque""",0
2022-09-07 22:23:00,14,"""8035A5A50""",3,"""8035CB140""",4449.0,"""Yuan""",4449.0,"""Yuan""","""Credit Card""",0
2022-09-02 15:44:00,14290,"""8028B20E0""",214100,"""80B5A8EF0""",24022.22,"""Euro""",24022.22,"""Euro""","""ACH""",0
2022-09-01 00:28:00,12735,"""803109B90""",226951,"""80A633740""",2777.01,"""US Dollar""",2777.01,"""US Dollar""","""Cheque""",0
…,…,…,…,…,…,…,…,…,…,…
2022-09-06 20:48:00,124,"""813B87B71""",124,"""813B87B71""",0.026602,"""Bitcoin""",0.026602,"""Bitcoin""","""Bitcoin""",0
2022-09-02 01:05:00,70,"""1004286A8""",111141,"""8144DAE90""",1614.02,"""Euro""",1614.02,"""Euro""","""Cash""",0
2022-09-09 12:46:00,12381,"""8085123D0""",216645,"""80F2F2D50""",320.56,"""US Dollar""",320.56,"""US Dollar""","""Credit Card""",0
2022-09-08 16:02:00,220,"""8006B0F10""",214615,"""805EE4920""",1648.87,"""US Dollar""",1648.87,"""US Dollar""","""Cheque""",0


Disregard transactions with Payment Format "Reinvestment".

In [21]:
# Keep every transaction whose payment format is anything other than "Reinvestment".
df = df.filter(pl.col("Payment Format").ne("Reinvestment"))

In [22]:
# Materialize and display the frame after the "Reinvestment" rows have been filtered out.
df.collect()

Timestamp,From Bank,Account,To Bank,Account_duplicated_0,Amount Received,Receiving Currency,Amount Paid,Payment Currency,Payment Format,Is Laundering
datetime[μs],i64,str,i64,str,f64,str,f64,str,str,i64
2022-09-01 14:51:00,122332,"""808376670""",220504,"""808763050""",165.58,"""UK Pound""",165.58,"""UK Pound""","""Cash""",0
2022-09-02 12:05:00,70,"""100428660""",2843,"""800C08F10""",1000.0,"""US Dollar""",1000.0,"""US Dollar""","""Cheque""",0
2022-09-07 22:23:00,14,"""8035A5A50""",3,"""8035CB140""",4449.0,"""Yuan""",4449.0,"""Yuan""","""Credit Card""",0
2022-09-02 15:44:00,14290,"""8028B20E0""",214100,"""80B5A8EF0""",24022.22,"""Euro""",24022.22,"""Euro""","""ACH""",0
2022-09-01 00:28:00,12735,"""803109B90""",226951,"""80A633740""",2777.01,"""US Dollar""",2777.01,"""US Dollar""","""Cheque""",0
…,…,…,…,…,…,…,…,…,…,…
2022-09-06 20:48:00,124,"""813B87B71""",124,"""813B87B71""",0.026602,"""Bitcoin""",0.026602,"""Bitcoin""","""Bitcoin""",0
2022-09-02 01:05:00,70,"""1004286A8""",111141,"""8144DAE90""",1614.02,"""Euro""",1614.02,"""Euro""","""Cash""",0
2022-09-09 12:46:00,12381,"""8085123D0""",216645,"""80F2F2D50""",320.56,"""US Dollar""",320.56,"""US Dollar""","""Credit Card""",0
2022-09-08 16:02:00,220,"""8006B0F10""",214615,"""805EE4920""",1648.87,"""US Dollar""",1648.87,"""US Dollar""","""Cheque""",0


## Temporal View Graph Initialization

### Create Nodes

Build graph nodes from transactions with ID, sender, receiver, time, amount, and label.

In [23]:
# One node per transaction row: number the rows, give the columns their short
# graph names, and keep only the fields used downstream.
nodes = (
    df.with_row_index("node_id")
    .rename(
        {
            "Account": "f_i",               # From
            "Account_duplicated_0": "b_i",  # Beneficiary
            "Timestamp": "t_i",             # Time
            "Amount Received": "a_i",       # Amount
        }
    )
    # "Is Laundering" keeps its name; it is the ground-truth label.
    .select("node_id", "f_i", "b_i", "t_i", "a_i", "Is Laundering")
)

In [24]:
# Preview the first rows only. Applying head() on the lazy frame *before*
# collect() avoids materializing every node just to display a handful.
nodes.head().collect()

node_id,f_i,b_i,t_i,a_i,Is Laundering
u32,str,str,datetime[μs],f64,i64
0,"""808376670""","""808763050""",2022-09-01 14:51:00,165.58,0
1,"""100428660""","""800C08F10""",2022-09-02 12:05:00,1000.0,0
2,"""8035A5A50""","""8035CB140""",2022-09-07 22:23:00,4449.0,0
3,"""8028B20E0""","""80B5A8EF0""",2022-09-02 15:44:00,24022.22,0
4,"""803109B90""","""80A633740""",2022-09-01 00:28:00,2777.01,0
…,…,…,…,…,…
459751,"""813B87B71""","""813B87B71""",2022-09-06 20:48:00,0.026602,0
459752,"""1004286A8""","""8144DAE90""",2022-09-02 01:05:00,1614.02,0
459753,"""8085123D0""","""80F2F2D50""",2022-09-09 12:46:00,320.56,0
459754,"""8006B0F10""","""805EE4920""",2022-09-08 16:02:00,1648.87,0


Add the rows of the `nodes` frame to the NetworkX graph `G` (the loop below is currently commented out).

In [25]:
# for row in nodes.collect().to_dicts():
#     G.add_node(row["node_id"], 
#                f_i=row["f_i"], 
#                b_i=row["b_i"], 
#                t_i=row["t_i"], 
#                a_i=row["a_i"], 
#                is_laundering=row["Is Laundering"])

In [26]:
# Sample some nodes to inspect
# sample_nodes = list(G.nodes)[:5]
# for node in sample_nodes:
#     print(f"Node {node}: {G.nodes[node]}")

Number of nodes in G: 459756
Node 0: {'f_i': '808376670', 'b_i': '808763050', 't_i': datetime.datetime(2022, 9, 1, 14, 51), 'a_i': 165.58, 'is_laundering': 0}
Node 1: {'f_i': '100428660', 'b_i': '800C08F10', 't_i': datetime.datetime(2022, 9, 2, 12, 5), 'a_i': 1000.0, 'is_laundering': 0}
Node 2: {'f_i': '8035A5A50', 'b_i': '8035CB140', 't_i': datetime.datetime(2022, 9, 7, 22, 23), 'a_i': 4449.0, 'is_laundering': 0}
Node 3: {'f_i': '8028B20E0', 'b_i': '80B5A8EF0', 't_i': datetime.datetime(2022, 9, 2, 15, 44), 'a_i': 24022.22, 'is_laundering': 0}
Node 4: {'f_i': '803109B90', 'b_i': '80A633740', 't_i': datetime.datetime(2022, 9, 1, 0, 28), 'a_i': 2777.01, 'is_laundering': 0}


### Create Edges

Connect transactions where one receiver becomes the sender in another transaction.

In [28]:
# Self-join the node table: pair each transaction (left side) with every
# transaction whose sender f_i equals the left side's beneficiary b_i.
# Right-hand columns that clash with left-hand names get the "_d" suffix.
edges = nodes.join(nodes, left_on="b_i", right_on="f_i", how="inner", suffix="_d")

# v_s = source transaction id, v_d = destination transaction id.
edges = edges.rename({"node_id": "v_s", "node_id_d": "v_d"})

In [29]:
# Materialize and display all candidate edges (no time-window filter has been applied yet).
edges.collect()

v_s,f_i,b_i,t_i,a_i,Is Laundering,v_d,b_i_d,t_i_d,a_i_d,Is Laundering_d
u32,str,str,datetime[μs],f64,i64,u32,str,datetime[μs],f64,i64
99295,"""100428810""","""808376670""",2022-09-10 12:26:00,94.14,0,0,"""808763050""",2022-09-01 14:51:00,165.58,0
225193,"""8076829F0""","""808376670""",2022-09-02 18:32:00,1537.16,0,0,"""808763050""",2022-09-01 14:51:00,165.58,0
280904,"""100428810""","""808376670""",2022-09-04 08:28:00,55.19,0,0,"""808763050""",2022-09-01 14:51:00,165.58,0
300146,"""100428810""","""808376670""",2022-09-02 01:56:00,55.19,0,0,"""808763050""",2022-09-01 14:51:00,165.58,0
13375,"""80571C290""","""100428660""",2022-09-09 06:35:00,529.2,0,1,"""800C08F10""",2022-09-02 12:05:00,1000.0,0
…,…,…,…,…,…,…,…,…,…,…
451074,"""805836AE0""","""1004286A8""",2022-09-09 16:27:00,3503.46,0,459752,"""8144DAE90""",2022-09-02 01:05:00,1614.02,0
454084,"""803BA4C60""","""1004286A8""",2022-09-02 13:23:00,1370.04,0,459752,"""8144DAE90""",2022-09-02 01:05:00,1614.02,0
69627,"""801F49CC0""","""8085123D0""",2022-09-08 00:56:00,1543.08,0,459753,"""80F2F2D50""",2022-09-09 12:46:00,320.56,0
213880,"""801F49CC0""","""8085123D0""",2022-09-06 04:06:00,1543.08,0,459753,"""80F2F2D50""",2022-09-09 12:46:00,320.56,0


In [30]:
# Length of the time window (24 hours) used by the edge filter in the next code cell.
timedelta = pl.duration(hours=24)

### Filter Edges

Keep only edges where the second transaction occurs within 24 hours after the first.

In [31]:
# Keep an edge only when the destination transaction happens strictly after
# the source transaction and strictly less than `timedelta` later
# (closed="none" excludes both bounds).
edges = edges.filter(
    pl.col("t_i_d").is_between(
        pl.col("t_i"),
        pl.col("t_i") + timedelta,
        closed="none",
    )
)

Add the edges to graph `G` (the code below is currently commented out and only previews a single edge row).

In [34]:
# for row in edges.select(["v_s", "v_d", "t_i", "t_i_d"]).collect().to_dicts():
#     print(row)
#     break

{'v_s': 15720, 'v_d': 1, 't_i': datetime.datetime(2022, 9, 2, 9, 51), 't_i_d': datetime.datetime(2022, 9, 2, 12, 5)}


### Temporal View Results

Display final nodes and edges.

In [32]:
# Materialize the node table and print its plain-text representation (shape plus a truncated preview).
print(nodes.collect())

shape: (459_756, 6)
┌─────────┬───────────┬───────────┬─────────────────────┬──────────┬───────────────┐
│ node_id ┆ f_i       ┆ b_i       ┆ t_i                 ┆ a_i      ┆ Is Laundering │
│ ---     ┆ ---       ┆ ---       ┆ ---                 ┆ ---      ┆ ---           │
│ u32     ┆ str       ┆ str       ┆ datetime[μs]        ┆ f64      ┆ i64           │
╞═════════╪═══════════╪═══════════╪═════════════════════╪══════════╪═══════════════╡
│ 0       ┆ 808376670 ┆ 808763050 ┆ 2022-09-01 14:51:00 ┆ 165.58   ┆ 0             │
│ 1       ┆ 100428660 ┆ 800C08F10 ┆ 2022-09-02 12:05:00 ┆ 1000.0   ┆ 0             │
│ 2       ┆ 8035A5A50 ┆ 8035CB140 ┆ 2022-09-07 22:23:00 ┆ 4449.0   ┆ 0             │
│ 3       ┆ 8028B20E0 ┆ 80B5A8EF0 ┆ 2022-09-02 15:44:00 ┆ 24022.22 ┆ 0             │
│ 4       ┆ 803109B90 ┆ 80A633740 ┆ 2022-09-01 00:28:00 ┆ 2777.01  ┆ 0             │
│ …       ┆ …         ┆ …         ┆ …                   ┆ …        ┆ …             │
│ 459751  ┆ 813B87B71 ┆ 813B87B71 ┆ 2022-09-0

In [33]:
# Print just the two endpoint ids and the two timestamps of each kept edge.
print(edges.select("v_s", "v_d", "t_i", "t_i_d").collect())

shape: (350_964, 4)
┌────────┬────────┬─────────────────────┬─────────────────────┐
│ v_s    ┆ v_d    ┆ t_i                 ┆ t_i_d               │
│ ---    ┆ ---    ┆ ---                 ┆ ---                 │
│ u32    ┆ u32    ┆ datetime[μs]        ┆ datetime[μs]        │
╞════════╪════════╪═════════════════════╪═════════════════════╡
│ 15720  ┆ 1      ┆ 2022-09-02 09:51:00 ┆ 2022-09-02 12:05:00 │
│ 17905  ┆ 1      ┆ 2022-09-02 10:00:00 ┆ 2022-09-02 12:05:00 │
│ 50608  ┆ 1      ┆ 2022-09-02 07:18:00 ┆ 2022-09-02 12:05:00 │
│ 77968  ┆ 1      ┆ 2022-09-02 07:35:00 ┆ 2022-09-02 12:05:00 │
│ 78253  ┆ 1      ┆ 2022-09-02 02:02:00 ┆ 2022-09-02 12:05:00 │
│ …      ┆ …      ┆ …                   ┆ …                   │
│ 231812 ┆ 459751 ┆ 2022-09-06 02:34:00 ┆ 2022-09-06 20:48:00 │
│ 217176 ┆ 459752 ┆ 2022-09-02 00:29:00 ┆ 2022-09-02 01:05:00 │
│ 259236 ┆ 459752 ┆ 2022-09-02 00:08:00 ┆ 2022-09-02 01:05:00 │
│ 348774 ┆ 459752 ┆ 2022-09-02 00:44:00 ┆ 2022-09-02 01:05:00 │
│ 424825 ┆ 459752 ┆ 

## Second Order Graph Creation

### Node creation

Create second-order nodes from the temporal graph's edges: each temporal edge `v_s -> v_d` becomes one node.

In [45]:
# Each temporal edge becomes one second-order node:
#   node_id = "<v_s>-><v_d>", f_i = source transaction id, b_i = destination transaction id.
s_nodes = edges.select(
    pl.concat_str(["v_s", "v_d"], separator="->").alias("node_id"),
    pl.col("v_s").alias("f_i"),
    pl.col("v_d").alias("b_i"),
)

In [46]:
# Preview a few second-order nodes. Limiting with head() before collect()
# avoids materializing the full join/filter result just to show a few rows.
# Which rows appear may differ from a full collect followed by head().
s_nodes.head().collect()

node_id,f_i,b_i
str,u32,u32
"""15720->1""",15720,1
"""17905->1""",17905,1
"""50608->1""",50608,1
"""77968->1""",77968,1
"""78253->1""",78253,1


### Edge creation

In [None]:
# Link second-order nodes that chain together: the left node's b_i (its
# destination transaction) must equal the right node's f_i (its source
# transaction). Right-hand columns that clash get the "_d" suffix.
s_edges = (
    s_nodes
    .join(s_nodes, how="inner", left_on="b_i", right_on="f_i", suffix="_d")
    .rename({"node_id": "v_s", "node_id_d": "v_d"})
)

In [63]:
# Preview a few second-order edges. head() is applied on the lazy frame so the
# full self-join is not materialized just to display a handful of rows.
# Which rows appear may differ from a full collect followed by head().
s_edges.head().collect()

v_s,f_i,b_i,v_d,b_i_d
str,u32,u32,str,u32
"""81923->404989""",81923,404989,"""404989->60""",60
"""433814->404989""",433814,404989,"""404989->60""",60
"""372745->134996""",372745,134996,"""134996->62""",62
"""90193->347236""",90193,347236,"""347236->64""",64
"""417336->19969""",417336,19969,"""19969->66""",66


### Weight Calculation

In [66]:
# Count times where one node goes to another
s_edges_with_count = s_edges.group_by(["v_s", "v_d"]).agg([
    pl.len().alias("count"),
    pl.col("f_i").first(),
    pl.col("b_i").first(),
    pl.col("b_i_d").first()
])

In [68]:
# Preview a few aggregated rows; head() before collect() keeps the
# materialized result small. group_by is called without maintain_order, so
# which rows appear is not fixed either way.
s_edges_with_count.head().collect()

v_s,v_d,count,f_i,b_i,b_i_d
str,str,u32,u32,u32,u32
"""97659->2217""","""2217->53749""",1,97659,2217,53749
"""17905->225638""","""225638->163188""",1,17905,225638,163188
"""424772->23561""","""23561->66422""",1,424772,23561,66422
"""151756->106042""","""106042->86330""",1,151756,106042,86330
"""298532->395575""","""395575->105378""",1,298532,395575,105378


In [None]:
# s_edges_with_weight = s_edges.group_by(["v_s", "v_d"]).agg([