In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
import networkx as nx

In [3]:
df = pd.read_csv("/kaggle/input/amlaaa/amlsim.csv")
df.head()

Unnamed: 0,Timestamp,From Bank,Account,To Bank,Account.1,Amount Received,Receiving Currency,Amount Paid,Payment Currency,Payment Format,Is Laundering,Pattern Type
0,9/5/2022 0:51,4,8060AD3E0,17554,806BD22D0,117010.83,Ruble,117010.83,Ruble,Credit Card,0,Not Laundering
1,9/5/2022 0:37,70,1004287C8,217824,807125310,98271.67,Ruble,98271.67,Ruble,Cheque,1,Unknown
2,9/1/2022 0:04,120292,807DF0AE0,120292,807DF0AE0,3.44,UK Pound,3.44,UK Pound,Reinvestment,0,GATHER-SCATTER
3,9/10/2022 18:14,220,813741041,220,813741041,0.036028,Bitcoin,0.036028,Bitcoin,Bitcoin,0,Not Laundering
4,9/4/2022 13:58,70,100428660,11642,807BE8740,330.4,US Dollar,330.4,US Dollar,Credit Card,0,CYCLE


In [4]:
df.info

<bound method DataFrame.info of              Timestamp  From Bank    Account  To Bank  Account.1  \
0        9/5/2022 0:51          4  8060AD3E0    17554  806BD22D0   
1        9/5/2022 0:37         70  1004287C8   217824  807125310   
2        9/1/2022 0:04     120292  807DF0AE0   120292  807DF0AE0   
3      9/10/2022 18:14        220  813741041      220  813741041   
4       9/4/2022 13:58         70  100428660    11642  807BE8740   
...                ...        ...        ...      ...        ...   
25880   9/9/2022 17:00     310386  803D3E8C0     5836  803D35770   
25881    9/2/2022 4:31      21611  8019202C0   224449  80B29C020   
25882   9/2/2022 18:48      10057  803FEFF90        3  802A4BBD0   
25883   9/9/2022 16:58     316952  806281B30   116425  806281AE0   
25884   9/1/2022 17:48         70  100428660    14549  802010770   

       Amount Received Receiving Currency    Amount Paid Payment Currency  \
0        117010.830000              Ruble  117010.830000            Ruble 

In [5]:
df.columns = (
    df.columns
    .str.replace(" ", "", regex=False)
    .str.replace("\t", "", regex=False)
    .str.replace("\n", "", regex=False)
    .str.strip()
)
df.columns


Index(['Timestamp', 'FromBank', 'Account', 'ToBank', 'Account.1',
       'AmountReceived', 'ReceivingCurrency', 'AmountPaid', 'PaymentCurrency',
       'PaymentFormat', 'IsLaundering', 'PatternType'],
      dtype='object')

In [6]:
if 'FromBank' in df.columns and 'Account' in df.columns:
    df['FromBankAccount'] = df['FromBank'].astype(str) + df['Account'].astype(str)

if 'ToBank' in df.columns and 'Account.1' in df.columns:
    df['ToBankAccount'] = df['ToBank'].astype(str) + df['Account.1'].astype(str)

df.drop(columns=['FromBank', 'Account', 'ToBank', 'Account.1'], errors='ignore', inplace=True)
df.head()


Unnamed: 0,Timestamp,AmountReceived,ReceivingCurrency,AmountPaid,PaymentCurrency,PaymentFormat,IsLaundering,PatternType,FromBankAccount,ToBankAccount
0,9/5/2022 0:51,117010.83,Ruble,117010.83,Ruble,Credit Card,0,Not Laundering,48060AD3E0,17554806BD22D0
1,9/5/2022 0:37,98271.67,Ruble,98271.67,Ruble,Cheque,1,Unknown,701004287C8,217824807125310
2,9/1/2022 0:04,3.44,UK Pound,3.44,UK Pound,Reinvestment,0,GATHER-SCATTER,120292807DF0AE0,120292807DF0AE0
3,9/10/2022 18:14,0.036028,Bitcoin,0.036028,Bitcoin,Bitcoin,0,Not Laundering,220813741041,220813741041
4,9/4/2022 13:58,330.4,US Dollar,330.4,US Dollar,Credit Card,0,CYCLE,70100428660,11642807BE8740


In [7]:
if 'Timestamp' in df.columns:
    df['Timestamp'] = pd.to_datetime(df['Timestamp'], errors='coerce')
    df['Timestamp'] = df['Timestamp'].astype('int64') / 1e9
df.head()


Unnamed: 0,Timestamp,AmountReceived,ReceivingCurrency,AmountPaid,PaymentCurrency,PaymentFormat,IsLaundering,PatternType,FromBankAccount,ToBankAccount
0,1662339000.0,117010.83,Ruble,117010.83,Ruble,Credit Card,0,Not Laundering,48060AD3E0,17554806BD22D0
1,1662338000.0,98271.67,Ruble,98271.67,Ruble,Cheque,1,Unknown,701004287C8,217824807125310
2,1661991000.0,3.44,UK Pound,3.44,UK Pound,Reinvestment,0,GATHER-SCATTER,120292807DF0AE0,120292807DF0AE0
3,1662834000.0,0.036028,Bitcoin,0.036028,Bitcoin,Bitcoin,0,Not Laundering,220813741041,220813741041
4,1662300000.0,330.4,US Dollar,330.4,US Dollar,Credit Card,0,CYCLE,70100428660,11642807BE8740


In [8]:
df = df.fillna(0)


In [9]:
numeric_cols = ['AmountReceived', 'AmountPaid', 'Timestamp']
existing_numeric_cols = [col for col in numeric_cols if col in df.columns]

scaler = MinMaxScaler()
df[existing_numeric_cols] = scaler.fit_transform(df[existing_numeric_cols])
df.head()


Unnamed: 0,Timestamp,AmountReceived,ReceivingCurrency,AmountPaid,PaymentCurrency,PaymentFormat,IsLaundering,PatternType,FromBankAccount,ToBankAccount
0,0.228258,1.37898e-06,Ruble,1.37898e-06,Ruble,Credit Card,0,Not Laundering,48060AD3E0,17554806BD22D0
1,0.227708,1.158138e-06,Ruble,1.158138e-06,Ruble,Cheque,1,Unknown,701004287C8,217824807125310
2,0.000157,4.054062e-11,UK Pound,4.054062e-11,UK Pound,Reinvestment,0,GATHER-SCATTER,120292807DF0AE0,120292807DF0AE0
3,0.552047,4.245806e-13,Bitcoin,4.245806e-13,Bitcoin,Bitcoin,0,Not Laundering,220813741041,220813741041
4,0.202608,3.893786e-09,US Dollar,3.893786e-09,US Dollar,Credit Card,0,CYCLE,70100428660,11642807BE8740


In [10]:
categorical_cols = ['ReceivingCurrency', 'PaymentCurrency', 'PaymentFormat']
existing_categorical_cols = [col for col in categorical_cols if col in df.columns]

df = pd.get_dummies(df, columns=existing_categorical_cols, drop_first=True)
df.head()


Unnamed: 0,Timestamp,AmountReceived,AmountPaid,IsLaundering,PatternType,FromBankAccount,ToBankAccount,ReceivingCurrency_Bitcoin,ReceivingCurrency_Brazil Real,ReceivingCurrency_Canadian Dollar,...,PaymentCurrency_UK Pound,PaymentCurrency_US Dollar,PaymentCurrency_Yen,PaymentCurrency_Yuan,PaymentFormat_Bitcoin,PaymentFormat_Cash,PaymentFormat_Cheque,PaymentFormat_Credit Card,PaymentFormat_Reinvestment,PaymentFormat_Wire
0,0.228258,1.37898e-06,1.37898e-06,0,Not Laundering,48060AD3E0,17554806BD22D0,False,False,False,...,False,False,False,False,False,False,False,True,False,False
1,0.227708,1.158138e-06,1.158138e-06,1,Unknown,701004287C8,217824807125310,False,False,False,...,False,False,False,False,False,False,True,False,False,False
2,0.000157,4.054062e-11,4.054062e-11,0,GATHER-SCATTER,120292807DF0AE0,120292807DF0AE0,False,False,False,...,True,False,False,False,False,False,False,False,True,False
3,0.552047,4.245806e-13,4.245806e-13,0,Not Laundering,220813741041,220813741041,True,False,False,...,False,False,False,False,True,False,False,False,False,False
4,0.202608,3.893786e-09,3.893786e-09,0,CYCLE,70100428660,11642807BE8740,False,False,False,...,False,True,False,False,False,False,False,True,False,False


In [11]:
df.shape

value_counts = df['IsLaundering'].value_counts()

print(value_counts)


IsLaundering
0    20708
1     5177
Name: count, dtype: int64


In [12]:
# Filter the rows where IsLaundering == 0
df_zeros = df[df['IsLaundering'] == 0].sample(n=7500, random_state=42)  # Sample 10,000 zeros
df_ones = df[df['IsLaundering'] == 1]
df_balanced = pd.concat([df_zeros, df_ones])
df= df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)
print(df['IsLaundering'].value_counts())


IsLaundering
0    7500
1    5177
Name: count, dtype: int64


In [14]:
G = nx.DiGraph()
for idx, row in df.iterrows():
    G.add_edge(row['FromBankAccount'], row['ToBankAccount'], idx=idx)

degree_dict = dict(G.degree())
betweenness_dict = nx.betweenness_centrality(G, normalized=True)
clustering_dict = nx.clustering(G.to_undirected())

try:
    import community as community_louvain
    partition = community_louvain.best_partition(G.to_undirected())
except:
    partition = {node: 0 for node in G.nodes()}

df['from_degree'] = df['FromBankAccount'].map(degree_dict).fillna(0)
df['to_degree'] = df['ToBankAccount'].map(degree_dict).fillna(0)

df['from_betweenness'] = df['FromBankAccount'].map(betweenness_dict).fillna(0)
df['to_betweenness'] = df['ToBankAccount'].map(betweenness_dict).fillna(0)

df['sender_clustering'] = df['FromBankAccount'].map(clustering_dict).fillna(0)
df['receiver_clustering'] = df['ToBankAccount'].map(clustering_dict).fillna(0)


df.head()


Unnamed: 0,Timestamp,AmountReceived,AmountPaid,IsLaundering,PatternType,FromBankAccount,ToBankAccount,ReceivingCurrency_Bitcoin,ReceivingCurrency_Brazil Real,ReceivingCurrency_Canadian Dollar,...,PaymentFormat_Cheque,PaymentFormat_Credit Card,PaymentFormat_Reinvestment,PaymentFormat_Wire,from_degree,to_degree,from_betweenness,to_betweenness,sender_clustering,receiver_clustering
0,0.07644,1.90317e-09,1.90317e-09,0,Not Laundering,3963380A3B3390,22189180A3B33E0,False,False,False,...,True,False,False,False,1,1,0.0,0.0,0.0,0.0
1,0.651583,1.598277e-07,1.598277e-07,1,SCATTER-GATHER,24803D94320,160180340DCF0,False,False,False,...,False,False,False,False,2,13,2.124678e-10,0.0,0.0,0.0
2,0.440962,1.101459e-07,1.101459e-07,1,GATHER-SCATTER,2174980154AAF0,21387800ADE000,False,False,False,...,False,False,False,False,27,1,6.104201e-07,0.0,0.0,0.0
3,0.26035,8.740984e-14,8.740984e-14,0,Not Laundering,70100428A51,153827814903261,True,False,False,...,False,False,False,False,36,1,0.0,0.0,0.0,0.0
4,0.031699,1.187935e-10,1.187935e-10,0,Not Laundering,21805080C71DF90,21805080C71DF90,False,False,False,...,False,False,True,False,2,2,0.0,0.0,0.0,0.0


In [15]:
df.to_csv('processed_amlsim.csv', index=False)

In [16]:
df_process= pd.read_csv("/kaggle/working/processed_amlsim.csv")
df_process.tail()

Unnamed: 0,Timestamp,AmountReceived,AmountPaid,IsLaundering,PatternType,FromBankAccount,ToBankAccount,ReceivingCurrency_Bitcoin,ReceivingCurrency_Brazil Real,ReceivingCurrency_Canadian Dollar,...,PaymentFormat_Cheque,PaymentFormat_Credit Card,PaymentFormat_Reinvestment,PaymentFormat_Wire,from_degree,to_degree,from_betweenness,to_betweenness,sender_clustering,receiver_clustering
12672,0.045212,7.782434e-08,7.782434e-08,1,GATHER-SCATTER,13347180CA21140,211808E44B10,False,False,False,...,False,False,False,False,1,25,0.0,5.38606e-07,0.0,0.0
12673,0.281955,1.032372e-10,1.032372e-10,0,Not Laundering,2191848073F1670,226331809939590,False,False,False,...,False,False,False,False,1,1,0.0,0.0,0.0,0.0
12674,0.304894,2.232563e-09,2.232563e-09,0,Not Laundering,210809083000,125607809D48E80,False,False,True,...,True,False,False,False,1,1,0.0,0.0,0.0,0.0
12675,0.246602,3.304886e-09,3.304886e-09,0,Not Laundering,4403806A0E9F0,12714280D0B93C0,False,False,False,...,True,False,False,False,1,1,0.0,0.0,0.0,0.0
12676,0.521329,2.331064e-06,2.331064e-06,0,Not Laundering,12330880A5E9640,29794811AF5270,False,False,False,...,False,False,False,False,1,1,0.0,0.0,0.0,0.0


In [24]:
# Get a count of each unique value in 'sender_clustering'
print(df['sender_clustering'].value_counts())


sender_clustering
0.000000    12388
1.000000      131
0.333333       42
0.300000       18
0.395604       17
0.285714       16
0.196970       12
0.400000        8
0.666667        7
0.714286        6
0.500000        6
0.833333        5
0.357143        4
0.600000        3
0.866667        3
0.700000        3
0.176471        2
0.179487        2
0.900000        2
0.200000        2
Name: count, dtype: int64


In [27]:
import networkx as nx
from community import community_louvain
import pandas as pd



# Use Louvain to partition the graph and assign cluster IDs
partition = community_louvain.best_partition(G.to_undirected())

# Now, 'partition' will have cluster IDs for each node
# Map the 'FromBankAccount' to their respective cluster IDs
df['ClusterID'] = df['FromBankAccount'].map(partition)

# Check the result
print(df[['FromBankAccount', 'ClusterID']].head(20))


    FromBankAccount  ClusterID
0    3963380A3B3390          0
1       24803D94320          1
2    2174980154AAF0        114
3       70100428A51          3
4   21805080C71DF90          4
5    12381801318F70          5
6    2943580D955F40          6
7       70100428660          7
8      513804FA8C30          8
9      126812D1FE11          9
10  210404805448AC0         10
11   29251806E90D80         11
12    158880E267B00         12
13   219188089DC420         13
14   167888041F5380         14
15      701004288A0         15
16   21568808BC5830         16
17   22345800E2C3D0         17
18   2763780474C250         18
19     11280E574380         19


In [28]:
# Assign the 'ClusterID' based on the Louvain communities
df['ClusterID'].head()

0      0
1      1
2    114
3      3
4      4
Name: ClusterID, dtype: int64

In [29]:
df.head()

Unnamed: 0,Timestamp,AmountReceived,AmountPaid,IsLaundering,PatternType,FromBankAccount,ToBankAccount,ReceivingCurrency_Bitcoin,ReceivingCurrency_Brazil Real,ReceivingCurrency_Canadian Dollar,...,PaymentFormat_Credit Card,PaymentFormat_Reinvestment,PaymentFormat_Wire,from_degree,to_degree,from_betweenness,to_betweenness,sender_clustering,receiver_clustering,ClusterID
0,0.07644,1.90317e-09,1.90317e-09,0,Not Laundering,3963380A3B3390,22189180A3B33E0,False,False,False,...,False,False,False,1,1,0.0,0.0,0.0,0.0,0
1,0.651583,1.598277e-07,1.598277e-07,1,SCATTER-GATHER,24803D94320,160180340DCF0,False,False,False,...,False,False,False,2,13,2.124678e-10,0.0,0.0,0.0,1
2,0.440962,1.101459e-07,1.101459e-07,1,GATHER-SCATTER,2174980154AAF0,21387800ADE000,False,False,False,...,False,False,False,27,1,6.104201e-07,0.0,0.0,0.0,114
3,0.26035,8.740984e-14,8.740984e-14,0,Not Laundering,70100428A51,153827814903261,True,False,False,...,False,False,False,36,1,0.0,0.0,0.0,0.0,3
4,0.031699,1.187935e-10,1.187935e-10,0,Not Laundering,21805080C71DF90,21805080C71DF90,False,False,False,...,False,True,False,2,2,0.0,0.0,0.0,0.0,4


In [30]:
df.to_csv('processed_amlsim_final.csv', index=False)
