In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Add the parent directory to the import path so that packages living one level
# above the notebook's working directory can be imported.
# The membership check keeps this cell idempotent: re-running it does not pile
# up duplicate '../' entries in sys.path.
import sys
if '../' not in sys.path:
    sys.path.append('../')

In [2]:
"""Script to pre-process AML transaction data to be used in training and inference."""
import os
import argparse
import logging
from datetime import datetime

import pandas as pd
import numpy as np
from icecream import ic

import logging
# Configure the root logger for this notebook: DEBUG level and above, timestamped
# "<time> - <level> - <message>" records, emitted through a stream handler.
#
# force=True (Python 3.8+) removes any handlers already attached to the root
# logger before applying this configuration. Without it, basicConfig() is a
# silent no-op whenever the root logger is already configured -- which happens
# when this cell is re-run, or when a library set up logging first.
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    handlers=[logging.StreamHandler()],
    force=True,
)
logger = logging.getLogger(__name__)

In [18]:
# Load the full transaction table from disk.
df = pd.read_csv('/mnt/data/ibm-transactions-for-anti-money-laundering-aml/HI-Large_Trans.csv')

# Plot the histogram of the 'Amount Received' column. The Axes returned by
# .hist() is kept and labelled instead of being passed to print(), which only
# emitted the Axes repr; the figure itself is rendered by the notebook.
amount_received = df['Amount Received']
ax = amount_received.hist(bins=10000, color='blue', edgecolor='black')
ax.set(
    title='Distribution of Amount Received',
    xlabel='Amount Received',
    ylabel='Number of transactions',
)

# Print the smallest and largest received amount.
print(amount_received.min())
print(amount_received.max())

: 

In [None]:
# For every 'Account' value, collect the distinct 'From Bank' values it occurs with.
grouped = df.groupby('Account')['From Bank'].unique()

# Keep only the accounts that are associated with more than one bank.
has_multiple_banks = grouped.apply(len) > 1
duplicated_accounts = grouped[has_multiple_banks]

print(duplicated_accounts)

Account
80C746490        [20703, 19200]
81A27EB30      [160226, 264674]
81F71BE10       [60269, 157344]
82954CE40      [144465, 143318]
83212BF10    [2125527, 1135003]
83214F0E0    [2129483, 2133788]
8347C8F30    [2142416, 2142968]
83510AC20       [51969, 243938]
83B7523F0     [2121060, 298409]
83FFB8A70     [172737, 1166034]
84ADF03D0       [164028, 38633]
851912640      [246267, 238428]
Name: From Bank, dtype: object


In [13]:
import time
from src.datasets.util.mask import PretrainType
from src.datasets import IBMTransactionsAML

# Build the AML transactions dataset from the dummy CSV with both pretraining
# objectives (masking and link prediction), a 'daily_temporal' split in
# 60/20/20 proportions, two hops of 100 sampled neighbours, and ports enabled.
dataset = IBMTransactionsAML(
    root='/mnt/data/ibm-transactions-for-anti-money-laundering-aml/dummy-c.csv',
    pretrain={PretrainType.MASK, PretrainType.LINK_PRED},
    split_type='daily_temporal',
    splits=[0.6, 0.2, 0.2],
    khop_neighbors=[100, 100],
    ports=True,
)

# Time only the materialize() call; work done by the constructor above is not
# included in this measurement. perf_counter() is a monotonic clock, so the
# elapsed time is not affected by system clock adjustments.
logger.info("Materializing dataset...")
start = time.perf_counter()
dataset.materialize()
logger.info(f"Materialized in {time.perf_counter() - start:.2f} seconds")

# Report how many numerical and categorical columns the dataset exposes.
num_columns = len(dataset.num_columns)
cat_columns = len(dataset.cat_columns)
logger.info(f"Number of numerical columns: {num_columns}")
logger.info(f"Number of categorical columns: {cat_columns}")

# Last expression of the cell so the notebook actually renders the preview
# (mid-cell, its value was discarded).
dataset.df.head(5)

2024-08-01 13:15:51 - INFO - Creating graph...
2024-08-01 13:15:52 - INFO - Graph created in 0.87 seconds.
2024-08-01 13:15:52 - INFO - Adding ports...
2024-08-01 13:16:14 - INFO - Ports added in 22.65 seconds.
2024-08-01 13:16:14 - INFO - Applying mask...
2024-08-01 13:16:14 - INFO - Loading masked columns from /mnt/data/ibm-transactions-for-anti-money-laundering-aml/dummy-c.csv.npy
2024-08-01 13:16:21 - INFO - Mask applied in 6.70 seconds.
2024-08-01 13:16:22 - INFO - Materialzing dataset...


['Amount Received', 'Receiving Currency', 'Payment Format']


2024-08-01 13:16:33 - INFO - Materialized in 11.13 seconds
2024-08-01 13:16:33 - INFO - Number of numerical columns: 1
2024-08-01 13:16:33 - INFO - Number of categorical columns: 2


In [9]:
from torch_frame.data import DataLoader

# Split the dataset into its train / validation / test parts.
train_dataset, val_dataset, test_dataset = dataset.split()

# One loader per split over the split's tensor frame, 1024 rows per batch.
# Only the training loader shuffles; validation and test keep their order.
train_loader = DataLoader(
    train_dataset.tensor_frame,
    batch_size=1024,
    shuffle=True,
)
val_loader = DataLoader(
    val_dataset.tensor_frame,
    batch_size=1024,
    shuffle=False,
)
test_loader = DataLoader(
    test_dataset.tensor_frame,
    batch_size=1024,
    shuffle=False,
)

# Report the number of batches each loader yields.
logger.info(f"train_loader size: {len(train_loader)}")
logger.info(f"val_loader size: {len(val_loader)}")
logger.info(f"test_loader size: {len(test_loader)}")

2024-08-01 13:10:32 - INFO - train_loader size: 293
2024-08-01 13:10:32 - INFO - val_loader size: 98
2024-08-01 13:10:32 - INFO - test_loader size: 98


In [10]:
# Pull the first batch from a fresh pass over the training loader and log its
# `y` attribute. The training loader shuffles, so the batch can differ per run.
train_batches = iter(train_loader)
item = next(train_batches)
logger.info(f"item: {item.y}")

2024-08-01 13:10:34 - INFO - item: tensor([[0.0000e+00, 1.0000e+00, 2.5602e+05, 4.5660e+03, 4.1174e+05],
        [2.0000e+00, 2.0000e+00, 1.5940e+05, 1.6216e+05, 4.2245e+05],
        [0.0000e+00, 2.0000e+00, 2.3275e+05, 2.3275e+05, 3.1179e+05],
        ...,
        [7.0000e+00, 1.0000e+00, 1.6315e+05, 1.6314e+05, 2.1846e+05],
        [3.6196e-01, 0.0000e+00, 1.7007e+05, 1.7007e+05, 2.2775e+05],
        [2.0000e+00, 2.0000e+00, 2.1114e+05, 2.1044e+05, 2.8283e+05]],
       dtype=torch.float64)
