In [1]:
%load_ext autoreload
%autoreload 2

In [38]:
import os

import torch
import torch.nn.functional as F
import numpy as np

from torch_frame.data import DataLoader
from torch_geometric.data import Data
from src.nn.gnn.model import GINe

from tqdm import tqdm
from transformers import get_inverse_sqrt_schedule

from torch.nn import BCELoss

import wandb
from icecream import ic

In [3]:
seed = 42
batch_size = 2
lr = 5e-4
eps = 1e-8
epochs = 20
device = 'cuda' if torch.cuda.is_available() else 'cpu'
args = {
    'testing': True,
    'batch_size': batch_size,
    'seed': seed,
    'device': device,
}

In [4]:
wandb.login()
run = wandb.init(
    mode="disabled" if args['testing'] else "online",
    project=f"rel-mm", 
    name="model=GINe,dataset=IBM-AML_Hi_Sm,objective=lp", 
    config=args
)

[34m[1mwandb[0m: Currently logged in as: [33maakyildiz[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [5]:
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
# When running on the CuDNN backend, two further options must be set
# torch.backends.cudnn.deterministic = True
# torch.backends.cudnn.benchmark = False
# Set a fixed value for the hash seed
os.environ["PYTHONHASHSEED"] = str(seed)

In [6]:
from src.datasets import IBMTransactionsAML
dataset = IBMTransactionsAML(root='/mnt/data/ibm-transactions-for-anti-money-laundering-aml/dummy.csv')
#dataset = IBMTransactionsAML(root='/mnt/data/ibm-transactions-for-anti-money-laundering-aml/HI-Small_Trans-cleaned.csv', pretrain=pretrain, split_type='temporal', splits=data_split)
ic(dataset)
dataset.materialize()
dataset.df.head(5)

ic| dataset: IBMTransactionsAML()


Unnamed: 0,Timestamp,From Bank,From ID,To Bank,To ID,Amount Received,Receiving Currency,Amount Paid,Payment Currency,Payment Format,Is Laundering,split
0,1200,B_10,8000EBD30,B_10,8000EBD30,3.53372e-09,US Dollar,3.53372e-09,US Dollar,Reinvestment,0,0
1,1200,B_3208,8000F4580,B_1,8000F5340,9.556511e-15,US Dollar,9.556511e-15,US Dollar,Cheque,0,0
2,0,B_3209,8000F4670,B_3209,8000F4670,1.402613e-08,US Dollar,1.402613e-08,US Dollar,Reinvestment,0,0
3,120,B_12,8000F5030,B_12,8000F5030,2.682752e-09,US Dollar,2.682752e-09,US Dollar,Reinvestment,0,0
4,360,B_10,8000F5200,B_10,8000F5200,3.505963e-08,US Dollar,3.505963e-08,US Dollar,Reinvestment,0,0


In [7]:
train_dataset, val_dataset, test_dataset = dataset.split()

In [8]:
train_tensor_frame = train_dataset.tensor_frame
train_loader = DataLoader(train_tensor_frame, batch_size=batch_size, shuffle=True)
val_tensor_frame = val_dataset.tensor_frame
val_loader = DataLoader(val_tensor_frame, batch_size=batch_size, shuffle=True)
test_tensor_frame = test_dataset.tensor_frame
test_loader = DataLoader(test_tensor_frame, batch_size=batch_size, shuffle=True)


In [9]:
# create a pytorch geometric data object from the tensorframe using the From ID and To ID columns as the edge index
source = train_tensor_frame.get_col_feat('From ID')
destination = train_tensor_frame.get_col_feat('To ID')

#create dummy node features
num_nodes = np.unique(np.concatenate([source, destination])).shape[0]
ic(num_nodes)
node_feat = torch.ones(num_nodes)

edge_index = torch.cat([source, destination], dim=1).t()
ic(edge_index.shape)
g = Data(node_feat, edge_index=edge_index, edge_attr=train_tensor_frame)

ic| num_nodes: 298015
ic| edge_index.shape: torch.Size([2, 499843])


In [10]:
model = GINe(num_features=1, num_gnn_layers=3, edge_dim=train_dataset.tensor_frame.num_cols-3)
model.to(args['device'])
learnable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
ic(learnable_params)
wandb.log({"learnable_params": learnable_params})

ic| learnable_params: 125177


In [11]:
batch = next(iter(train_loader))
ic(batch)

ic| batch: TensorFrame(
             num_cols=10,
             num_rows=2,
             categorical (7): ['From Bank', 'From ID', 'Payment Currency', 'Payment Format', 'Receiving Currency', 'To Bank', 'To ID'],
             timestamp (1): ['Timestamp']

,
             numerical (2): ['Amount Paid', 'Amount Received'],
             has_target=True,
             device='cpu',
           )


TensorFrame(
  num_cols=10,
  num_rows=2,
  categorical (7): ['From Bank', 'From ID', 'Payment Currency', 'Payment Format', 'Receiving Currency', 'To Bank', 'To ID'],
  timestamp (1): ['Timestamp'],
  numerical (2): ['Amount Paid', 'Amount Received'],
  has_target=True,
  device='cpu',
)

In [13]:
def get_gnn_inputs(batch):
    source = batch.get_col_feat('From ID')
    destination = batch.get_col_feat('To ID')
    ic(source, destination)
    feat_cols = train_dataset.feat_cols

    # generalize the trainable columns
    feat_cols.remove('Timestamp')
    feat_cols.remove('From ID')
    feat_cols.remove('To ID')

    # a very crude approach
    feats = [batch.get_col_feat(col_name) for col_name in feat_cols]
    edge_attr = torch.cat(feats, dim=1).to(device)
    nodes = torch.unique(torch.cat([source, destination]))
    num_nodes = nodes.shape[0]
    ic(num_nodes)

    n_id_map = {value.item(): index for index, value in enumerate(nodes)}
    ic(n_id_map)
    local_source = torch.tensor([n_id_map[node.item()] for node in source], dtype=torch.long)
    local_destination = torch.tensor([n_id_map[node.item()] for node in destination], dtype=torch.long)
    ic(local_source, local_destination)
    edge_index = torch.cat((local_source.unsqueeze(0), local_destination.unsqueeze(0))).to(device)
    ic(edge_index)
    ic(edge_index.shape)
    node_feats = torch.ones(num_nodes).view(-1,num_nodes).t().to(device)
    ic(node_feats, node_feats.shape)
    return edge_index, edge_attr, node_feats
edge_index, edge_attr, node_feats = get_gnn_inputs(batch)

ic| source: tensor([[ 76178],
                    [209722]]

)
    destination: tensor([[ 76536],
                         [223466]])
ic| num_nodes: 4
ic| n_id_map: {76178: 0, 76536: 1, 209722: 2, 223466: 3}
ic| local_source: tensor([0, 2]), local_destination: tensor([1, 3])
ic| edge_index: tensor([[0, 2],
                        [1, 3]], device='cuda:0')
ic| edge_index.shape: torch.Size([2, 2])
ic| node_feats: tensor([[1.],
                        [1.],
                        [1.],
                        [1.]], device='cuda:0')
    node_feats.shape: torch.Size([4, 1])


In [41]:
# Prepare optimizer and lr scheduler
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=lr, eps=eps)
scheduler = get_inverse_sqrt_schedule(optimizer, num_warmup_steps=0, timescale=1000)
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

def calc_loss(pred, y):
    y = torch.nn.functional.one_hot(y.to(torch.int64), num_classes=2).to(torch.float32)
    ic(pred)
    ic(y)
    return F.binary_cross_entropy(pred, y)

def train(epoc: int) -> float:
    model.train()
    loss_accum = total_count = 0

    with tqdm(train_loader, desc=f'Epoch {epoc}') as t:
        for tf in t:
            tf = tf.to(device)
            # import sys
            # sys.exit()
            edge_index, edge_attr, node_feats = get_gnn_inputs(tf)
            pred = model(node_feats, edge_index, edge_attr)
            ic(tf.y)
            ic(pred)
            loss = calc_loss(pred, tf.y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            loss_accum += float(loss) * len(tf.y)
            total_count += len(tf.y)
            t.set_postfix(loss=f'{loss_accum/total_count:.4f}')
            del pred
            del tf
        wandb.log({"train_loss": loss_accum/total_count})
    return loss_accum / total_count

@torch.no_grad()
def test(loader: DataLoader, dataset_name) -> float:
    model.eval()
    accum_acc = 0
    loss_accum = 0
    total_count = 0
    with tqdm(loader, desc=f'Evaluating') as t:
        for tf in t:
            tf = tf.to(device)
            pred = model(tf)
            loss = calc_loss(pred, tf.y)
            loss_accum += float(loss) * len(tf.y)
            total_count += len(tf.y)
            t.set_postfix(accuracy=f'{accum_acc/total_count:.4f}')
        wandb.log({f"{dataset_name}_accuracy": accum_acc/total_count})
        del tf
        del pred
        accuracy = accum_acc / total_count
        return accuracy

In [42]:
for epoch in range(1, epochs + 1):
    train_loss = train(epoch)
    train_metric = test(train_loader, "train")
    val_metric = test(val_loader, "val")
    #test_metric = test(test_loader, "test")
    ic(
        train_loss, 
        train_metric, 
        val_metric, 
        #test_metric
    )

Epoch 1:   0%|                                                                                                                                                                                                          | 0/249922 [00:00<?, ?it/s]

ic| source: tensor([[5277],
                    [3441]], device='cuda:0')
    destination: tensor([[10764],
                         [63226]], device='cuda:0')
ic| num_nodes: 4
ic| n_id_map: {3441: 0, 5277: 1, 10764: 2, 63226: 3}
ic| local_source: tensor([1, 0]), local_destination: tensor([2, 3])
ic| edge_index: tensor([[1, 0],
                        [2, 3]], device='cuda:0')
ic| edge_index.shape: torch.Size([2, 2])
ic| node_feats: tensor([[1.],
                        [1.],
                        [1.],
                        [1.]], device='cuda:0')
    node_feats.shape: torch.Size([4, 1])
ic| tf.y: tensor([0, 0], device='cuda:0')
ic| pred: tensor([[1.0000, 0.6141],
                  [0.7087, 0.7228]], device='cuda:0', grad_fn=<SigmoidBackward0>)
ic| pred: tensor([[1.0000, 0.6141],
                  [0.7087, 0.7228]], device='cuda:0', grad_fn=<SigmoidBackward0>)
ic| y: tensor([[1., 0.],
               [1., 0.]], device='cuda:0')
Epoch 1:   0%|                                        