## Install libraries

In [None]:
!pip install pyg-lib torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric -f https://data.pyg.org/whl/torch-1.13.0+cpu.html

In [None]:
!pip install networkx

In [None]:
!pip install pygod 

In [6]:
# check the memory - the graphs can be large
!free -h

              total        used        free      shared  buff/cache   available
Mem:            83G        787M         81G        1.2M        885M         81G
Swap:            0B          0B          0B


In [1]:
import sys
import os
import pandas as pd

## Mount Google Cloud Storage buckets

In [2]:
from google.colab import auth
auth.authenticate_user()

In [None]:
!echo "deb http://packages.cloud.google.com/apt gcsfuse-bionic main" > /etc/apt/sources.list.d/gcsfuse.list
!curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key add -
!apt -qq update
!apt -qq install gcsfuse

In [8]:
!mkdir /content/network-data/

In [9]:
!gcsfuse tilek-bg-export /content/network-data

2022/12/11 18:36:26.561966 Start gcsfuse/0.41.9 (Go version go1.18.4) for app "" using mount point: /content/network-data
2022/12/11 18:36:26.574041 Opening GCS connection...
2022/12/11 18:36:26.752713 Mounting file system "tilek-bg-export"...
2022/12/11 18:36:26.753182 File system has been successfully mounted.


In [9]:
!mkdir /content/address-data/

mkdir: cannot create directory ‘/content/address-data/’: File exists


In [11]:
!gcsfuse tilek-defi-network-data /content/address-data

2022/12/11 18:36:33.346682 Start gcsfuse/0.41.9 (Go version go1.18.4) for app "" using mount point: /content/address-data
2022/12/11 18:36:33.358826 Opening GCS connection...
2022/12/11 18:36:33.547736 Mounting file system "tilek-defi-network-data"...
2022/12/11 18:36:33.548003 File system has been successfully mounted.


In [None]:
!ls /content/network-data/

In [3]:
!ls /content/address-data/

address_list.csv  edge_data  fraud-detection-dataset  node_data


## Fetch data

In [2]:
# sample a subset of nodes

df = pd.read_csv("/content/address-data/fraud-detection-dataset/transaction_dataset.csv", sep=',')

sample_df = df.sample(n=100)
sample_df[sample_df.FLAG==1] 

Unnamed: 0.1,Unnamed: 0,Index,Address,FLAG,Avg min between sent tnx,Avg min between received tnx,Time Diff between first and last (Mins),Sent tnx,Received Tnx,Number of Created Contracts,...,ERC20 min val sent,ERC20 max val sent,ERC20 avg val sent,ERC20 min val sent contract,ERC20 max val sent contract,ERC20 avg val sent contract,ERC20 uniq sent token name,ERC20 uniq rec token name,ERC20 most sent token type,ERC20_most_rec_token_type
9359,9359,1698,0xc71610821ac6042e52be9d7d76df4c425f30f41f,1,0.0,0.0,0.0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,,Blockwell say NOTSAFU
8152,8152,491,0x379ce20c018fb6301c1872c429ec7270ffa4dc5b,1,4.27,0.0,34.17,8,1,0,...,3000.0,7000.0,3784.217286,0.0,0.0,0.0,1.0,2.0,Golem,Golem
9586,9586,1925,0xe2bca95f5f33a981aaf51372b9596ff3837e0a5c,1,0.0,0.0,0.0,0,0,0,...,,,,,,,,,,
7848,7848,187,0x1494403137159bb0dc545d11963fdf797ea1ecab,1,0.0,19.35,25707.98,1,4,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,,INS Promo
7923,7923,262,0x1c40d1a1cac7c586b9509c565296f91c8441af9f,1,0.0,183.01,2990.05,1,7,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,,Blockwell say NOTSAFU
8568,8568,907,0x67fdeda858e1160ba88dda9b9626035c65dc7607,1,0.0,0.0,0.0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,,Blockwell say NOTSAFU
7820,7820,159,0x1038945a8acf315ffc97d54d3b4b72f3ed2fe2bb,1,3754.31,9279.18,157959.3,5,15,0,...,,,,,,,,,,
9018,9018,1357,0x9d4b62503b4b7993182323effe6245f6d77e4413,1,0.0,726.18,386616.2,1,18,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,,GSENetwork
9466,9466,1805,0xd42c3960d30f36c5a4e58d591180f58a34354641,1,0.0,0.0,0.0,0,0,0,...,,,,,,,,,,
9104,9104,1443,0xa746d613d9b3a267ad470e5ce980dbc12473247c,1,0.0,0.0,0.0,0,0,0,...,,,,,,,,,,


In [None]:
flags = {}

for ind, row in df.iterrows():
  flags[row['Address']] = row['FLAG']

# flags

In [3]:
selected_addr = set(sample_df['Address'].tolist())
# selected_addr

In [4]:
address_index = {}
selected_addr_list = list(selected_addr)

for i in range(len(selected_addr_list)):
  address_index[selected_addr_list[i]] = i

In [None]:
root_data_dir = "/content/network-data/"

node_a = []
node_b = []
timestamps = []
values = []

files = os.listdir(root_data_dir)

count = 0

for f in files:

  if f.find('.csv') >= 0:

    print('processing: ', f)

    df = pd.read_csv(root_data_dir+f)
    df = df.dropna()

    df['tstamp'] =  pd.to_datetime(df['block_timestamp'], format='%Y-%m-%d %H:%M:%S UTC')
    df['epochs'] = df['tstamp'].map(pd.Timestamp.timestamp).astype('int32')

    df.value = df.value.astype('float')
    df.value = df.value / 10**18

    from_addr = df['from_address'].tolist()
    to_addr = df['to_address'].tolist()
    cur_timestamps = df['epochs'].tolist()
    cur_val= df['value'].tolist()

    for i in range(len(from_addr)):
      if from_addr[i] in selected_addr or to_addr[i] in selected_addr:

        if from_addr[i] in address_index:
          cur_from_index = address_index[from_addr[i]]
        else:
          cur_from_index = len(address_index)
          address_index[from_addr[i]] = cur_from_index

        if to_addr[i] in address_index:
          cur_to_index = address_index[to_addr[i]]
        else:
          cur_to_index = len(address_index)
          address_index[to_addr[i]] = cur_to_index

        node_a.append(cur_from_index)
        node_b.append(cur_to_index)

        timestamps.append(cur_timestamps[i])
        values.append(cur_val[i])

    # count += 1
    # if count == 10:
    #   break

    del df

In [6]:
len(node_a), len(node_b), len(timestamps), len(values), len(address_index)

(37702, 37702, 37702, 37702, 1866)

In [7]:
len(selected_addr)

100

## Large graph (SKIP)

In [None]:
address_index = {}

with open("/content/address-data/address_list.csv", "r") as f:
  address_lines = f.readlines()

for i in range(1, len(address_lines)):
  cur_addr = address_lines[i].replace('\n', '')
  address_index[cur_addr] = i-1

del address_lines

mem_size = sys.getsizeof(address_index)
print('lookup table size: ', round(mem_size/1024/1025, 2), ' MB')

lookup table size:  319.69  MB


In [None]:
root_data_dir = "/content/network-data/"

node_a = []
node_b = []

files = os.listdir(root_data_dir)

for f in files:

  print('processing: ', f)

  df = pd.read_csv(root_data_dir+f)
  df = df.dropna()
  df['from_addr_idx'] = df['from_address'].map(address_index)
  df['to_addr_idx'] = df['to_address'].map(address_index)

  df.value = df.value.astype('float')
  df.value = df.value / 10**18

  df = df.astype({'from_addr_idx': 'Int64', 'to_addr_idx': 'Int64'})

  node_a.extend(df['from_addr_idx'].tolist())
  node_b.extend(df['to_addr_idx'].tolist())

  del df

In [None]:
with open('/content/node_a.lst', "w") as f:
  for node in node_a:
    f.write(str(node)+'\n')

In [None]:
with open('/content/node_b.lst', "w") as f:
  for node in node_b:
    f.write(str(node)+'\n')

In [None]:
!cp /content/node_a.lst /content/address-data/node_data/

In [None]:
!cp /content/node_b.lst /content/address-data/node_data/

In [None]:
len(node_a), len(node_b)

(77920823, 77920823)

In [None]:
sys.getsizeof(node_a)/1024/1024 + sys.getsizeof(node_b)/1024/1024

1317.0179290771484

1317.0179290771484

In [None]:
root_data_dir = "/content/network-data/"

values = []

files = os.listdir(root_data_dir)

for f in files:

  print('processing: ', f)

  df = pd.read_csv(root_data_dir+f)
  df = df.dropna()

  df.value = df.value.astype('float')
  df.value = df.value / 10**18

  values.extend(df['value'].tolist())

  del df

In [None]:
with open('/content/values.lst', "w") as f:
  for val in values:
    f.write(str(val)+'\n')

In [None]:
!head /content/values.lst

In [None]:
len(values)

77920823

In [None]:
!cp /content/values.lst /content/address-data/edge_data/

In [None]:
root_data_dir = "/content/network-data/"

timestamps = []

files = os.listdir(root_data_dir)

for f in files:

  print('processing: ', f)

  df = pd.read_csv(root_data_dir+f)
  df = df.dropna()

  df['tstamp'] =  pd.to_datetime(df['block_timestamp'], format='%Y-%m-%d %H:%M:%S UTC')
  df['epochs'] = df['tstamp'].map(pd.Timestamp.timestamp).astype('int32')

  timestamps.extend(df['epochs'].tolist())

  del df

In [None]:
with open('/content/timestamps.lst', "w") as f:
  for tstamp in timestamps:
    f.write(str(tstamp)+'\n')

In [None]:
!head /content/timestamps.lst

In [None]:
!cp /content/timestamps.lst /content/address-data/edge_data/

In [None]:
# df = pd.read_csv("/content/network-data/network-export-000000000000.csv")
# df['tstamp'] =  pd.to_datetime(df['block_timestamp'], format='%Y-%m-%d %H:%M:%S UTC')
# df['epochs'] = df['tstamp'].map(pd.Timestamp.timestamp).astype('int32')
# df.head()

In [None]:
def read_list(path, type):

  with open(path, "r") as f:
    lines = f.readlines()
    result = [type(line) for line in lines]

    return result    

In [None]:
node_a = read_list("/content/address-data/node_data/node_a.lst", int)
len(node_a)

77920823

In [None]:
node_b = read_list("/content/address-data/node_data/node_b.lst", int)
len(node_b)

77920823

In [None]:
timestamps = read_list("/content/address-data/edge_data/timestamps.lst", int)
len(timestamps)

77920823

In [None]:
values = read_list("/content/address-data/edge_data/values.lst", float)
len(values)

77920823

## Build the graph

In [8]:
import torch
from torch_geometric.data import Data

num_nodes = len(address_index.keys())
print('number of nodes: ', num_nodes)


number of nodes:  1866


In [9]:
edge_index = torch.tensor([node_a, node_b], dtype=torch.long)
x = torch.tensor([[i] for i in range(num_nodes)], dtype=torch.float)

In [10]:
edge_attr = torch.tensor([timestamps, values])
print(edge_attr.shape)
edge_attr = edge_attr.transpose(0, 1)
print(edge_attr.shape)

torch.Size([2, 37702])
torch.Size([37702, 2])


In [11]:
data = Data(x=x, edge_index=edge_index, edge_attr=edge_attr)

In [12]:
data.num_nodes

1866

In [None]:
data.num_edges

11303

In [None]:
data.num_node_features

1

In [None]:
data.has_isolated_nodes()

True

In [None]:
data.has_self_loops()

True

In [None]:
data.is_directed()

True

### Save the graph

In [21]:
import pickle

with open("/content/network.pkl", "wb") as f:
  pickle.dump(data, f)

In [22]:
!cp /content/network.pkl /content/network-data/graph/

In [None]:
import pickle

with open("/content/network.pkl", "r") as f:
  data = pickle.load(f)

## Train models

In [58]:
models = []

In [59]:
# 1
from pygod.models import AdONE

model = AdONE()
model.fit(data)
models.append({
    'model_name': 'AdONE',
    'model': model    
})

In [60]:
# 2
from pygod.models import ANOMALOUS

model = ANOMALOUS()
model.fit(data)
models.append({
    'model_name': 'ANOMALOUS',
    'model': model    
})

In [61]:
# 3
from pygod.models import CoLA

model = CoLA()
model.fit(data)
models.append({
    'model_name': 'CoLA',
    'model': model    
})

In [62]:
# 4
from pygod.models import CONAD

model = CONAD()
model.fit(data)
models.append({
    'model_name': 'CONAD',
    'model': model    
})

In [63]:
# 5
from pygod.models import DOMINANT

model = DOMINANT(num_layers=4, epoch=20)
model.fit(data)
models.append({
    'model_name': 'DOMINANT',
    'model': model    
})

In [64]:
# 6
from pygod.models import DONE

model = DONE()
model.fit(data)
models.append({
    'model_name': 'DONE',
    'model': model    
})

In [65]:
# 7
from pygod.models import GAAN

model = GAAN()
model.fit(data)
models.append({
    'model_name': 'GAAN',
    'model': model    
})

In [66]:
# 8
from pygod.models import GCNAE

model = GCNAE()
model.fit(data)
models.append({
    'model_name': 'GCNAE',
    'model': model    
})

In [68]:
# 9
from pygod.models import MLPAE

model = MLPAE()
model.fit(data)
models.append({
    'model_name': 'MLPAE',
    'model': model    
})

In [70]:
# 10
from pygod.models import Radar

model = Radar()
model.fit(data)
models.append({
    'model_name': 'Radar',
    'model': model    
})

In [71]:
# 11
from pygod.models import SCAN

model = SCAN()
model.fit(data)
models.append({
    'model_name': 'SCAN',
    'model': model    
})

In [91]:
len(models)

11

In [73]:
models

[{'model_name': 'AdONE', 'model': AdONE(a1=0.2, a2=0.2, a3=0.2, a4=0.2, a5=0.2,
     act=<function leaky_relu at 0x7ff8902cd940>, batch_size=1866,
     contamination=0.1, dropout=0.0, epoch=5, gpu=None, hid_dim=32, lr=0.005,
     num_layers=4, num_neigh=-1, verbose=False, weight_decay=0.0)},
 {'model_name': 'ANOMALOUS',
  'model': ANOMALOUS(contamination=0.1, epoch=100, gamma=1.0, gpu=None, lr=0.004,
       verbose=False, weight_decay=0.01)},
 {'model_name': 'CoLA',
  'model': CoLA(batch_size=0, contamination=0.1, embedding_dim=64, epoch=None, gpu=None,
     lr=0.001, negsamp_ratio=1, readout='avg', subgraph_size=4,
     verbose=False, weight_decay=0.0)},
 {'model_name': 'CONAD',
  'model': CONAD(act=<function relu at 0x7ff8902cd550>, alpha=tensor(0.0157),
     batch_size=1866, contamination=0.1, dropout=0.3, epoch=5, eta=0.5, f=10,
     gpu=None, hid_dim=64, k=50, lr=0.005, m=50, margin=None, num_layers=4,
     num_neigh=-1, r=0.2, verbose=False, weight_decay=0.0)},
 {'model_name': 'D

In [87]:
import pandas as pd

output_df = pd.DataFrame()
output_df['address'] = selected_addr_list
output_df

Unnamed: 0,address
0,0xbcb38d60623f688badfe019b1d0ef73fd9ea34dc
1,0x09004228e526e9a42d71495cc1ada467b89dd5d9
2,0x6db4349c2bbdee525e080f63096fc30754cd95d9
3,0x5454c084e0f96ed65a84f3b36f9fcd39f4263902
4,0xa746d613d9b3a267ad470e5ce980dbc12473247c
...,...
95,0x09b2a7b5a3c2c292109c521b4cd491f3ec0a5072
96,0xc62662194e68738cd504ecf837c25b67faefa659
97,0x4a723c29e6a8c998459faeb56fbaf7f98ad14751
98,0x2b260d70b726b644299af03fe60a646d9df537ad


In [88]:
node_label = []

for addr in selected_addr_list:
  node_label.append(flags[addr])

output_df['label'] = node_label

In [89]:
for model in models:
  prediction = model['model'].predict_proba(data, method='unify')
  output_df[model['model_name']] = prediction[:100,1]

output_df

  d_inv_sqrt = np.power(rowsum, -0.5).flatten()


Unnamed: 0,address,label,AdONE,ANOMALOUS,CoLA,CONAD,DOMINANT,DONE,GAAN,GCNAE,MLPAE,Radar,SCAN
0,0xbcb38d60623f688badfe019b1d0ef73fd9ea34dc,0,1.000000,0.0,0.032322,1.000000,0.999983,1.000000,0.0,1.000000,0.0,0.0,0.00000
1,0x09004228e526e9a42d71495cc1ada467b89dd5d9,0,0.000000,0.0,0.032322,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.00000
2,0x6db4349c2bbdee525e080f63096fc30754cd95d9,0,0.000000,0.0,0.032322,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.00000
3,0x5454c084e0f96ed65a84f3b36f9fcd39f4263902,0,0.087058,0.0,0.032322,0.999876,0.505755,0.065332,0.0,0.653134,0.0,0.0,0.00000
4,0xa746d613d9b3a267ad470e5ce980dbc12473247c,1,0.000000,0.0,1.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0x09b2a7b5a3c2c292109c521b4cd491f3ec0a5072,0,0.000000,0.0,0.032322,0.818003,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.00000
96,0xc62662194e68738cd504ecf837c25b67faefa659,0,0.000000,0.0,0.032322,0.999994,0.644736,0.000000,0.0,0.793085,0.0,0.0,0.00000
97,0x4a723c29e6a8c998459faeb56fbaf7f98ad14751,0,0.000000,0.0,0.032322,0.286015,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.99812
98,0x2b260d70b726b644299af03fe60a646d9df537ad,0,0.000000,0.0,0.032322,0.452573,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.00000


In [90]:
output_df.to_csv("node_outlier_scores.csv", sep='\t', header=True, index=False)