In [38]:
import dgl
import torch
import numpy as np
import os
import random
import pandas as pd
import bidict

def set_seed(seed=3407):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU plus all CUDA devices when CUDA is available), exports
    ``PYTHONHASHSEED`` and switches cuDNN to deterministic behaviour.

    Args:
        seed: Integer seed shared by all generators. Defaults to 3407.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
    # These are plain process-wide flags, so set them unconditionally.
    # benchmark=False stops cuDNN from auto-tuning (and thereby varying) the
    # convolution algorithm between runs.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [39]:
# Root directory that contains raw_data/ and datasets/. Set the NFTGRAPH_ROOT
# environment variable to relocate it; the original location is the default.
prefix = os.environ.get('NFTGRAPH_ROOT', '/data/sx/NFTGraph')

In [40]:
dfnodes = pd.read_csv(prefix+'/raw_data/tinynodes.csv')

In [41]:
dfnodes

Unnamed: 0,addr,OutCnt,OutAmount,OutValue,OutTransFee,InCnt,InAmount,InValue,InTransFee,label
0,0x0000000000000000000000000000000000000000,2650186.0,7217724.0,7.123250e+08,2.160977e+07,595830.0,2410795.0,2.029615e+07,4.879685e+06,0
1,0x4b3406a41399c7fd2ba65cbc93697ad9e7ea61e5,20407.0,22644.0,0.000000e+00,1.306836e+05,20407.0,22644.0,0.000000e+00,1.306836e+05,0
2,0x000000000000000000000000000000000000dead,6.0,12.0,0.000000e+00,2.842000e+01,39553.0,47223.0,1.036783e+05,1.821448e+05,0
3,0xeca9d81a4dc7119a40481cff4e7e24dd0aaf56bd,1090.0,1090.0,0.000000e+00,1.274590e+03,35295.0,36508.0,0.000000e+00,2.018945e+04,0
4,0x381e840f4ebe33d0153e9a312105554594a98c42,34524.0,36592.0,0.000000e+00,2.083176e+05,442.0,2494.0,0.000000e+00,2.374280e+03,0
...,...,...,...,...,...,...,...,...,...,...
19995,0x5a1d741b79d30c33021cf9d07ab59a37a10b4d1f,11.0,17.0,2.264330e+03,2.911300e+02,52.0,3320.0,2.773610e+03,2.299100e+02,0
19996,0x479db4dac1f196bceb97d52e99c3f4959d93b18b,19.0,23.0,1.110640e+03,1.290900e+02,44.0,44.0,0.000000e+00,1.353200e+02,0
19997,0x6fb9a82b00ccbe0e0cf06984d902cbeb946a44ca,17.0,17.0,4.199727e+03,2.130217e+02,46.0,46.0,3.892785e+03,3.892402e+02,0
19998,0xa126b4ad2328055ab7c01ce87a29a17b9f97e85a,15.0,116.0,1.841844e+04,2.348217e+02,48.0,57.0,9.970242e+03,2.797561e+02,0


In [42]:
# Two-way mapping between integer node ids and address strings
# (forward: id -> address, node_dict.inv: address -> id).
node_dict = bidict.bidict()

In [43]:
# Object array with one [address, label] row per node, in dfnodes row order.
labels = dfnodes[['addr','label']].to_numpy()
labels

array([['0x0000000000000000000000000000000000000000', 0],
       ['0x4b3406a41399c7fd2ba65cbc93697ad9e7ea61e5', 0],
       ['0x000000000000000000000000000000000000dead', 0],
       ...,
       ['0x6fb9a82b00ccbe0e0cf06984d902cbeb946a44ca', 0],
       ['0xa126b4ad2328055ab7c01ce87a29a17b9f97e85a', 0],
       ['0xc7d7ffb6484ac68efdbca7a477a63ce307e8fdcf', 0]], dtype=object)

In [44]:
# Register every address under its row position, which serves as the node id.
for node_id, (address, _label) in enumerate(labels):
    node_dict[node_id] = address

In [45]:
# Address column followed by the eight numeric transfer statistics of each node.
node_features = dfnodes[['addr','OutCnt','OutAmount','OutValue','OutTransFee','InCnt','InAmount','InValue','InTransFee']].to_numpy()

In [46]:
# Drop the leading address column and keep only the numeric feature columns.
features = node_features[:,1:]

In [47]:
# Fractions of each class used for training and validation; the rest is test.
train_ratio, val_ratio = 0.4, 0.2

# Fix the RNG state so the shuffles below, and therefore the saved masks, are
# identical on every run. set_seed was defined above but never called before.
set_seed()


def _split_shuffled(indices, train_ratio, val_ratio):
    """Shuffle ``indices`` in place and cut them into (train, val, test) lists."""
    random.shuffle(indices)
    train_end = int(len(indices) * train_ratio)
    val_end = int(len(indices) * (train_ratio + val_ratio))
    return indices[:train_end], indices[train_end:val_end], indices[val_end:]


def _mask_from_indices(indices, size):
    """Return a boolean tensor of length ``size`` that is True at ``indices``."""
    mask = torch.zeros(size).bool()
    mask[torch.tensor(indices, dtype=torch.long)] = True
    return mask


# Partition node ids by class so that every split keeps the class ratio.
nodes_anomaly = [i for i in range(len(labels)) if labels[i][1] == 1]
nodes_non_anomaly = [i for i in range(len(labels)) if labels[i][1] != 1]

n = labels.shape[0]

# Anomalies are shuffled first, then non-anomalies (same order as before).
train_ones, val_ones, test_ones = _split_shuffled(nodes_anomaly, train_ratio, val_ratio)
train_zeros, val_zeros, test_zeros = _split_shuffled(nodes_non_anomaly, train_ratio, val_ratio)

train_mask = _mask_from_indices(train_ones + train_zeros, n)
val_mask = _mask_from_indices(val_ones + val_zeros, n)
test_mask = _mask_from_indices(test_ones + test_zeros, n)

# Every node must belong to exactly one of the three splits.
assert bool((train_mask.int() + val_mask.int() + test_mask.int()).eq(1).all())

In [48]:
dfedges = pd.read_csv(prefix+'/raw_data/tinyedges.csv')

In [49]:
dfedges

Unnamed: 0,from,to,timestamp,transferedAmount,value,transactionFee,TxnsCnt
0,0x1439b4d5a72343b68f12398c649df99d76b2af53,0x8c0a11eb047c1097c821c159b8e0c2c5f37f81bf,20220730032737,43,0.00,121.205,42
1,0x9ed3db7a8ec964ef0813edb7bf3ff514a25fae70,0xa5b36a3732937375bc579fa38159347da9938441,20220730021034,265,0.00,293.970,249
2,0xbdc4a5c0ff7275736cad102c7408555fb5d6c495,0x564a8e13d7dd23d5525160d204165bdbcb69b4db,20220729052058,43,0.00,52.096,43
3,0x5e11534344b8c1fda947b37dc57b8734232a6b1c,0xd0c877b474cd51959931a7f70d7a6c60f50cdae7,20220729042239,1,157.01,1.990,1
4,0x58e14b71ef1a30f186000c5fb4e8ab993336f109,0x25f20e56dc90c79bb80896b613dbe9dc6b96ca04,20220729023044,11,16.49,7.230,2
...,...,...,...,...,...,...,...
227322,0x8cb7f1a4f44593ec356b11c70c0a977c647c763c,0xe5798a530bb7105e148d38ac884f05c28ed8e804,20220521042106,1,84.28,3.300,1
227323,0x8cb7f1a4f44593ec356b11c70c0a977c647c763c,0x38e3d4a5bf7dea9280e389864d4bf9834cc2f266,20220521034547,1,84.28,4.530,1
227324,0x8cb7f1a4f44593ec356b11c70c0a977c647c763c,0x3cd378c9b1cb5f147ebf1b2c2564118946ae4ba1,20220520212804,1,84.28,5.120,1
227325,0x8cb7f1a4f44593ec356b11c70c0a977c647c763c,0xbc6819c533db537ad8e169d8d67e3c8971c0417f,20220520204309,2,168.51,13.460,2


In [50]:
edges = dfedges[['from','to']].to_numpy()
edges

array([['0x1439b4d5a72343b68f12398c649df99d76b2af53',
        '0x8c0a11eb047c1097c821c159b8e0c2c5f37f81bf'],
       ['0x9ed3db7a8ec964ef0813edb7bf3ff514a25fae70',
        '0xa5b36a3732937375bc579fa38159347da9938441'],
       ['0xbdc4a5c0ff7275736cad102c7408555fb5d6c495',
        '0x564a8e13d7dd23d5525160d204165bdbcb69b4db'],
       ...,
       ['0x8cb7f1a4f44593ec356b11c70c0a977c647c763c',
        '0x3cd378c9b1cb5f147ebf1b2c2564118946ae4ba1'],
       ['0x8cb7f1a4f44593ec356b11c70c0a977c647c763c',
        '0xbc6819c533db537ad8e169d8d67e3c8971c0417f'],
       ['0x8cb7f1a4f44593ec356b11c70c0a977c647c763c',
        '0x04f443bf89dae24a35f523130cabff709fb1b22c']], dtype=object)

In [51]:
# Placeholder with the same shape as `edges`; it receives the integer node id
# of every edge endpoint in the next cell.
new_edges = np.zeros_like(edges)

In [52]:
# Translate both endpoint addresses of each edge into node ids through the
# inverse (address -> id) view of the mapping.
address_to_id = node_dict.inv
for row, (src_addr, dst_addr) in enumerate(edges):
    new_edges[row][0] = address_to_id[src_addr]
    new_edges[row][1] = address_to_id[dst_addr]

In [53]:
# Build the graph from the integer source / destination columns.
# num_nodes is passed explicitly: otherwise DGL infers the node count from the
# largest id that appears in an edge, so trailing nodes without any edge would
# be dropped and the per-node tensors below would no longer match in length.
graph = dgl.graph(
    (new_edges[:,0].astype(int), new_edges[:,1].astype(int)),
    num_nodes=labels.shape[0],
)

# Per-node split masks, labels and numeric features.
graph.ndata['train_mask'] = train_mask
graph.ndata['val_mask'] = val_mask
graph.ndata['test_mask'] = test_mask
graph.ndata['label'] = torch.tensor(labels[:,1].astype(int))
graph.ndata['feature'] = torch.tensor(features.astype(float))

In [54]:
# Timestamps are 14-digit integers (e.g. 20220730032737) and do not fit in
# 32 bits, so request int64 explicitly instead of the platform-dependent
# builtin int.
graph.edata['timestamp'] = torch.tensor(dfedges['timestamp'].to_numpy(dtype=np.int64))

In [55]:
# Per-edge attribute matrix; columns follow the order of the list below.
efeatures = dfedges[['TxnsCnt','transferedAmount','value','transactionFee']].to_numpy()

In [56]:
# Attach the edge attribute matrix to the graph under the key 'feature'.
graph.edata['feature'] = torch.tensor(efeatures)

In [57]:
graph

Graph(num_nodes=20000, num_edges=227327,
      ndata_schemes={'train_mask': Scheme(shape=(), dtype=torch.bool), 'val_mask': Scheme(shape=(), dtype=torch.bool), 'test_mask': Scheme(shape=(), dtype=torch.bool), 'label': Scheme(shape=(), dtype=torch.int64), 'feature': Scheme(shape=(8,), dtype=torch.float64)}
      edata_schemes={'timestamp': Scheme(shape=(), dtype=torch.int64), 'feature': Scheme(shape=(4,), dtype=torch.float64)})

In [58]:
# Write the graph (a one-element list) to disk with DGL's serializer.
dgl.save_graphs(prefix+'/datasets/dgl_graph/tinynftgraph', [graph])

In [59]:
import torch
from torch_geometric.data import Data

# Stack sources and destinations into a [2, num_edges] edge index tensor.
edge_src, edge_dst = graph.edges()
c = torch.stack([edge_src, edge_dst], dim=0).contiguous()

# Carry every node and edge field of the DGL graph over to a PyG Data object.
data = Data(
    x=graph.ndata['feature'],
    edge_index=c,
    y=graph.ndata['label'],
    train_mask=graph.ndata['train_mask'],
    val_mask=graph.ndata['val_mask'],
    test_mask=graph.ndata['test_mask'],
    edge_attr=graph.edata['feature'],
    etime=graph.edata['timestamp'],
)

In [60]:
data

Data(x=[20000, 8], edge_index=[2, 227327], edge_attr=[227327, 4], y=[20000], train_mask=[20000], val_mask=[20000], test_mask=[20000], etime=[227327])

In [61]:
# torch.save does not create missing parent directories, so make sure the
# output folder exists before writing the PyG Data object.
os.makedirs(prefix+'/datasets/pyg_graph', exist_ok=True)
torch.save(data,prefix+'/datasets/pyg_graph/tinynftgraph')

In [62]:
from ogb.io import DatasetSaver
from ogb.nodeproppred import NodePropPredDataset

In [63]:
dataset_name = 'ogbn-tinynftgraph'

In [64]:
saver = DatasetSaver(dataset_name = dataset_name,root=prefix+'/datasets/ogb_graph/submission', is_hetero = False, version = 1)

In [65]:
g = dict()  # graph dictionary, filled in a later cell and handed to the OGB saver
# NOTE(review): this rebinds `labels` from the earlier [address, label] object
# array to the label vector taken from data.y. Earlier cells that index
# labels[i][1] will fail if re-run after this cell unless `labels` is rebuilt first.
labels = np.array(data.y)
graph_list = []  # list of graph dictionaries passed to saver.save_graph_list

In [66]:
data

Data(x=[20000, 8], edge_index=[2, 227327], edge_attr=[227327, 4], y=[20000], train_mask=[20000], val_mask=[20000], test_mask=[20000], etime=[227327])

In [67]:
# Fill the graph dictionary with NumPy copies of the PyG tensors.
g['num_nodes'] = int(data.num_nodes)
g['node_feat'] = np.array(data.x)  # one row per node, one column per feature
g['edge_index'] = np.array(data.edge_index)
g['edge_feat'] = np.array(data.edge_attr)
g['edge_time'] = np.array(data.etime)
# Reset the list in place instead of appending, so that re-running this cell
# does not hand the saver the same graph twice.
graph_list[:] = [g]
saver.save_graph_list(graph_list)
# The saver receives the labels as a column vector (one row per node).
saver.save_target_labels(labels.reshape(-1,1))

dict_keys(['num_nodes', 'node_feat', 'edge_index', 'edge_feat', 'edge_time'])
Saving edge_index
Saving all the files!
Validating...
Reading saved files
Loading necessary files...
This might take a while.
Processing graphs...


100%|██████████| 1/1 [00:00<00:00, 6786.90it/s]


Checking read graphs and given graphs are the same


100%|██████████| 1/1 [00:00<00:00, 15.53it/s]


In [68]:
# Convert the boolean masks into index tensors, one entry per OGB split name.
split_idx = dict()
num_data = len(labels)
# nonzero(as_tuple=True)[0] always yields a 1-D index tensor, whereas
# nonzero().squeeze() collapses a split with a single node into a 0-d scalar.
split_idx['train'] = data.train_mask.nonzero(as_tuple=True)[0]
split_idx['valid'] = data.val_mask.nonzero(as_tuple=True)[0]
split_idx['test'] = data.test_mask.nonzero(as_tuple=True)[0]
# The three splits together must account for every labelled node.
assert sum(len(idx) for idx in split_idx.values()) == num_data
saver.save_split(split_idx, split_name = 'random')

In [69]:
# The saver copies a mapping directory into the dataset; provide one that
# contains at least a (possibly empty) README.md.
mapping_path = prefix+'/datasets/mapping'
os.makedirs(mapping_path,exist_ok=True)
readme_path = os.path.join(mapping_path, 'README.md')
if os.path.exists(readme_path):
    print("Readme.md already exists.")
else:
    # Plain open() creates the empty file on every platform. The previous
    # os.mknod call is not available everywhere, and its bare `except`
    # reported any failure (e.g. a permission error) as "already exists".
    with open(readme_path, 'w'):
        pass
saver.copy_mapping_dir(mapping_path)

In [70]:
saver.save_task_info(task_type = 'binary classification', eval_metric = 'rocauc', num_classes = 2)

binary classification
2


In [71]:
meta_dict = saver.get_meta_dict()

In [72]:
saver.zip()
saver.cleanup()

In [73]:
filedir = prefix+'/datasets/ogb_graph/submission_ogbn_tinynftgraph/tinynftgraph.zip'
dstdirs = prefix+'/datasets/ogb_graph/submission_ogbn_tinynftgraph/tinynftgraph'

In [74]:
!unzip $filedir -d $dstdirs

Archive:  /data/sx/NFTGraph/datasets/ogb_graph/submission_ogbn_tinynftgraph/tinynftgraph.zip
   creating: /data/sx/NFTGraph/datasets/ogb_graph/submission_ogbn_tinynftgraph/tinynftgraph/mapping/
   creating: /data/sx/NFTGraph/datasets/ogb_graph/submission_ogbn_tinynftgraph/tinynftgraph/processed/
   creating: /data/sx/NFTGraph/datasets/ogb_graph/submission_ogbn_tinynftgraph/tinynftgraph/raw/
   creating: /data/sx/NFTGraph/datasets/ogb_graph/submission_ogbn_tinynftgraph/tinynftgraph/split/
  inflating: /data/sx/NFTGraph/datasets/ogb_graph/submission_ogbn_tinynftgraph/tinynftgraph/RELEASE_v1.txt  
  inflating: /data/sx/NFTGraph/datasets/ogb_graph/submission_ogbn_tinynftgraph/tinynftgraph/mapping/README.md  
   creating: /data/sx/NFTGraph/datasets/ogb_graph/submission_ogbn_tinynftgraph/tinynftgraph/split/random/
  inflating: /data/sx/NFTGraph/datasets/ogb_graph/submission_ogbn_tinynftgraph/tinynftgraph/split/random/split_dict.pt  
  inflating: /data/sx/NFTGraph/datasets/ogb_graph/submissio

In [75]:
# Reload the freshly saved dataset through the OGB node-property loader, using
# the meta information produced by the saver.
# NOTE(review): root points at '/dataset/' while everything above was written
# under '/datasets/'. Presumably root is ignored when meta_dict is supplied;
# confirm against the OGB loader.
dataset = NodePropPredDataset(dataset_name,root=prefix+'/dataset/',meta_dict = meta_dict)

Loading necessary files...
This might take a while.
Processing graphs...


100%|██████████| 1/1 [00:00<00:00, 6307.22it/s]

Saving...





In [76]:
# NOTE(review): despite its name, this variable holds the index split returned
# by the node-property dataset (presumably the 'random' train/valid/test node
# split saved above), not an edge split.
split_edge = dataset.get_idx_split()