In [1]:
import dgl
import torch
import numpy as np
import os
import random
import pandas as pd
import bidict

def set_seed(seed=3407):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU and, when CUDA is available, every CUDA device), and exports
    ``PYTHONHASHSEED`` to the environment.

    Args:
        seed: Integer seed applied to all generators (default ``3407``).
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.deterministic = True
        # The cuDNN autotuner may select different kernels from run to run,
        # so it is switched off whenever deterministic mode is requested.
        torch.backends.cudnn.benchmark = False

In [2]:
# Root directory: raw CSVs are read from here and every export is written below it.
prefix = '/data/sx/NFTGraph'

In [3]:
# Load the node table (address, transfer statistics and label columns).
dfnodes = pd.read_csv(prefix+'/raw_data/tinynodes.csv')

In [4]:
# Display the node table.
dfnodes

Unnamed: 0,addr,OutCnt,OutAmount,OutValue,OutTransFee,InCnt,InAmount,InValue,InTransFee,label
0,0x0000000000000000000000000000000000000000,2650186.0,7217724.0,7.123250e+08,2.160977e+07,595830.0,2410795.0,2.029615e+07,4.879685e+06,0
1,0x4b3406a41399c7fd2ba65cbc93697ad9e7ea61e5,20407.0,22644.0,0.000000e+00,1.306836e+05,20407.0,22644.0,0.000000e+00,1.306836e+05,0
2,0x000000000000000000000000000000000000dead,6.0,12.0,0.000000e+00,2.842000e+01,39553.0,47223.0,1.036783e+05,1.821448e+05,0
3,0xeca9d81a4dc7119a40481cff4e7e24dd0aaf56bd,1090.0,1090.0,0.000000e+00,1.274590e+03,35295.0,36508.0,0.000000e+00,2.018945e+04,0
4,0x381e840f4ebe33d0153e9a312105554594a98c42,34524.0,36592.0,0.000000e+00,2.083176e+05,442.0,2494.0,0.000000e+00,2.374280e+03,0
...,...,...,...,...,...,...,...,...,...,...
19995,0x5a1d741b79d30c33021cf9d07ab59a37a10b4d1f,11.0,17.0,2.264330e+03,2.911300e+02,52.0,3320.0,2.773610e+03,2.299100e+02,0
19996,0x479db4dac1f196bceb97d52e99c3f4959d93b18b,19.0,23.0,1.110640e+03,1.290900e+02,44.0,44.0,0.000000e+00,1.353200e+02,0
19997,0x6fb9a82b00ccbe0e0cf06984d902cbeb946a44ca,17.0,17.0,4.199727e+03,2.130217e+02,46.0,46.0,3.892785e+03,3.892402e+02,0
19998,0xa126b4ad2328055ab7c01ce87a29a17b9f97e85a,15.0,116.0,1.841844e+04,2.348217e+02,48.0,57.0,9.970242e+03,2.797561e+02,0


In [5]:
# Two-way mapping between integer node ids and addresses; filled in a later cell
# and read through node_dict.inv when edges are converted to ids.
node_dict = bidict.bidict()

In [6]:
# Object array of (address, label) rows, in node-table order.
labels = dfnodes[['addr','label']].to_numpy()
labels

array([['0x0000000000000000000000000000000000000000', 0],
       ['0x4b3406a41399c7fd2ba65cbc93697ad9e7ea61e5', 0],
       ['0x000000000000000000000000000000000000dead', 0],
       ...,
       ['0x6fb9a82b00ccbe0e0cf06984d902cbeb946a44ca', 0],
       ['0xa126b4ad2328055ab7c01ce87a29a17b9f97e85a', 0],
       ['0xc7d7ffb6484ac68efdbca7a477a63ce307e8fdcf', 0]], dtype=object)

In [7]:
# Register each address under its row position in the node table, so the row
# index doubles as the integer node id.
for node_id, label_row in enumerate(labels):
    node_dict[node_id] = label_row[0]

In [8]:
# Address column followed by the eight numeric per-node statistics.
node_features = dfnodes[['addr','OutCnt','OutAmount','OutValue','OutTransFee','InCnt','InAmount','InValue','InTransFee']].to_numpy()

In [9]:
# Drop column 0 (the address) and keep only the numeric feature columns.
features = node_features[:,1:]

In [10]:
train_ratio, val_ratio = 0.4, 0.2

# Seed all RNGs right before shuffling. set_seed() is defined in the first
# cell but was never invoked, so the split used to differ on every run.
set_seed()

# Partition node ids by class so that each split is drawn per class
# (label == 1 is treated as the anomaly class, everything else as normal).
nodes_anomaly = [i for i in range(len(labels)) if labels[i][1] == 1]
nodes_non_anomaly = [i for i in range(len(labels)) if labels[i][1] != 1]


def _split_ids(node_ids, train_ratio, val_ratio):
    """Shuffle ``node_ids`` in place and cut it into (train, val, test) lists."""
    random.shuffle(node_ids)
    train_end = int(len(node_ids) * train_ratio)
    val_end = int(len(node_ids) * (train_ratio + val_ratio))
    return node_ids[:train_end], node_ids[train_end:val_end], node_ids[val_end:]


def _mask_from_ids(num_nodes, node_ids):
    """Return a boolean mask of length ``num_nodes`` that is True at ``node_ids``."""
    mask = torch.zeros(num_nodes, dtype=torch.bool)
    # An explicit long tensor keeps the assignment valid for an empty id list.
    mask[torch.tensor(node_ids, dtype=torch.long)] = True
    return mask


# Anomalies are shuffled first, then normal nodes (same order as before).
train_ones, val_ones, test_ones = _split_ids(nodes_anomaly, train_ratio, val_ratio)
train_zeros, val_zeros, test_zeros = _split_ids(nodes_non_anomaly, train_ratio, val_ratio)

n = labels.shape[0]
train_mask = _mask_from_ids(n, train_ones + train_zeros)
val_mask = _mask_from_ids(n, val_ones + val_zeros)
test_mask = _mask_from_ids(n, test_ones + test_zeros)

In [11]:
# Load the edge table (from/to addresses, timestamp and transfer statistics).
dfedges = pd.read_csv(prefix+'/raw_data/tinyedges.csv')

In [12]:
# Display the edge table.
dfedges

Unnamed: 0,from,to,timestamp,transferedAmount,value,transactionFee,TxnsCnt
0,0x1439b4d5a72343b68f12398c649df99d76b2af53,0x8c0a11eb047c1097c821c159b8e0c2c5f37f81bf,20220730032737,43,0.00,121.205,42
1,0x9ed3db7a8ec964ef0813edb7bf3ff514a25fae70,0xa5b36a3732937375bc579fa38159347da9938441,20220730021034,265,0.00,293.970,249
2,0xbdc4a5c0ff7275736cad102c7408555fb5d6c495,0x564a8e13d7dd23d5525160d204165bdbcb69b4db,20220729052058,43,0.00,52.096,43
3,0x5e11534344b8c1fda947b37dc57b8734232a6b1c,0xd0c877b474cd51959931a7f70d7a6c60f50cdae7,20220729042239,1,157.01,1.990,1
4,0x58e14b71ef1a30f186000c5fb4e8ab993336f109,0x25f20e56dc90c79bb80896b613dbe9dc6b96ca04,20220729023044,11,16.49,7.230,2
...,...,...,...,...,...,...,...
227322,0x8cb7f1a4f44593ec356b11c70c0a977c647c763c,0xe5798a530bb7105e148d38ac884f05c28ed8e804,20220521042106,1,84.28,3.300,1
227323,0x8cb7f1a4f44593ec356b11c70c0a977c647c763c,0x38e3d4a5bf7dea9280e389864d4bf9834cc2f266,20220521034547,1,84.28,4.530,1
227324,0x8cb7f1a4f44593ec356b11c70c0a977c647c763c,0x3cd378c9b1cb5f147ebf1b2c2564118946ae4ba1,20220520212804,1,84.28,5.120,1
227325,0x8cb7f1a4f44593ec356b11c70c0a977c647c763c,0xbc6819c533db537ad8e169d8d67e3c8971c0417f,20220520204309,2,168.51,13.460,2


In [13]:
# (from, to) address pairs as an object array, one row per edge-table row.
edges = dfedges[['from','to']].to_numpy()
edges

array([['0x1439b4d5a72343b68f12398c649df99d76b2af53',
        '0x8c0a11eb047c1097c821c159b8e0c2c5f37f81bf'],
       ['0x9ed3db7a8ec964ef0813edb7bf3ff514a25fae70',
        '0xa5b36a3732937375bc579fa38159347da9938441'],
       ['0xbdc4a5c0ff7275736cad102c7408555fb5d6c495',
        '0x564a8e13d7dd23d5525160d204165bdbcb69b4db'],
       ...,
       ['0x8cb7f1a4f44593ec356b11c70c0a977c647c763c',
        '0x3cd378c9b1cb5f147ebf1b2c2564118946ae4ba1'],
       ['0x8cb7f1a4f44593ec356b11c70c0a977c647c763c',
        '0xbc6819c533db537ad8e169d8d67e3c8971c0417f'],
       ['0x8cb7f1a4f44593ec356b11c70c0a977c647c763c',
        '0x04f443bf89dae24a35f523130cabff709fb1b22c']], dtype=object)

In [14]:
# Buffer shaped like `edges`; the next cell fills it with integer node ids.
new_edges = np.zeros_like(edges)

In [15]:
# Translate every (from, to) address pair into integer node ids by looking the
# addresses up in the inverse view of node_dict.
for edge_pos, (src_addr, dst_addr) in enumerate(edges):
    new_edges[edge_pos][0] = node_dict.inv[src_addr]
    new_edges[edge_pos][1] = node_dict.inv[dst_addr]

In [16]:
# Build the graph from the integer edge list. num_nodes is passed explicitly so
# the node count always matches the node table: when it is inferred from the
# edge ids alone, nodes without edges at the end of the id range would be
# missing and the per-node assignments below would fail on a size mismatch.
graph = dgl.graph(
    (new_edges[:,0].astype(int), new_edges[:,1].astype(int)),
    num_nodes=labels.shape[0],
)
graph.ndata['train_mask'] = train_mask
graph.ndata['val_mask'] = val_mask
graph.ndata['test_mask'] = test_mask
graph.ndata['label'] = torch.tensor(labels[:,1].astype(int))
graph.ndata['feature'] = torch.tensor(features.astype(float))

In [17]:
# Timestamps are 14-digit integers (e.g. 20220730032737) and do not fit in
# 32 bits, so request int64 explicitly instead of the platform-dependent `int`.
graph.edata['timestamp'] = torch.tensor(dfedges['timestamp'].to_numpy(dtype=np.int64))

In [18]:
# Per-edge feature matrix; column order follows the list below.
efeatures = dfedges[['TxnsCnt','transferedAmount','value','transactionFee']].to_numpy()

In [19]:
# Attach the edge feature matrix to the graph.
graph.edata['feature'] = torch.tensor(efeatures)

In [20]:
# Display the graph summary (node/edge counts and data schemes).
graph

Graph(num_nodes=20000, num_edges=227327,
      ndata_schemes={'train_mask': Scheme(shape=(), dtype=torch.bool), 'val_mask': Scheme(shape=(), dtype=torch.bool), 'test_mask': Scheme(shape=(), dtype=torch.bool), 'label': Scheme(shape=(), dtype=torch.int64), 'feature': Scheme(shape=(8,), dtype=torch.float64)}
      edata_schemes={'timestamp': Scheme(shape=(), dtype=torch.int64), 'feature': Scheme(shape=(4,), dtype=torch.float64)})

In [21]:
# Write the DGL export; create the target directory first so the cell also
# works on a fresh checkout where the output folder does not exist yet.
dgl_graph_path = prefix+'/datasets/dgl_graph/tinynftgraph.bin'
os.makedirs(os.path.dirname(dgl_graph_path), exist_ok=True)
dgl.save_graphs(dgl_graph_path, [graph])

In [22]:
import torch
from torch_geometric.data import Data

# Edge index as a (2, num_edges) tensor: row 0 holds the source ids and row 1
# the destination ids returned by the DGL graph.
src_ids, dst_ids = graph.edges()
c = torch.stack([src_ids, dst_ids], dim=0).contiguous()

# Carry node features, labels, split masks, edge features and edge timestamps
# over from the DGL graph into a PyG Data object.
data = Data(
    x=graph.ndata['feature'],
    edge_index=c,
    y=graph.ndata['label'],
    train_mask=graph.ndata['train_mask'],
    val_mask=graph.ndata['val_mask'],
    test_mask=graph.ndata['test_mask'],
    edge_attr=graph.edata['feature'],
    etime=graph.edata['timestamp'],
)

In [23]:
# Display the PyG data summary.
data

Data(x=[20000, 8], edge_index=[2, 227327], edge_attr=[227327, 4], y=[20000], train_mask=[20000], val_mask=[20000], test_mask=[20000], etime=[227327])

In [24]:
# Write the PyG export; create the target directory first so the cell also
# works when the output folder does not exist yet.
pyg_graph_path = prefix+'/datasets/pyg_graph/tinynftgraph.bin'
os.makedirs(os.path.dirname(pyg_graph_path), exist_ok=True)
torch.save(data, pyg_graph_path)

In [25]:
from ogb.io import DatasetSaver
from ogb.nodeproppred import NodePropPredDataset

In [26]:
# Name of the node-property-prediction dataset handed to the OGB saver/loader.
dataset_name = 'ogbn-tinynftgraph'

In [64]:
# Saver for the node-classification export (homogeneous graph, version 1).
saver = DatasetSaver(dataset_name = dataset_name,root=prefix+'/datasets/ogb_graph/submission', is_hetero = False, version = 1)

In [65]:
# Containers for the OGB export: one graph dictionary and the list holding it.
g = dict()
# NOTE: this rebinds `labels` (previously the (address, label) object array)
# to the plain label vector taken from data.y.
labels = np.array(data.y)
graph_list = []

In [66]:
# Display the PyG data summary again before exporting.
data

Data(x=[20000, 8], edge_index=[2, 227327], edge_attr=[227327, 4], y=[20000], train_mask=[20000], val_mask=[20000], test_mask=[20000], etime=[227327])

In [67]:
# Populate the graph dictionary from the PyG data object; every tensor is
# converted to a NumPy array before it is handed to the saver.
g.update({
    'num_nodes': int(data.num_nodes),
    'node_feat': np.array(data.x),
    'edge_index': np.array(data.edge_index),
    'edge_feat': np.array(data.edge_attr),
    'edge_time': np.array(data.etime),
})

# The saver takes a list of graphs (a single one here) and the labels as a
# column vector.
graph_list.append(g)
saver.save_graph_list(graph_list)
saver.save_target_labels(labels.reshape(-1,1))

dict_keys(['num_nodes', 'node_feat', 'edge_index', 'edge_feat', 'edge_time'])
Saving edge_index
Saving all the files!
Validating...
Reading saved files
Loading necessary files...
This might take a while.
Processing graphs...


100%|██████████| 1/1 [00:00<00:00, 6786.90it/s]


Checking read graphs and given graphs are the same


100%|██████████| 1/1 [00:00<00:00, 15.53it/s]


In [68]:
split_idx = dict()
num_data = len(labels)
# nonzero() returns a (k, 1) index matrix. Squeezing only dim 1 keeps the
# result 1-D even when a split holds a single node; a bare squeeze() would
# collapse that case to a 0-d tensor.
split_idx['train'] = data.train_mask.nonzero().squeeze(1)
split_idx['valid'] = data.val_mask.nonzero().squeeze(1)
split_idx['test'] = data.test_mask.nonzero().squeeze(1)
saver.save_split(split_idx, split_name = 'random')

In [69]:
mapping_path = prefix+'/datasets/mapping'
os.makedirs(mapping_path,exist_ok=True)

# Make sure an (empty) README.md exists in the mapping directory. The explicit
# existence check replaces a bare `except:` around os.mknod, which reported
# every failure (permissions, missing os.mknod on some platforms, ...) as
# "already exists" and carried on.
readme_path = os.path.join(mapping_path, 'README.md')
if os.path.exists(readme_path):
    print("Readme.md already exists.")
else:
    open(readme_path, 'a').close()
saver.copy_mapping_dir(mapping_path)

In [70]:
# Record the task metadata: binary classification over 2 classes, scored with ROC-AUC.
saver.save_task_info(task_type = 'binary classification', eval_metric = 'rocauc', num_classes = 2)

binary classification
2


In [71]:
# Metadata dictionary; passed to the dataset loader further down.
meta_dict = saver.get_meta_dict()

In [72]:
# Package the saved dataset, then let the saver clean up after itself.
# NOTE(review): presumably cleanup() removes the unzipped working directory,
# which is why the archive is extracted again below — confirm.
saver.zip()
saver.cleanup()

In [73]:
# Archive to extract and the directory it is extracted into (used by the next cell).
filedir = prefix+'/datasets/ogb_graph/submission_ogbn_tinynftgraph/tinynftgraph.zip'
dstdirs = prefix+'/datasets/ogb_graph/submission_ogbn_tinynftgraph/tinynftgraph'

In [74]:
# Extract the archive at `filedir` into the directory `dstdirs`.
!unzip $filedir -d $dstdirs

Archive:  /data/sx/NFTGraph/datasets/ogb_graph/submission_ogbn_tinynftgraph/tinynftgraph.zip
   creating: /data/sx/NFTGraph/datasets/ogb_graph/submission_ogbn_tinynftgraph/tinynftgraph/mapping/
   creating: /data/sx/NFTGraph/datasets/ogb_graph/submission_ogbn_tinynftgraph/tinynftgraph/processed/
   creating: /data/sx/NFTGraph/datasets/ogb_graph/submission_ogbn_tinynftgraph/tinynftgraph/raw/
   creating: /data/sx/NFTGraph/datasets/ogb_graph/submission_ogbn_tinynftgraph/tinynftgraph/split/
  inflating: /data/sx/NFTGraph/datasets/ogb_graph/submission_ogbn_tinynftgraph/tinynftgraph/RELEASE_v1.txt  
  inflating: /data/sx/NFTGraph/datasets/ogb_graph/submission_ogbn_tinynftgraph/tinynftgraph/mapping/README.md  
   creating: /data/sx/NFTGraph/datasets/ogb_graph/submission_ogbn_tinynftgraph/tinynftgraph/split/random/
  inflating: /data/sx/NFTGraph/datasets/ogb_graph/submission_ogbn_tinynftgraph/tinynftgraph/split/random/split_dict.pt  
  inflating: /data/sx/NFTGraph/datasets/ogb_graph/submissio

In [75]:
# Load the exported dataset back using the saver's metadata.
# NOTE(review): root is prefix+'/dataset/' here, while every other cell uses
# prefix+'/datasets/' — confirm whether the singular form is intended.
dataset = NodePropPredDataset(dataset_name,root=prefix+'/dataset/',meta_dict = meta_dict)

Loading necessary files...
This might take a while.
Processing graphs...


100%|██████████| 1/1 [00:00<00:00, 6307.22it/s]

Saving...





In [76]:
# Read the split back from the loaded dataset.
# NOTE: despite its name, `split_edge` holds the result of get_idx_split()
# here; the link-prediction part below rebinds the same name to a dict of edge splits.
split_edge = dataset.get_idx_split()

In [1]:
from ogb.io import DatasetSaver
from ogb.linkproppred import LinkPropPredDataset

In [6]:
# Reload the PyG export. It was written as 'tinynftgraph.bin' by the
# torch.save cell above, so the load path must carry the same extension.
data = torch.load(prefix+'/datasets/pyg_graph/tinynftgraph.bin')
data

Data(x=[20000, 8], edge_index=[2, 227327], edge_attr=[227327, 4], y=[20000], train_mask=[20000], val_mask=[20000], test_mask=[20000], etime=[227327])

In [7]:
# Name of the link-property-prediction dataset (rebinds dataset_name).
dataset_name = 'ogbl-tinynftgraph'

In [8]:
# Fresh saver for the link-prediction export (rebinds saver).
saver = DatasetSaver(dataset_name = dataset_name,root=prefix+'/datasets/ogb_graph/submission', is_hetero = False, version = 1)

In [9]:
# Display the reloaded PyG data summary.
data

Data(x=[20000, 8], edge_index=[2, 227327], edge_attr=[227327, 4], y=[20000], train_mask=[20000], val_mask=[20000], test_mask=[20000], etime=[227327])

In [10]:
# Assemble the dataset statistics as one line per fact, then print them in order.
dataset_summary = (
    f'Dataset: {dataset_name}:',
    '======================',
    f'data: {data}',
    f'Number of nodes: {data.num_nodes}',
    f'Number of edges: {data.edge_index.shape[1]}',
    f'Average node degree: {data.num_edges / data.num_nodes:.2f}',
    f'Contains isolated nodes: {data.has_isolated_nodes()}',
    f'Contains self-loops: {data.has_self_loops()}',
    f'Is directed: {data.is_directed()}',
)
for summary_line in dataset_summary:
    print(summary_line)

Dataset: ogbl-tinynftgraph:
data: Data(x=[20000, 8], edge_index=[2, 227327], edge_attr=[227327, 4], y=[20000], train_mask=[20000], val_mask=[20000], test_mask=[20000], etime=[227327])
Number of nodes: 20000
Number of edges: 227327
Average node degree: 11.37
Contains isolated nodes: True
Contains self-loops: True


Is directed: True


In [11]:
# Build the single-graph list handed to the link-prediction saver.
graph_list = []

# NOTE: rebinds `graph` (the DGL graph in the first part) to a plain dict.
graph = dict()
graph['num_nodes'] = int(data.num_nodes)
graph['node_feat'] = np.array(data.x)
# NOTE(review): no edge split has been applied at this point in the notebook,
# so this is the complete edge_index as loaded, and the summary cell reports the
# graph as directed. The earlier remark ("only train pos edge index, both
# directions / undirected") does not match the code. Confirm whether
# validation/test edges are meant to be part of the stored graph.
graph['edge_index'] = data.edge_index.numpy()
graph['edge_feat'] = data.edge_attr.numpy()
graph_list.append(graph)

# Prints the full dictionaries, including the arrays.
print(graph_list)
# saving a list of graphs
saver.save_graph_list(graph_list)

[{'num_nodes': 20000, 'node_feat': array([[2.65018600e+06, 7.21772400e+06, 7.12324960e+08, ...,
        2.41079500e+06, 2.02961487e+07, 4.87968517e+06],
       [2.04070000e+04, 2.26440000e+04, 0.00000000e+00, ...,
        2.26440000e+04, 0.00000000e+00, 1.30683555e+05],
       [6.00000000e+00, 1.20000000e+01, 0.00000000e+00, ...,
        4.72230000e+04, 1.03678274e+05, 1.82144848e+05],
       ...,
       [1.70000000e+01, 1.70000000e+01, 4.19972667e+03, ...,
        4.60000000e+01, 3.89278500e+03, 3.89240212e+02],
       [1.50000000e+01, 1.16000000e+02, 1.84184450e+04, ...,
        5.70000000e+01, 9.97024194e+03, 2.79756077e+02],
       [4.00000000e+00, 4.00000000e+00, 1.98760000e+02, ...,
        6.00000000e+01, 1.21701925e+04, 5.53641076e+02]]), 'edge_index': array([[ 9554,   407,  1325, ..., 13726, 13726, 13726],
       [ 7525,  2972, 16808, ..., 15314,  6856,  8861]]), 'edge_feat': array([[ 42.   ,  43.   ,   0.   , 121.205],
       [249.   , 265.   ,   0.   , 293.97 ],
       [ 43.

Validating...
Reading saved files
Loading necessary files...
This might take a while.
Processing graphs...


100%|██████████| 1/1 [00:00<00:00, 5289.16it/s]


Checking read graphs and given graphs are the same


100%|██████████| 1/1 [00:00<00:00, 10.65it/s]


In [12]:
import math
def custom_train_test_split_edges(data, val_ratio: float = 0.05, test_ratio: float = 0.1):
    r"""Split the edges of ``data`` into positive and negative train/val/test sets.

    ``data`` is modified in place and also returned. After the call:

    * ``val_pos_edge_index``, ``test_pos_edge_index`` and
      ``train_pos_edge_index`` hold disjoint ``(2, k)`` slices of a random
      permutation of the edges with ``source < target``. Edges with
      ``source >= target`` (including self-loops) take no part in the
      positive split.
    * ``val_pos_edge_attr``, ``test_pos_edge_attr`` and
      ``train_pos_edge_attr`` hold the matching rows of ``data.edge_attr``
      when that attribute is not ``None``.
    * ``val_neg_edge_index``, ``test_neg_edge_index`` and
      ``train_neg_edge_index`` hold randomly sampled node pairs ``(a, b)``
      with ``a < b`` that do not appear as a column of ``data.edge_index``.
      Each set is sized like its positive counterpart as long as enough
      candidates were drawn; otherwise a warning is issued and the later
      sets come out shorter.
    * ``edge_index`` is the original, complete edge index again, while
      ``edge_attr`` is left set to ``None``.

    Negative candidates are drawn with ``np.random`` and permutations with
    ``torch.randperm``, so both generators must be seeded for a
    reproducible split.

    Args:
        data (Data): The data object; must not contain a ``batch`` key.
        val_ratio (float, optional): The ratio of positive validation edges.
            (default: :obj:`0.05`)
        test_ratio (float, optional): The ratio of positive test edges.
            (default: :obj:`0.1`)

    :rtype: :class:`torch_geometric.data.Data`
    """
    import warnings

    assert 'batch' not in data  # No batch-mode.

    num_nodes = data.num_nodes
    original_edge_index = data.edge_index
    row, col = data.edge_index
    edge_attr = data.edge_attr
    data.edge_index = data.edge_attr = None

    # Keep only the upper-triangular portion (source < target).
    upper = row < col
    row, col = row[upper], col[upper]
    if edge_attr is not None:
        edge_attr = edge_attr[upper]

    n_v = int(math.floor(val_ratio * row.size(0)))
    n_t = int(math.floor(test_ratio * row.size(0)))

    # Positive edges: shuffle once, then cut into val / test / train slices.
    perm = torch.randperm(row.size(0))
    row, col = row[perm], col[perm]
    if edge_attr is not None:
        edge_attr = edge_attr[perm]

    data.val_pos_edge_index = torch.stack([row[:n_v], col[:n_v]], dim=0)
    if edge_attr is not None:
        data.val_pos_edge_attr = edge_attr[:n_v]

    data.test_pos_edge_index = torch.stack(
        [row[n_v:n_v + n_t], col[n_v:n_v + n_t]], dim=0)
    if edge_attr is not None:
        data.test_pos_edge_attr = edge_attr[n_v:n_v + n_t]

    data.train_pos_edge_index = torch.stack(
        [row[n_v + n_t:], col[n_v + n_t:]], dim=0)
    if edge_attr is not None:
        data.train_pos_edge_attr = edge_attr[n_v + n_t:]

    # The complete (unsplit) edge index is put back on the object.
    data.edge_index = original_edge_index
    num_edges = original_edge_index.shape[1]

    # Negative edges: draw as many random node pairs as there are edges and
    # order each pair as (min, max).
    candidates = np.random.randint(
        low=0, high=num_nodes, size=(2 * num_edges,)).reshape((num_edges, 2))
    candidates = np.sort(candidates, axis=1)

    # A pair (a, a) is a self-loop, not a negative edge between two nodes.
    candidates = candidates[candidates[:, 0] != candidates[:, 1]]

    # Unique candidate pairs, in lexicographic order.
    candidates = np.unique(candidates, axis=0)

    # Stack positives on top of the candidates and keep first occurrences only.
    # Rows 0 .. num_edges - 1 are the positives, so a surviving row index
    # >= num_edges is a candidate that matches no positive edge. The previous
    # strict `>` comparison wrongly discarded the first candidate as well.
    positives = original_edge_index.t().cpu().numpy()
    all_edges = np.concatenate((positives, candidates), axis=0)
    _, first_idx = np.unique(all_edges, return_index=True, axis=0)
    first_idx = np.sort(first_idx)
    first_idx = first_idx[first_idx >= num_edges]
    neg_edge_list = torch.tensor(all_edges[first_idx])

    n_train = data.train_pos_edge_index.shape[1]
    needed = n_v + n_t + n_train
    if neg_edge_list.shape[0] < needed:
        # Slicing below silently yields shorter tensors; make that visible.
        warnings.warn(
            f'only {neg_edge_list.shape[0]} negative edges were sampled but '
            f'{needed} are needed; negative splits will be smaller than the '
            f'positive ones')

    # Hand out the negatives in the same val / test / train proportions.
    ind = torch.randperm(neg_edge_list.shape[0])
    data.val_neg_edge_index = neg_edge_list[ind[:n_v]].t()
    data.test_neg_edge_index = neg_edge_list[ind[n_v:n_v + n_t]].t()
    data.train_neg_edge_index = neg_edge_list[ind[n_v + n_t:needed]].t()

    return data

In [13]:
# Share of the positive edges used for validation and test in the link split
# (rebinds val_ratio, which the node split also used).
val_ratio = 0.2
test_ratio = 0.4

In [14]:
# Split the edges; the function modifies `data` in place and returns the same object.
data = custom_train_test_split_edges(data, val_ratio=val_ratio, test_ratio = test_ratio)

In [15]:
# Remove the node-level split masks from the data object
# (presumably because they belong to the node-classification task only).
del data.train_mask,data.val_mask,data.test_mask

In [16]:
# Display the data summary after the edge split.
data

Data(x=[20000, 8], y=[20000], etime=[227327], val_pos_edge_index=[2, 28776], val_pos_edge_attr=[28776, 4], test_pos_edge_index=[2, 57553], test_pos_edge_attr=[57553, 4], train_pos_edge_index=[2, 57554], train_pos_edge_attr=[57554, 4], edge_index=[2, 227327], val_neg_edge_index=[2, 28776], test_neg_edge_index=[2, 57553], train_neg_edge_index=[2, 57554])

In [17]:
# One dictionary per split; filled in the following cells.
split_edge = {'train': {}, 'valid': {}, 'test': {}}

In [18]:
# For every split, store the positive pairs under 'edge' and the sampled
# negative pairs under 'edge_neg'. The (2, k) index tensors are transposed so
# that each row is one (source, target) pair, kept in one direction only.
for split_key, attr_stem in (('train', 'train'), ('valid', 'val'), ('test', 'test')):
    split_edge[split_key]['edge'] = getattr(data, f'{attr_stem}_pos_edge_index').t()
    split_edge[split_key]['edge_neg'] = getattr(data, f'{attr_stem}_neg_edge_index').t()

In [19]:
# Attach the edge features of the positive edges to each split.
# NOTE(review): the *_pos_edge_attr tensors are (num_edges, 4); after .t() they
# are stored as (4, num_edges), whereas the 'edge' tensors in the same split are
# (num_edges, 2). Confirm that consumers expect this transposed layout.
split_edge['train']['edge_attr'] = data.train_pos_edge_attr.t()
split_edge['test']['edge_attr'] = data.test_pos_edge_attr.t()
split_edge['valid']['edge_attr'] = data.val_pos_edge_attr.t()

In [20]:
# Store the edge split under the split name 'random'.
saver.save_split(split_edge, split_name = 'random')

In [21]:
# Display the edge split dictionary.
split_edge

{'train': {'edge': tensor([[    0,  8129],
          [ 7942, 16786],
          [  136,  2093],
          ...,
          [  167, 17092],
          [    0,   723],
          [ 1317,  2311]]),
  'edge_neg': tensor([[10825, 19296],
          [12958, 19563],
          [15646, 16584],
          ...,
          [12952, 14736],
          [ 6187, 18765],
          [  129, 11565]]),
  'edge_attr': tensor([[3.0000e+00, 2.0000e+00, 1.0000e+00,  ..., 1.0000e+00, 2.9000e+01,
           1.0000e+00],
          [1.6000e+01, 2.0000e+00, 1.0000e+00,  ..., 1.0000e+00, 4.9000e+02,
           1.0000e+00],
          [0.0000e+00, 5.5200e+01, 1.9400e-01,  ..., 0.0000e+00, 4.6522e+02,
           1.1069e+03],
          [2.6710e+01, 4.2680e+01, 3.5878e+00,  ..., 1.7800e+00, 2.2647e+02,
           2.7760e+01]], dtype=torch.float64)},
 'valid': {'edge': tensor([[  668, 18719],
          [ 3563,  8939],
          [   53, 16962],
          ...,
          [  204,  6487],
          [ 6001, 13667],
          [   93, 1909

In [22]:
mapping_path = prefix+'/mapping/'

# prepare mapping information first and store it under this directory (empty below).
os.makedirs(mapping_path,exist_ok=True)

# Make sure an (empty) README.md exists in the mapping directory. The explicit
# existence check replaces a bare `except:` around os.mknod, which reported
# every failure (permissions, missing os.mknod on some platforms, ...) as
# "already exists" and carried on.
readme_path = os.path.join(mapping_path, 'README.md')
if os.path.exists(readme_path):
    print("Readme.md already exists.")
else:
    open(readme_path, 'a').close()
saver.copy_mapping_dir(mapping_path)

Readme.md already exists.


In [23]:
# Record the task metadata: link prediction, scored with MRR.
saver.save_task_info(task_type = 'link prediction', eval_metric = 'mrr')

link prediction
None


In [24]:
# Metadata dictionary of the link-prediction export (rebinds meta_dict);
# printed for inspection and passed to the dataset loader below.
meta_dict = saver.get_meta_dict()
print(meta_dict)

{'version': 1, 'dir_path': '/data/sx/NFTGraph/datasets/ogb_graph/submission_ogbl_tinynftgraph/tinynftgraph', 'binary': 'True', 'task type': 'link prediction', 'eval metric': 'mrr', 'add_inverse_edge': 'False', 'split': 'random', 'download_name': 'tinynftgraph', 'url': 'https://snap.stanford.edu/ogb/data/linkproppred/tinynftgraph.zip', 'has_node_attr': 'True', 'has_edge_attr': 'True', 'additional node files': 'None', 'additional edge files': 'None', 'is hetero': 'False'}


In [25]:
# Package the saved dataset, then let the saver clean up after itself.
# NOTE(review): presumably cleanup() removes the unzipped working directory,
# which is why the archive is extracted again below — confirm.
saver.zip()
saver.cleanup()

In [28]:
# Archive to extract and the directory it is extracted into (used by the next cell).
filedir = prefix+'/datasets/ogb_graph/submission_ogbl_tinynftgraph/tinynftgraph.zip'
dstdirs = prefix+'/datasets/ogb_graph/submission_ogbl_tinynftgraph/tinynftgraph'

In [29]:
# Extract the archive at `filedir` into the directory `dstdirs`.
!unzip $filedir -d $dstdirs

Archive:  /data/sx/NFTGraph/datasets/ogb_graph/submission_ogbl_tinynftgraph/tinynftgraph.zip
   creating: /data/sx/NFTGraph/datasets/ogb_graph/submission_ogbl_tinynftgraph/tinynftgraph/mapping/
   creating: /data/sx/NFTGraph/datasets/ogb_graph/submission_ogbl_tinynftgraph/tinynftgraph/processed/
   creating: /data/sx/NFTGraph/datasets/ogb_graph/submission_ogbl_tinynftgraph/tinynftgraph/raw/
   creating: /data/sx/NFTGraph/datasets/ogb_graph/submission_ogbl_tinynftgraph/tinynftgraph/split/
  inflating: /data/sx/NFTGraph/datasets/ogb_graph/submission_ogbl_tinynftgraph/tinynftgraph/RELEASE_v1.txt  
  inflating: /data/sx/NFTGraph/datasets/ogb_graph/submission_ogbl_tinynftgraph/tinynftgraph/mapping/README.md  
   creating: /data/sx/NFTGraph/datasets/ogb_graph/submission_ogbl_tinynftgraph/tinynftgraph/split/random/
  inflating: /data/sx/NFTGraph/datasets/ogb_graph/submission_ogbl_tinynftgraph/tinynftgraph/split/random/split_dict.pt  


  inflating: /data/sx/NFTGraph/datasets/ogb_graph/submission_ogbl_tinynftgraph/tinynftgraph/raw/data.npz  


In [30]:
# Load the exported link-prediction dataset back using the saver's metadata.
dataset = LinkPropPredDataset(dataset_name,root=prefix+'/datasets/', meta_dict = meta_dict)

Loading necessary files...
This might take a while.
Processing graphs...


100%|██████████| 1/1 [00:00<00:00, 6990.51it/s]

Saving...





In [31]:
# Display the edge split as read back from the loaded dataset.
dataset.get_edge_split()

{'train': {'edge': tensor([[    0,  8129],
          [ 7942, 16786],
          [  136,  2093],
          ...,
          [  167, 17092],
          [    0,   723],
          [ 1317,  2311]]),
  'edge_neg': tensor([[10825, 19296],
          [12958, 19563],
          [15646, 16584],
          ...,
          [12952, 14736],
          [ 6187, 18765],
          [  129, 11565]]),
  'edge_attr': tensor([[3.0000e+00, 2.0000e+00, 1.0000e+00,  ..., 1.0000e+00, 2.9000e+01,
           1.0000e+00],
          [1.6000e+01, 2.0000e+00, 1.0000e+00,  ..., 1.0000e+00, 4.9000e+02,
           1.0000e+00],
          [0.0000e+00, 5.5200e+01, 1.9400e-01,  ..., 0.0000e+00, 4.6522e+02,
           1.1069e+03],
          [2.6710e+01, 4.2680e+01, 3.5878e+00,  ..., 1.7800e+00, 2.2647e+02,
           2.7760e+01]], dtype=torch.float64)},
 'valid': {'edge': tensor([[  668, 18719],
          [ 3563,  8939],
          [   53, 16962],
          ...,
          [  204,  6487],
          [ 6001, 13667],
          [   93, 1909

In [36]:
# Display the edge table again before building the temporal edge list.
dfedges

Unnamed: 0,from,to,timestamp,transferedAmount,value,transactionFee,TxnsCnt
0,0x1439b4d5a72343b68f12398c649df99d76b2af53,0x8c0a11eb047c1097c821c159b8e0c2c5f37f81bf,20220730032737,43,0.00,121.205,42
1,0x9ed3db7a8ec964ef0813edb7bf3ff514a25fae70,0xa5b36a3732937375bc579fa38159347da9938441,20220730021034,265,0.00,293.970,249
2,0xbdc4a5c0ff7275736cad102c7408555fb5d6c495,0x564a8e13d7dd23d5525160d204165bdbcb69b4db,20220729052058,43,0.00,52.096,43
3,0x5e11534344b8c1fda947b37dc57b8734232a6b1c,0xd0c877b474cd51959931a7f70d7a6c60f50cdae7,20220729042239,1,157.01,1.990,1
4,0x58e14b71ef1a30f186000c5fb4e8ab993336f109,0x25f20e56dc90c79bb80896b613dbe9dc6b96ca04,20220729023044,11,16.49,7.230,2
...,...,...,...,...,...,...,...
227322,0x8cb7f1a4f44593ec356b11c70c0a977c647c763c,0xe5798a530bb7105e148d38ac884f05c28ed8e804,20220521042106,1,84.28,3.300,1
227323,0x8cb7f1a4f44593ec356b11c70c0a977c647c763c,0x38e3d4a5bf7dea9280e389864d4bf9834cc2f266,20220521034547,1,84.28,4.530,1
227324,0x8cb7f1a4f44593ec356b11c70c0a977c647c763c,0x3cd378c9b1cb5f147ebf1b2c2564118946ae4ba1,20220520212804,1,84.28,5.120,1
227325,0x8cb7f1a4f44593ec356b11c70c0a977c647c763c,0xbc6819c533db537ad8e169d8d67e3c8971c0417f,20220520204309,2,168.51,13.460,2


In [37]:
# Edge list for the TGB export: timestamp, endpoints (as addresses) and TxnsCnt.
tgb_edges = dfedges[['timestamp','from','to','TxnsCnt']]

In [38]:
# Display the TGB edge list.
tgb_edges

Unnamed: 0,timestamp,from,to,TxnsCnt
0,20220730032737,0x1439b4d5a72343b68f12398c649df99d76b2af53,0x8c0a11eb047c1097c821c159b8e0c2c5f37f81bf,42
1,20220730021034,0x9ed3db7a8ec964ef0813edb7bf3ff514a25fae70,0xa5b36a3732937375bc579fa38159347da9938441,249
2,20220729052058,0xbdc4a5c0ff7275736cad102c7408555fb5d6c495,0x564a8e13d7dd23d5525160d204165bdbcb69b4db,43
3,20220729042239,0x5e11534344b8c1fda947b37dc57b8734232a6b1c,0xd0c877b474cd51959931a7f70d7a6c60f50cdae7,1
4,20220729023044,0x58e14b71ef1a30f186000c5fb4e8ab993336f109,0x25f20e56dc90c79bb80896b613dbe9dc6b96ca04,2
...,...,...,...,...
227322,20220521042106,0x8cb7f1a4f44593ec356b11c70c0a977c647c763c,0xe5798a530bb7105e148d38ac884f05c28ed8e804,1
227323,20220521034547,0x8cb7f1a4f44593ec356b11c70c0a977c647c763c,0x38e3d4a5bf7dea9280e389864d4bf9834cc2f266,1
227324,20220520212804,0x8cb7f1a4f44593ec356b11c70c0a977c647c763c,0x3cd378c9b1cb5f147ebf1b2c2564118946ae4ba1,1
227325,20220520204309,0x8cb7f1a4f44593ec356b11c70c0a977c647c763c,0xbc6819c533db537ad8e169d8d67e3c8971c0417f,2


In [39]:
# Write the TGB edge list without the index column; create the target
# directory first because to_csv does not create missing parent folders.
tgb_edgelist_path = prefix+'/datasets/tgb_graph/tgbl_tinynftgraph/tgbl-tinynftgraph_edgelist.csv'
os.makedirs(os.path.dirname(tgb_edgelist_path), exist_ok=True)
tgb_edges.to_csv(tgb_edgelist_path,index=False)