In [None]:
import dgl
import torch
import numpy as np
import os
import random
import pandas as pd
import bidict

def set_seed(seed=3407):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU and, when CUDA is available, every CUDA device), and exports
    ``PYTHONHASHSEED`` to the environment.

    Args:
        seed: Integer seed shared by all generators (default ``3407``).
    """
    # Hash randomisation of the running interpreter is fixed at start-up, so
    # this only affects processes launched after this call.
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.deterministic = True
        # The cuDNN autotuner may select different kernels from run to run;
        # switch it off so deterministic mode is actually effective.
        torch.backends.cudnn.benchmark = False

In [None]:
# Root directory holding raw_data/ and datasets/. Set the NFTGRAPH_ROOT
# environment variable to run the notebook on another machine without editing
# it; the original location remains the default.
prefix = os.environ.get('NFTGRAPH_ROOT', '/data/sx/NFTGraph')

In [None]:
dfnodes = pd.read_csv(prefix+'/raw_data/nodes.csv')

In [None]:
dfnodes

In [None]:
node_dict = bidict.bidict()

In [None]:
labels = dfnodes[['addr','label']].to_numpy()
labels

In [None]:
for i in range(labels.shape[0]):
    node_dict[i] = labels[i][0]

In [None]:
node_features = dfnodes[['addr','OutCnt','OutAmount','OutValue','OutTransFee','InCnt','InAmount','InValue','InTransFee']].to_numpy()

In [None]:
features = node_features[:,1:]

In [None]:
train_ratio, val_ratio = 0.4, 0.2

# Seed every RNG right before shuffling so that "Restart & Run All"
# reproduces exactly the same train/val/test masks.
set_seed()

# Partition node ids by label so each class is split with the same ratios.
nodes_anomaly = []
nodes_non_anomaly = []
for i, node_label in enumerate(labels[:, 1]):
    if node_label == 1:
        nodes_anomaly.append(i)
    else:
        nodes_non_anomaly.append(i)

n = labels.shape[0]


def _shuffle_and_split(indices, train_ratio, val_ratio):
    """Shuffle ``indices`` in place and return its (train, val, test) slices."""
    random.shuffle(indices)
    train_end = int(len(indices) * train_ratio)
    val_end = int(len(indices) * (train_ratio + val_ratio))
    return indices[:train_end], indices[train_end:val_end], indices[val_end:]


def _build_mask(num_nodes, indices):
    """Return a boolean tensor of length ``num_nodes`` that is True at ``indices``."""
    mask = torch.zeros(num_nodes, dtype=torch.bool)
    # Explicit long dtype keeps an empty index list valid for indexing.
    mask[torch.tensor(indices, dtype=torch.long)] = True
    return mask


# Anomalous nodes are shuffled first, then the remaining nodes.
train_ones, val_ones, test_ones = _shuffle_and_split(nodes_anomaly, train_ratio, val_ratio)
train_zeros, val_zeros, test_zeros = _shuffle_and_split(nodes_non_anomaly, train_ratio, val_ratio)

train_mask = _build_mask(n, train_ones + train_zeros)
val_mask = _build_mask(n, val_ones + val_zeros)
test_mask = _build_mask(n, test_ones + test_zeros)

In [None]:
dfedges = pd.read_csv(prefix+'/raw_data/edges.csv')

In [None]:
dfedges

In [None]:
edges = dfedges[['from','to']].to_numpy()
edges

In [None]:
new_edges = np.zeros_like(edges)

In [None]:
for i in range(edges.shape[0]):
    new_edges[i][0] = node_dict.inv[edges[i][0]]
    new_edges[i][1] = node_dict.inv[edges[i][1]]

In [None]:
# Pass num_nodes explicitly. Without it DGL sizes the graph from the largest
# node id that appears in an edge, so nodes with the highest ids and no edges
# would be missing and the per-node assignments below would fail on a length
# mismatch.
graph = dgl.graph(
    (new_edges[:,0].astype(int), new_edges[:,1].astype(int)),
    num_nodes=labels.shape[0],
)
# Node-level data: split masks, the integer label and the numeric features.
graph.ndata['train_mask'] = train_mask
graph.ndata['val_mask'] = val_mask
graph.ndata['test_mask'] = test_mask
graph.ndata['label'] = torch.tensor(labels[:,1].astype(int))
graph.ndata['feature'] = torch.tensor(features.astype(float))

In [None]:
# Convert through NumPy first: handing torch.tensor() a pandas Series makes it
# walk the Series element by element using label-based lookups, which is slow
# and breaks as soon as the index is not the default 0..n-1 range.
graph.edata['timestamp'] = torch.tensor(dfedges['timestamp'].astype(int).to_numpy())

In [None]:
# Per-edge feature matrix: one row per edge, one column per listed attribute.
efeatures = dfedges[['TxnsCnt','transferedAmount','value','transactionFee']].to_numpy()

In [None]:
# Attach the edge features to the DGL graph next to the timestamps.
graph.edata['feature'] = torch.tensor(efeatures)

In [None]:
graph

In [None]:
# Persist the DGL graph (as a one-element list) under the datasets directory.
dgl.save_graphs(prefix+'/datasets/dgl_graph/nftgraph', [graph])

In [None]:
import torch
from torch_geometric.data import Data

# Stack source and destination ids into a (2, num_edges) edge_index tensor.
src_ids, dst_ids = graph.edges()
c = torch.stack([src_ids, dst_ids], dim=0).contiguous()

# Mirror the DGL node and edge data in a PyG Data object.
data = Data(
    x=graph.ndata['feature'],
    edge_index=c,
    y=graph.ndata['label'],
    train_mask=graph.ndata['train_mask'],
    val_mask=graph.ndata['val_mask'],
    test_mask=graph.ndata['test_mask'],
    edge_attr=graph.edata['feature'],
    etime=graph.edata['timestamp'],
)

In [None]:
data

In [None]:
# Persist the PyG Data object with torch.save.
torch.save(data,prefix+'/datasets/pyg_graph/nftgraph')

In [None]:
# Reload the file just written; `data` is rebound to the loaded copy, which is
# what every later cell uses.
# NOTE(review): recent PyTorch releases default torch.load to weights_only=True,
# which may refuse to unpickle a Data object - confirm against the installed version.
data = torch.load(prefix+'/datasets/pyg_graph/nftgraph')
data

In [None]:
from ogb.io import DatasetSaver
from ogb.nodeproppred import NodePropPredDataset

In [None]:
dataset_name = 'ogbn-nftgraph'

In [None]:
saver = DatasetSaver(dataset_name = dataset_name,root=prefix+'/datasets/ogb_graph/submission', is_hetero = False, version = 1)

In [None]:
g = dict()
labels = np.array(data.y)
graph_list = []

In [None]:
data

In [None]:
# fill dict
g['num_nodes'] = int(data.num_nodes)
g['node_feat'] = np.array(data.x) # axis = 1 is column!
g['edge_index'] = np.array(data.edge_index)
g['edge_feat'] = np.array(data.edge_attr)
g['edge_time'] = np.array(data.etime)
# saving a list of graphs
graph_list.append(g)
saver.save_graph_list(graph_list)
saver.save_target_labels(labels.reshape(-1,1))

In [None]:
# Turn the boolean node masks into index tensors for the saver.
split_idx = dict()
num_data = len(labels)
# nonzero() returns shape (k, 1). reshape(-1) always yields a 1-D tensor,
# whereas squeeze() would collapse a split with a single node to a 0-d scalar.
split_idx['train'] = data.train_mask.nonzero().reshape(-1)
split_idx['valid'] = data.val_mask.nonzero().reshape(-1)
split_idx['test'] = data.test_mask.nonzero().reshape(-1)
saver.save_split(split_idx, split_name = 'random')

In [None]:
mapping_path = prefix+'/datasets/mapping'
os.makedirs(mapping_path,exist_ok=True)
# Make sure the mapping directory contains a README.md placeholder. Checking for
# the file explicitly (instead of os.mknod inside a bare except) means genuine
# failures such as missing permissions surface instead of being reported as
# "already exists".
readme_path = os.path.join(mapping_path, 'README.md')
if os.path.exists(readme_path):
    print("Readme.md already exists.")
else:
    with open(readme_path, 'a'):
        pass
saver.copy_mapping_dir(mapping_path)

In [None]:
# Record the task metadata for the node-level dataset.
saver.save_task_info(task_type = 'binary classification', eval_metric = 'rocauc', num_classes = 2)

In [None]:
# Keep the meta dictionary; it is passed to NodePropPredDataset below.
meta_dict = saver.get_meta_dict()

In [None]:
# Zip the saved dataset, then let the saver clean up after itself.
saver.zip()
saver.cleanup()

In [None]:
# Archive to extract and the directory it is extracted into.
filedir = prefix+'/datasets/ogb_graph/submission_ogbn_nftgraph/nftgraph.zip'
dstdirs = prefix+'/datasets/ogb_graph/submission_ogbn_nftgraph/nftgraph'

In [None]:
# Shell command: IPython substitutes the two Python variables into the line.
!unzip $filedir -d $dstdirs

In [None]:
# NOTE(review): root here is prefix+'/dataset/' while every other path in the
# notebook uses '/datasets/' - confirm whether that is intentional.
dataset = NodePropPredDataset(dataset_name,root=prefix+'/dataset/',meta_dict = meta_dict)

In [None]:
# Despite its name, split_edge holds the node index split returned by get_idx_split().
split_edge = dataset.get_idx_split()

In [None]:
from ogb.io import DatasetSaver
from ogb.linkproppred import LinkPropPredDataset

In [None]:
dataset_name = 'ogbl-nftgraph'

In [None]:
saver = DatasetSaver(dataset_name = dataset_name,root=prefix+'/datasets/ogb_graph/submission', is_hetero = False, version = 1)

In [None]:
data

In [None]:
print(f'Dataset: {dataset_name}:')
print('======================')
print(f'data: {data}')
print(f'Number of nodes: {data.num_nodes}')
print(f'Number of edges: {data.edge_index.shape[1]}')
print(f'Average node degree: {data.num_edges / data.num_nodes:.2f}')
print(f'Contains isolated nodes: {data.has_isolated_nodes()}')
print(f'Contains self-loops: {data.has_self_loops()}')
print(f'Is directed: {data.is_directed()}')

In [None]:
# Describe the single graph as a dictionary of NumPy arrays. At this point
# data.edge_index still holds every edge of the reloaded graph; the
# train/val/test edge split is computed later in the notebook.
# Note that `graph` is rebound here from the DGL graph to this plain dict.
graph = {
    'num_nodes': int(data.num_nodes),
    'node_feat': np.array(data.x),
    'edge_index': data.edge_index.numpy(),
    'edge_feat': data.edge_attr.numpy(),
}
graph_list = [graph]

print(graph_list)
# The saver takes a list of graphs; this dataset has exactly one.
saver.save_graph_list(graph_list)

In [None]:
import math
def custom_train_test_split_edges(data, val_ratio: float = 0.05, test_ratio: float = 0.1):
    r"""Split the edges of ``data`` into positive and negative train/val/test sets.

    Positive edges are the edges of ``data.edge_index`` whose source id is
    smaller than their target id. They are shuffled and cut into a validation
    slice, a test slice and a training slice (the remainder).

    Negative edges are drawn by sampling random node pairs, ordering each pair
    as ``(smaller id, larger id)``, dropping self-loops and duplicates, and
    discarding every pair that also occurs as a row of ``data.edge_index``.
    Random sampling is used instead of a dense ``num_nodes x num_nodes``
    adjacency mask because the latter allocates too much memory.

    The following attributes are set on ``data``:

    * ``val_pos_edge_index``, ``test_pos_edge_index``, ``train_pos_edge_index``
      with shape ``(2, k)``;
    * ``val_pos_edge_attr``, ``test_pos_edge_attr``, ``train_pos_edge_attr``
      (only if ``data.edge_attr`` is not ``None``);
    * ``val_neg_edge_index``, ``test_neg_edge_index``, ``train_neg_edge_index``
      with shape ``(2, k)``.

    ``data.edge_index`` is left unchanged and ``data.edge_attr`` is set to
    ``None``.

    Note:
        The number of candidate pairs equals the number of columns of
        ``data.edge_index``. If too few candidates survive filtering, the
        negative sets (the training one first) hold fewer edges than their
        positive counterparts; no error is raised.

        Randomness comes from the global PyTorch and NumPy generators, so seed
        both for a reproducible split.

    Args:
        data (Data): The data object. It is modified in place.
        val_ratio (float, optional): The ratio of positive validation edges.
            (default: :obj:`0.05`)
        test_ratio (float, optional): The ratio of positive test edges.
            (default: :obj:`0.1`)

    :rtype: :class:`torch_geometric.data.Data`
    """

    assert 'batch' not in data  # No batch-mode.

    num_nodes = data.num_nodes
    original_edge_index = data.edge_index
    row, col = original_edge_index
    edge_attr = data.edge_attr
    # The per-split attribute tensors below replace the global edge_attr.
    data.edge_attr = None

    # Keep a single orientation (source < target); this also drops self-loops.
    mask = row < col
    row, col = row[mask], col[mask]
    if edge_attr is not None:
        edge_attr = edge_attr[mask]

    num_pos = row.size(0)
    n_v = int(math.floor(val_ratio * num_pos))
    n_t = int(math.floor(test_ratio * num_pos))

    # Positive edges: shuffle once, then cut into val / test / train slices.
    perm = torch.randperm(num_pos)
    row, col = row[perm], col[perm]
    if edge_attr is not None:
        edge_attr = edge_attr[perm]

    split_slices = (
        ('val', slice(0, n_v)),
        ('test', slice(n_v, n_v + n_t)),
        ('train', slice(n_v + n_t, None)),
    )
    for split_name, split_slice in split_slices:
        pos_edge_index = torch.stack([row[split_slice], col[split_slice]], dim=0)
        setattr(data, f'{split_name}_pos_edge_index', pos_edge_index)
        if edge_attr is not None:
            setattr(data, f'{split_name}_pos_edge_attr', edge_attr[split_slice])
    n_train = data.train_pos_edge_index.shape[1]

    # Negative candidates: one random node pair per existing edge.
    num_candidates = original_edge_index.shape[1]
    candidates = np.random.randint(
        low=0, high=num_nodes, size=(2 * num_candidates,)
    ).reshape((num_candidates, 2))

    # Order every pair as (smaller id, larger id), like the positive edges.
    lower = candidates.min(axis=1)
    upper = candidates.max(axis=1)
    # A node paired with itself is not a meaningful negative edge.
    not_loop = lower != upper
    candidates = np.stack((lower[not_loop], upper[not_loop]), axis=1)
    candidates = np.unique(candidates, axis=0)

    # Put the real edges in front of the candidates. np.unique reports the
    # first occurrence of each row, so a candidate equal to a real edge
    # resolves to an index inside the positive range and is filtered out.
    positives = original_edge_index.t().cpu().numpy()
    all_edges = np.concatenate((positives, candidates), axis=0)
    _, first_seen = np.unique(all_edges, return_index=True, axis=0)
    first_seen = np.sort(first_seen)
    # Candidates start at index len(positives); ">=" keeps the first of them.
    first_seen = first_seen[first_seen >= positives.shape[0]]
    neg_edge_list = torch.tensor(all_edges[first_seen])

    # Shuffle the negatives and hand out as many as there are positives per split.
    ind = torch.randperm(neg_edge_list.shape[0])
    data.val_neg_edge_index = neg_edge_list[ind[:n_v]].t()
    data.test_neg_edge_index = neg_edge_list[ind[n_v:n_v + n_t]].t()
    data.train_neg_edge_index = neg_edge_list[ind[n_v + n_t:n_v + n_t + n_train]].t()

    return data

In [None]:
val_ratio = 0.2
test_ratio = 0.4

In [None]:
data = custom_train_test_split_edges(data, val_ratio=val_ratio, test_ratio = test_ratio)

In [None]:
data

In [None]:
del data.train_mask,data.val_mask,data.test_mask

In [None]:
split_edge = {'train': {}, 'valid': {}, 'test': {}}

In [None]:
split_edge['train']['edge'] = data.train_pos_edge_index.t() # these are only one directional
split_edge['train']['edge_neg'] = data.train_neg_edge_index.t() # these are only one directional
split_edge['valid']['edge'] = data.val_pos_edge_index.t() # these are only one directional
split_edge['valid']['edge_neg'] = data.val_neg_edge_index.t()  # these are only one directional
split_edge['test']['edge'] = data.test_pos_edge_index.t()  # these are only one directional
split_edge['test']['edge_neg'] = data.test_neg_edge_index.t()  # these are only one directional

In [None]:
split_edge['train']['edge_attr'] = data.train_pos_edge_attr.t() # these are only one directional
split_edge['test']['edge_attr'] = data.test_pos_edge_attr.t() # these are only one directional
split_edge['valid']['edge_attr'] = data.val_pos_edge_attr.t() # these are only one directional

In [None]:
saver.save_split(split_edge, split_name = 'random')

In [None]:
split_edge

In [None]:
mapping_path = prefix+'/mapping/'

# prepare mapping information first and store it under this directory (empty below).
os.makedirs(mapping_path,exist_ok=True)
# Make sure the mapping directory contains a README.md placeholder. Checking for
# the file explicitly (instead of os.mknod inside a bare except) means genuine
# failures such as missing permissions surface instead of being reported as
# "already exists".
readme_path = os.path.join(mapping_path, 'README.md')
if os.path.exists(readme_path):
    print("Readme.md already exists.")
else:
    with open(readme_path, 'a'):
        pass
saver.copy_mapping_dir(mapping_path)

In [None]:
# Record the task metadata for the link-level dataset.
saver.save_task_info(task_type = 'link prediction', eval_metric = 'mrr')

In [None]:
# Keep the meta dictionary; it is passed to LinkPropPredDataset below.
meta_dict = saver.get_meta_dict()
print(meta_dict)

In [None]:
# Zip the saved dataset, then let the saver clean up after itself.
saver.zip()
saver.cleanup()

In [None]:
# Archive to extract and the directory it is extracted into.
filedir = prefix+'/datasets/ogb_graph/submission_ogbl_nftgraph/nftgraph.zip'
dstdirs = prefix+'/datasets/ogb_graph/submission_ogbl_nftgraph/nftgraph'

In [None]:
# Shell command: IPython substitutes the two Python variables into the line.
!unzip $filedir -d $dstdirs

In [None]:
# Load the dataset back through the OGB loader using the saver's meta dictionary.
dataset = LinkPropPredDataset(dataset_name,root=prefix+'/datasets/', meta_dict = meta_dict)

In [None]:
# Display the edge split as returned by the loader.
dataset.get_edge_split()

In [None]:
dfedges

In [None]:
tgb_edges = dfedges[['timestamp','from','to','TxnsCnt']]

In [None]:
tgb_edges

In [None]:
tgb_edges.to_csv(prefix+'/datasets/tgb_graph/tgbl_nftgraph/tgbl-nftgraph_edgelist.csv',index=False)