In [1]:
import numpy as np
import dgl
import torch
import copy

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the raw binary dumps: edge list, training node ids and node features.
GRAPHPATH = "/home/bear/workspace/single-gnn/data/raid/ogbn_products/graph.bin"
TRAINPATH = "/home/bear/workspace/single-gnn/data/raid/ogbn_products/trainIds.bin"
FEATPATH = "/home/bear/workspace/single-gnn/data/raid/ogbn_products/feat.bin"
featlen = 100       # feature width used to reshape the flat feature file
nodeNUM = 2449029   # length of every per-node table below

# The edge file is read as one flat int32 stream; even slots become `src`, odd slots `dst`.
graph = torch.tensor(np.fromfile(GRAPHPATH, dtype=np.int32))
src, dst = graph[0::2], graph[1::2]

trainIds = torch.tensor(np.fromfile(TRAINPATH, dtype=np.int64))
feat = torch.tensor(np.fromfile(FEATPATH, dtype=np.float32)).reshape(-1, featlen)

# One int32 slot per edge, and a zeroed per-node template that later cells copy from.
edgeTable = torch.zeros_like(src, dtype=torch.int32)
template_array = torch.zeros((nodeNUM,), dtype=torch.int32)

In [3]:
# Compute per-node in/out degree tables on the GPU, then move both back to the host.
# Each table starts from its own zeroed copy of the template so the template stays untouched.
# NOTE(review): dgl.sumDegree is not part of stock DGL — presumably a project-specific kernel.
inNodeTable, outNodeTable = dgl.sumDegree(
    template_array.clone().cuda(),
    template_array.clone().cuda(),
    src.cuda(),
    dst.cuda(),
)
inNodeTable, outNodeTable = inNodeTable.cpu(), outNodeTable.cpu()

In [4]:
# Prepare the PageRank run (propagation direction is reversed: the next cell passes dst before src).
# Every training node starts with value 10000 and is tagged with the bit of the partition it seeds.
nodeValue = template_array.clone()
nodeInfo = template_array.clone()
nodeValue[trainIds] = 10000

partitionNUM = 4
# nodeInfo is an int32 bitmask with one bit per partition, so bit 31 (the sign bit) is off limits.
assert 1 <= partitionNUM <= 31, "partitionNUM must fit in an int32 bitmask"

# Shuffle with a dedicated, seeded generator so the partition assignment is identical on every
# Restart & Run All, without disturbing the global torch RNG state.
PARTITION_SEED = 0
shuffle_rng = torch.Generator().manual_seed(PARTITION_SEED)
shuffled_indices = torch.randperm(trainIds.size(0), generator=shuffle_rng)
r_trainId = trainIds[shuffled_indices]
trainBatch = torch.chunk(r_trainId, partitionNUM, dim=0)

for index, ids in enumerate(trainBatch):
    info = 1 << index  # partition `index` owns bit `index`
    nodeInfo[ids] = info


In [5]:
# Keep host-side snapshots of the pre-propagation tables before the working copies go to the GPU.
raw_nodeInfo = nodeInfo.clone()
raw_edgeTable = edgeTable.clone()

# Move every operand of the kernel to the GPU.
dst, src = dst.cuda(), src.cuda()
edgeTable, inNodeTable = edgeTable.cuda(), inNodeTable.cuda()
nodeValue, nodeInfo = nodeValue.cuda(), nodeInfo.cuda()

# Three propagation rounds; dst is passed before src (reversed direction).
# NOTE(review): the return value is ignored, so per_pagerank presumably updates
# edgeTable / nodeValue / nodeInfo in place — confirm against the kernel.
for i in range(3):
    dgl.per_pagerank(dst, src, edgeTable, inNodeTable, nodeValue, nodeInfo)

# Bring everything back to the host for the CPU-side post-processing below.
dst, src = dst.cpu(), src.cpu()
edgeTable, inNodeTable = edgeTable.cpu(), inNodeTable.cpu()
nodeValue, nodeInfo = nodeValue.cpu(), nodeInfo.cpu()

In [6]:
# Split nodes, PageRank values, features and edges per partition, using the partition bit
# carried in nodeInfo (per node) and edgeTable (per edge).
idList2part = []
prValue2part = []
feat2part = []
edge2part = []

# View the flat edge stream as (edge, 2) rows once; this is loop-invariant.
graph = graph.reshape(-1, 2)

# Iterate over exactly the partitions created earlier instead of a hard-coded 4,
# so changing partitionNUM cannot silently drop or invent partitions here.
for bit_position in range(partitionNUM):
    part_bit = 1 << bit_position
    nodeIndex = (nodeInfo & part_bit) != 0
    edgeIndex = (edgeTable & part_bit) != 0
    nid = torch.nonzero(nodeIndex).reshape(-1)  # node ids tagged with this partition
    eid = torch.nonzero(edgeIndex).reshape(-1)  # edge ids tagged with this partition
    subEdge = graph[eid]
    partValue = nodeValue[nid]
    partFeat = feat[nid]
    idList2part.append(nid)
    prValue2part.append(partValue)
    feat2part.append(partFeat)
    edge2part.append(subEdge)

torch.Size([122260331])
torch.Size([122232607])
torch.Size([122195000])
torch.Size([122224461])


In [7]:
# Process a single partition: fetch its nodes, then order them by descending PageRank value.
partIndex = 0
partid, partValue, partFeat, subG = (
    idList2part[partIndex],
    prValue2part[partIndex],
    feat2part[partIndex],
    edge2part[partIndex],
)

# `indices` is the permutation that sorts the values; apply it to the ids and features too.
prValue, indices = partValue.sort(descending=True)
partid, partFeat = partid[indices], partFeat[indices]

In [13]:
# Upload the partition's edge columns and its node-id list, then let the GPU kernel remap them.
# `.contiguous()` copies only the selected column; copy.deepcopy on a strided column view
# would duplicate the whole (E, 2) storage once per column.
srcList = subG[:, 0].contiguous().cuda()
dstList = subG[:, 1].contiguous().cuda()

# Allocate the output table directly on the GPU (one int32 slot per partition node).
uniTable = torch.zeros_like(partid, dtype=torch.int32, device="cuda")
partid = partid.to(torch.int32).cuda()

# NOTE(review): dgl.mapByNodeSet is not part of stock DGL — presumably it rewrites the edge
# endpoints into positions within `partid`; confirm against the kernel.
srcList, dstList, uniTable = dgl.mapByNodeSet(partid, uniTable, srcList, dstList)

srcList = srcList.cpu()
dstList = dstList.cpu()
uniTable = uniTable.cpu()

In [15]:
# Order the edges by destination id (the original author flagged this sort as needing improvement)
# and apply the same permutation to the sources so the pairs stay aligned.
sort_dstList, indice = dstList.sort(dim=0)
sort_srcList = srcList.index_select(0, indice)

In [20]:
# Sizes of the remapped partition.
nodeSize, edgeSize = uniTable.size(0), srcList.size(0)

# The first 10% of node ids form the "fixed" set.
fix_NUM = int(nodeSize * 0.1)

# First position in the destination-sorted edge list whose destination is >= fix_NUM,
# i.e. every edge before `position` points at a fixed node.
position = torch.searchsorted(sort_dstList, fix_NUM, right=False)

122260331
234171


tensor(41989236)

In [38]:
# Build the CSR arrays for the fixed node set and keep the remaining edges for sampling.
fix_indice = sort_srcList[:position]

# minlength pins the histogram to exactly fix_NUM rows. Without it, trailing fixed nodes that
# have no incoming edge are dropped and fix_inptr ends up shorter than fix_NUM + 1 entries.
fix_counts = torch.bincount(sort_dstList[:position], minlength=fix_NUM)

# Row pointers: a leading 0 followed by the running edge count, all int32 from the start
# (no float seed element, no reliance on dtype promotion inside torch.cat).
fix_inptr = torch.cat([
    torch.zeros(1, dtype=torch.int32),
    torch.cumsum(fix_counts, dim=0).to(torch.int32),
])

# Edges whose destination lies outside the fixed set.
random_dst = sort_dstList[position:]
random_src = sort_srcList[position:]

In [39]:
# mapTable[node] is the node's row in the combined CSR, or -1 when the node is not present.
mapTable = torch.full_like(uniTable, -1, dtype=torch.int32)

# Fixed nodes are ids 0 .. fix_NUM-1 (edges with dst < fix_NUM) and map to themselves.
# arange(fix_NUM - 1) would leave node fix_NUM-1 at -1 even though it owns a fixed CSR row
# and the sampled rows start at index fix_NUM.
fix_index = torch.arange(fix_NUM, dtype=torch.int32)
mapTable[fix_index.to(torch.int64)] = fix_index

In [40]:
# Take one contiguous window of the non-fixed edges (positions 30M up to, not including, 60M).
choice_window = slice(30000000, 60000000)
choice_src, choice_dst = random_src[choice_window], random_dst[choice_window]

In [48]:
# Destination ids that occur at least once in the chosen window, in ascending order.
choice_ids = torch.bincount(choice_dst).nonzero().flatten()
choice_ids

tensor([ 593590,  593591,  593592,  ..., 1219406, 1219407, 1219408])

In [49]:
# Consecutive row indices for the chosen destinations, starting right after the fixed rows.
choice_index = torch.arange(fix_NUM, fix_NUM + len(choice_ids), dtype=torch.int32)
choice_index

tensor([234171, 234172, 234173,  ..., 859605, 859606, 859607],
       dtype=torch.int32)

In [50]:
# Record, for every chosen destination id, the row index it was assigned above.
mapTable[choice_ids] = choice_index

In [55]:
# In-window edge count of each chosen destination, in the same order as choice_ids.
cumList = torch.bincount(choice_dst).index_select(0, choice_ids)
cumList

tensor([71, 36, 25,  ..., 80, 23, 57])

In [56]:
# The chosen edges' sources become the second part of the index array as-is.
choice_indice = choice_src
# Row pointers for the chosen rows continue from the last fixed row pointer.
choice_inptr = fix_inptr[-1] + cumList.cumsum(dim=0).to(torch.int32)

In [58]:
# Stitch the fixed and chosen parts together into one row-pointer array and one index array.
inptr = torch.cat((fix_inptr, choice_inptr), dim=0)
indice = torch.cat((fix_indice, choice_indice), dim=0)

In [59]:
# Display the concatenated row-pointer array (fixed rows followed by chosen rows).
inptr

tensor([       0,     6169,    16086,  ..., 71989156, 71989179, 71989236],
       dtype=torch.int32)

In [60]:
# Display the number of entries in the concatenated index array.
indice.shape

torch.Size([71989236])