In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchmetrics.functional as MF
import dgl
import dgl.nn as dglnn
from dgl.data import AsNodePredDataset
from dgl.dataloading import DataLoader, NeighborSampler, MultiLayerFullNeighborSampler
from ogb.nodeproppred import DglNodePropPredDataset
import tqdm
import numpy as np
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchmetrics.functional as MF
import dgl
import dgl.nn as dglnn
from dgl.data import AsNodePredDataset
from ogb.nodeproppred import DglNodePropPredDataset
from dgl.nn.pytorch import GraphConv
import tqdm
import numpy as np
import time

class SAGE(nn.Module):
    """Multi-layer GraphSAGE model using mean aggregation.

    The network is a stack of ``dglnn.SAGEConv`` layers: one
    ``in_size -> hid_size`` layer, ``num_layers - 2`` hidden
    ``hid_size -> hid_size`` layers, and one ``hid_size -> out_size`` layer.
    ReLU followed by dropout (p=0.5) is applied after every layer except the
    last one.

    Args:
        in_size: Size of the input node features.
        hid_size: Size of the hidden representations.
        out_size: Size of the output representation.
        num_layers: Requested number of layers. The input and output layers
            are always created, so values below 2 still yield two layers.
    """

    def __init__(self, in_size, hid_size, out_size,num_layers=2):
        super().__init__()
        self.layers = nn.ModuleList()
        # GraphSAGE-mean: input layer, (num_layers - 2) hidden layers, output layer
        self.layers.append(dglnn.SAGEConv(in_size, hid_size, 'mean'))
        for _ in range(num_layers - 2):
            self.layers.append(dglnn.SAGEConv(hid_size, hid_size, 'mean'))
        self.layers.append(dglnn.SAGEConv(hid_size, out_size, 'mean'))
        self.dropout = nn.Dropout(0.5)
        # Remembered so that inference() can size its per-layer output buffers.
        self.hid_size = hid_size
        self.out_size = out_size

    def forward(self, blocks, x):
        """Run the model on one sampled mini-batch.

        Args:
            blocks: One message-flow block per layer, ordered from the input
                layer to the output layer.
            x: Input features passed to the first layer.

        Returns:
            The output of the last layer.

        Raises:
            ValueError: If the number of blocks differs from the number of
                layers. Pairing them with ``zip`` alone would silently drop
                the extra layers or blocks and return a result that does not
                correspond to the requested output nodes.
        """
        blocks = list(blocks)
        if len(blocks) != len(self.layers):
            raise ValueError(
                "expected %d blocks (one per layer), got %d"
                % (len(self.layers), len(blocks)))
        last = len(self.layers) - 1
        h = x
        for l, (layer, block) in enumerate(zip(self.layers, blocks)):
            h = layer(block, h)
            if l != last:
                h = F.relu(h)
                h = self.dropout(h)
        return h

    def inference(self, g, device, batch_size):
        """Conduct layer-wise inference to get all the node embeddings.

        Every layer is evaluated for all nodes (in batches, using the full
        one-hop neighborhood) before the next layer starts; the intermediate
        result is kept in a CPU buffer.

        Args:
            g: Graph whose ``ndata['feat']`` holds the input features.
            device: Device (string or ``torch.device``) used for computation.
            batch_size: Number of output nodes per batch.

        Returns:
            A CPU tensor with ``g.num_nodes()`` rows and ``out_size`` columns.

        Note:
            Dropout is applied between layers here as well, so the module
            should be in eval mode for deterministic output.
        """
        # Normalize so that both 'cuda' and torch.device('cuda') work. A
        # torch.device does not compare equal to a plain string, so without
        # this the pin_memory test below is true even for device='cpu'.
        device = torch.device(device)
        feat = g.ndata['feat']
        sampler = MultiLayerFullNeighborSampler(1, prefetch_node_feats=['feat'])
        dataloader = DataLoader(
                g, torch.arange(g.num_nodes()).to(g.device), sampler, device=device,
                batch_size=batch_size, shuffle=False, drop_last=False,
                num_workers=0)
        buffer_device = torch.device('cpu')
        # Pinned host memory is only requested when results are copied back
        # from a different (non-CPU) device.
        pin_memory = (buffer_device != device)

        last = len(self.layers) - 1
        for l, layer in enumerate(self.layers):
            y = torch.empty(
                g.num_nodes(), self.hid_size if l != last else self.out_size,
                device=buffer_device, pin_memory=pin_memory)
            feat = feat.to(device)
            for input_nodes, output_nodes, blocks in tqdm.tqdm(dataloader, position=0):
                x = feat[input_nodes]
                h = layer(blocks[0], x) # len(blocks) = 1
                if l != last:
                    h = F.relu(h)
                    h = self.dropout(h)
                # by design, our output nodes are contiguous
                # (the dataloader iterates over arange(num_nodes) with shuffle=False)
                y[output_nodes[0]:output_nodes[-1]+1] = h.to(buffer_device)
            # The output of this layer is the input of the next one.
            feat = y
        return y
    

In [5]:
import dgl
import pandas
import torch
import numpy as np
# Root directory that holds one sub-directory per dataset.
path = "/raid/bear/dataset"
dataset = "uk-2006-05"

# Binary input files of the selected dataset.
graphbin = f"{path}/{dataset}/graph.bin"
labelbin = f"{path}/{dataset}/labels.bin"  # each node label takes 8 bytes
featsbin = f"{path}/{dataset}/feats_{100:d}.bin"


In [6]:
# Read the whole edge file as one flat int32 array; the following cell splits
# it into alternating source / destination entries.
edges = np.fromfile(graphbin,dtype=np.int32)

In [7]:
# Even positions hold the edge sources, odd positions the edge destinations.
srcs, dsts = edges[0::2], edges[1::2]

In [8]:
# Build the DGL graph from the (source, destination) arrays. No node count is
# passed here; the graph summary printed below reports 77741023 nodes.
# NOTE(review): the feature and label files hold 77741046 entries, i.e. 23 more
# than the graph has nodes — presumably trailing nodes without edges; confirm
# whether num_nodes should be passed explicitly instead of truncating the data.
g = dgl.graph((srcs, dsts))

In [9]:
# Display the graph summary (node/edge counts and stored data schemes).
g

Graph(num_nodes=77741023, num_edges=2965197340,
      ndata_schemes={}
      edata_schemes={})

In [10]:
# Node features: a flat float32 file viewed as rows of 100 values each.
feats = np.fromfile(featsbin, dtype=np.float32)
feats = feats.reshape(-1, 100)
feats.shape

(77741046, 100)

In [11]:
# Keep only the first g.num_nodes() feature rows so that the feature matrix has
# exactly one row per graph node.
feats_tmp = feats[:g.num_nodes()]
feats_tmp.shape

(77741023, 100)

In [12]:
# Read the node labels as a flat int64 array (one 8-byte value per entry).
label = np.fromfile(labelbin,dtype=np.int64)
label.shape

(77741046,)

In [13]:
# Store the features on the graph as node data under the 'feat' key.
# NOTE(review): torch.tensor() copies the array; torch.from_numpy() would avoid
# the copy if sharing memory with the numpy array is acceptable — confirm.
g.ndata['feat'] = torch.tensor(feats_tmp)

In [18]:
# Trim the labels to the graph's node count, the same rule that is applied to
# the features, instead of repeating the node count as a literal; then store
# them on the graph as node data under the 'label' key.
label = label[:g.num_nodes()]
g.ndata['label'] = torch.tensor(label)

In [19]:
                                           
# Use the first 1% of the node IDs (0 .. trainnum - 1) as training nodes.
trainnum = int(g.num_nodes() * 0.01)
train_idx = np.arange(trainnum,dtype=np.int64)

In [20]:
# Convert the index array to an int64 tensor directly. Going through
# torch.Tensor(...) first creates a float32 tensor, which cannot represent
# every integer above 2**24 exactly, so large node IDs could be altered.
# torch.as_tensor also accepts an existing tensor, so the cell can be re-run.
train_idx = torch.as_tensor(train_idx, dtype=torch.int64)
train_idx

tensor([     0,      1,      2,  ..., 777407, 777408, 777409])

In [21]:
# Neighbor sampler configured with three fanout entries of 10.
# NOTE(review): three fanouts presumably produce three blocks per batch, while
# SAGE defaults to num_layers=2 — confirm the model is built with 3 layers.
sampler = NeighborSampler([10,10,10])
# Forwarded to the training DataLoader as its use_uva argument.
use_uva = True

In [22]:
# Display the graph again; the summary now lists the 'feat' and 'label' node data.
g

Graph(num_nodes=77741023, num_edges=2965197340,
      ndata_schemes={'feat': Scheme(shape=(100,), dtype=torch.float32), 'label': Scheme(shape=(), dtype=torch.int64)}
      edata_schemes={})

In [23]:
# Display the number of nodes in the graph.
g.num_nodes()

77741023

In [24]:
# Display the training node IDs.
train_idx

tensor([     0,      1,      2,  ..., 777407, 777408, 777409])

In [25]:
# Mini-batch loader over the training nodes, in fixed order, with batches of 16
# seed nodes produced for the 'cuda' device.
train_dataloader = DataLoader(
    g,
    train_idx,
    sampler,
    device='cuda',
    batch_size=16,
    shuffle=False,
    drop_last=False,
    num_workers=0,
    use_uva=use_uva,
)