In [1]:
import dgl
import torch

In [2]:
glist, dims = dgl.load_graphs("oag_cs.bin")
g = glist[0]

In [3]:
train_mask = g.ndata["train_mask"]

In [4]:
import torch

train_mask = {}
for ntype in ["author", "paper", "venue"]:
    idx = torch.nonzero(g.ndata["train_mask"][ntype], as_tuple=False).squeeze()
    train_mask[ntype] = idx

In [5]:
train_mask

{'author': tensor([     1,      3,      4,  ..., 510167, 510168, 510175]),
 'paper': tensor([    34,     39,     47,  ..., 544224, 544233, 544236]),
 'venue': tensor([   0,    1,    5,  ..., 6930, 6931, 6932])}

In [6]:
sampler = dgl.dataloading.MultiLayerFullNeighborSampler(4)
dataloader = dgl.dataloading.DataLoader(
    g,
    {"paper": train_mask["paper"]},
    sampler,
    batch_size=64,
    shuffle=True,
    drop_last=False,
    # num_workers=4,
)

In [7]:
dataloader

<dgl.dataloading.dataloader.DataLoader at 0xfffdfee24c90>

In [8]:
input_nodes, output_nodes, blocks = next(iter(dataloader))



In [9]:
for key, valud in input_nodes.items():
    print(key, valud.shape)

affiliation torch.Size([7830])
author torch.Size([471707])
field torch.Size([45717])
paper torch.Size([544243])
venue torch.Size([6916])


In [10]:
input_nodes

{'affiliation': tensor([  12, 1162,  401,  ..., 1976, 8583, 2821]),
 'author': tensor([ 39272, 166534, 166535,  ..., 471093, 471095, 471096]),
 'field': tensor([19352, 10896, 34017,  ..., 40603, 12309, 27933]),
 'paper': tensor([ 64303, 428584, 254819,  ..., 466368, 535494, 323348]),
 'venue': tensor([ 562, 5285, 3053,  ..., 1862, 3666,  360])}

In [11]:
g.ndata["feat"]

{'affiliation': tensor([[ 0.0029, -0.0018,  0.0022,  ..., -0.0012,  0.0005,  0.0403],
         [ 0.0063,  0.0010,  0.0018,  ..., -0.0010,  0.0011,  0.0296],
         [ 0.0037,  0.0021,  0.0017,  ..., -0.0016,  0.0004,  0.0286],
         ...,
         [ 0.0008, -0.0025,  0.0030,  ..., -0.0001, -0.0033,  0.0099],
         [ 0.0025, -0.0022,  0.0038,  ...,  0.0043, -0.0015,  0.0042],
         [ 0.0029, -0.0014, -0.0007,  ...,  0.0044,  0.0012,  0.0136]]),
 'author': tensor([[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  8.7267e-05,
          -1.3538e-04,  5.7543e-03],
         [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ..., -1.3289e-03,
           9.0451e-04,  9.6308e-03],
         [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ..., -1.7089e-04,
           3.0572e-03,  4.0321e-03],
         ...,
         [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
           0.0000e+00, -1.0000e+00],
         [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
           0.0000e+00, -1.0000e

In [79]:
labels = g.ndata["L1"]["author"]

In [80]:
lbl_dim = dims["L1"]

In [44]:
processed_labels = {}
for task_type in ["L1"]:
    processed_labels[task_type] = {}
    for node_type in ["author"]:
        labels = g.ndata[task_type][node_type]
        if labels.shape[1] != dims[task_type]:
            processed_labels[task_type][node_type] = torch.zeros(
                (labels.shape[0], dims[task_type])
            )
            num_indices = torch.count_nonzero(labels + 1, dim=1)
            for i in range(labels.shape[0]):
                indices = labels[i, : num_indices[i]].to(torch.int)
                processed_labels[task_type][node_type][i, indices] = 1 / num_indices[
                    i
                ].clamp(min=1)
        else:
            processed_labels[task_type][node_type] = labels

RuntimeError: shape mismatch: value tensor of shape [510189, 275] cannot be broadcast to indexing result of shape [668109]

In [66]:
def process_category(labels: torch.tensor, num_classes: int) -> torch.tensor:
    if labels.shape[1] != num_classes:
        processed_labels = torch.zeros((labels.shape[0], num_classes))
        num_indices = torch.count_nonzero(labels + 1, dim=1)
        for i in range(labels.shape[0]):
            indices = labels[i, : num_indices[i]].to(torch.int)
            processed_labels[i, indices] = 1 / num_indices[i].clamp(min=1)
    else:
        processed_labels = labels
    return processed_labels

In [43]:
lb = g.ndata["L1"]["author"]

In [49]:
num_indices

In [67]:
a = process_category(lb[:50], dims["L1"].item())

In [68]:
a

tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [1.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]])

In [34]:
lb[42]

tensor([19., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
        -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
        -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
        -1., -1., -1., -1., -1., -1., -1., -1.])

In [35]:
a[42][19]

tensor(1.)

In [3]:
g.adj(etype='paper-author')


SparseMatrix(indices=tensor([[387067, 134013, 134013,  ..., 102817, 304115, 529407],
                             [     1,      1,      2,  ..., 510171, 392963, 510188]]),
             values=tensor([1., 1., 1.,  ..., 1., 1., 1.]),
             shape=(544244, 510189), nnz=1091559)

In [6]:
import dgl.sparse