In [2]:
import dgl
import torch as th

In [3]:
glist, dims = dgl.load_graphs("oag_cs.bin")
g = glist[0]

In [5]:

train_mask = g.nodes['paper'].data["train_mask"]
test_mask = g.nodes['paper'].data["test_mask"]
train_idx = th.nonzero(train_mask, as_tuple=False).squeeze()
test_idx = th.nonzero(test_mask, as_tuple=False).squeeze()

In [8]:
subg=dgl.sampling.sample_neighbors(g,{'paper':train_idx[:64]},1)

In [17]:
sg = dgl.to_block(subg, {"paper": train_idx[:64]})

In [18]:
sg

Block(num_src_nodes={'affiliation': 0, 'author': 60, 'field': 59, 'paper': 125, 'venue': 1},
      num_dst_nodes={'affiliation': 0, 'author': 0, 'field': 0, 'paper': 64, 'venue': 0},
      num_edges={('affiliation', 'affiliation-author', 'author'): 0, ('author', 'author-affiliation', 'affiliation'): 0, ('author', 'author-paper', 'paper'): 60, ('field', 'field-field', 'field'): 0, ('field', 'field-paper', 'paper'): 64, ('paper', 'paper-author', 'author'): 0, ('paper', 'paper-field', 'field'): 0, ('paper', 'paper-paper', 'paper'): 64, ('paper', 'paper-venue', 'venue'): 0, ('venue', 'venue-paper', 'paper'): 64},
      metagraph=[('affiliation', 'author', 'affiliation-author'), ('author', 'affiliation', 'author-affiliation'), ('author', 'paper', 'author-paper'), ('paper', 'author', 'paper-author'), ('paper', 'field', 'paper-field'), ('paper', 'paper', 'paper-paper'), ('paper', 'venue', 'paper-venue'), ('field', 'field', 'field-field'), ('field', 'paper', 'field-paper'), ('venue', 'paper', 

In [19]:
import dgl
import dgl.function as fn
import numpy as np
import torch as th
import scipy.sparse as ssp
import array
import torch



class Budget(object):
    def __init__(self, hg, n_types, NS):
        self.n_types = {}
        for key, value in n_types.items():
            self.n_types[key] = th.zeros(value)
        self.NS = NS
        self.hg = hg
    def update(self, dst_type, idxs):
        for etype in self.hg.canonical_etypes:
            if dst_type == etype[2]:
                src_type = etype[0]
                #degree = self.hg.in_degrees(idx, etype=etype)
                for i in idxs:
                    src_idx = self.hg.predecessors(i, etype=etype)
                    #if src_idx.shape[0] > 0:
                    len = src_idx.shape[0]
                    if src_type in self.NS.keys():
                        src_idx = th.tensor([i for i in src_idx if i not in self.NS[src_type]])
                    if src_idx.shape[0] > 0:
                        self.n_types[src_type][src_idx] += 1 / len

    def pop(self, type, idx):
        self.n_types[type][idx] = 0


class HGTsampler(object):
    def __init__(self, hg, category, num_nodes_per_type, num_steps):
        self.n_types = {}
        for n in hg.ntypes:
            self.n_types[n] = hg.num_nodes(n)
        self.category = category
        self.num_nodes_per_type = num_nodes_per_type
        self.num_steps = num_steps
        self.hg = hg

    def sampler_subgraph(self, seed_nodes):
        OS = {self.category: th.stack(seed_nodes)}
        NS = OS
        B = Budget(self.hg, self.n_types, NS)
        for type, idxs in OS.items():
            B.update(type, idxs)
        for i in range(self.num_steps):
            prob = {}
            for src_type, p in B.n_types.items():
                #print(src_type)
                if p.max() > 0:
                    prob[src_type] = p / th.sum(p)
                    sampled_idx = th.multinomial(prob[src_type], self.num_nodes_per_type, replacement=False)
                    if not OS.__contains__(src_type):
                        OS[src_type] = sampled_idx
                    else:
                        OS[src_type] = th.cat((OS[src_type], sampled_idx))
                    B.update(src_type, sampled_idx)
                    B.pop(src_type, sampled_idx)
        sg = self.hg.subgraph(OS)
        return sg, OS




def HGT_preprocess4mag(hg, train_idx):
    hg = hg.to('cpu')
    edges = {etype: hg.edges(etype=etype) for etype in hg.canonical_etypes}
    edges.update({(v, e + '_inv', u): (dst, src) for (u, e, v), (src, dst) in edges.items()})
    hg2 = dgl.heterograph(edges)
    hg2 = dgl.to_simple(hg2)

    # Initialize year
    hg2.nodes['paper'].data['timestamp'] = hg.nodes['paper'].data['year'].squeeze()
    for ntype in hg.ntypes:
        if ntype != 'paper':
            hg2.nodes[ntype].data['timestamp'] = th.zeros(hg2.num_nodes(ntype), dtype=th.int64)

    # Aggregate bag-of-paper features
    hg2.nodes['paper'].data['feat'] = hg.nodes['paper'].data['feat']
    hg2.update_all(fn.copy_u('feat', 'm'), fn.mean('m', 'feat'), etype='has_topic')  # field_of_study
    hg2.update_all(fn.copy_u('feat', 'm'), fn.mean('m', 'feat'), etype='writes_inv')  # author
    hg2.update_all(fn.copy_u('feat', 'm'), fn.mean('m', 'feat'), etype='affiliated_with')  # institution

    # Attach log-degree to feature of each node type
    for ntype in hg2.ntypes:
        hg2.nodes[ntype].data['deg'] = th.zeros(hg2.num_nodes(ntype))
    for utype, etype, vtype in hg2.canonical_etypes:
        hg2.nodes[vtype].data['deg'] += hg2.in_degrees(etype=etype)
    for ntype in hg2.ntypes:
        hg2.nodes[ntype].data['feat'] = th.cat([
            hg2.nodes[ntype].data['feat'],
            th.log10(hg2.nodes[ntype].data['deg'][:, None])], 1)
        del hg2.nodes[ntype].data['deg']

    for ntype in hg2.ntypes:
        hg2.nodes[ntype].data['train_mask'] = torch.zeros(hg2.num_nodes(ntype), dtype=torch.bool)
        if ntype == 'paper':
            hg2.nodes[ntype].data['train_mask'][train_idx['paper']] = True

    # Convert to homogeneous graph and add self-loop
    g = dgl.to_homogeneous(hg2, ndata=['timestamp', 'feat'])
    g.edata['etype'] = g.edata[dgl.ETYPE]
    g.ndata['ntype'] = g.ndata[dgl.NTYPE]
    g.ndata['nid'] = g.ndata[dgl.NID]
    del g.edata[dgl.ETYPE]
    del g.edata[dgl.EID]
    del g.ndata[dgl.NTYPE]
    del g.ndata[dgl.NID]
    num_nodes = g.num_nodes()
    g = dgl.add_self_loop(g)
    g.edata['etype'][-num_nodes:] = len(hg2.etypes)

    return g

In [21]:
source_sampler = HGTsampler(g, "paper", 1, 4)
target_sampler = HGTsampler(g, "author", 1, 4)

In [22]:
dataloader = torch.utils.data.DataLoader(
    train_idx,
    batch_size=64,
    collate_fn=source_sampler.sampler_subgraph,
    # num_workers=self.args.num_workers,
    shuffle=True,
    drop_last=False,
)

In [24]:
a = next(iter(dataloader))

In [27]:
a[0]

Graph(num_nodes={'affiliation': 3, 'author': 4, 'field': 4, 'paper': 68, 'venue': 4},
      num_edges={('affiliation', 'affiliation-author', 'author'): 4, ('author', 'author-affiliation', 'affiliation'): 4, ('author', 'author-paper', 'paper'): 4, ('field', 'field-field', 'field'): 0, ('field', 'field-paper', 'paper'): 12, ('paper', 'paper-author', 'author'): 4, ('paper', 'paper-field', 'field'): 12, ('paper', 'paper-paper', 'paper'): 4, ('paper', 'paper-venue', 'venue'): 6, ('venue', 'venue-paper', 'paper'): 6},
      metagraph=[('affiliation', 'author', 'affiliation-author'), ('author', 'affiliation', 'author-affiliation'), ('author', 'paper', 'author-paper'), ('paper', 'author', 'paper-author'), ('paper', 'field', 'paper-field'), ('paper', 'paper', 'paper-paper'), ('paper', 'venue', 'paper-venue'), ('field', 'field', 'field-field'), ('field', 'paper', 'field-paper'), ('venue', 'paper', 'venue-paper')])

In [28]:
a[1]

{'paper': tensor([ 33651, 264064,  14129,  39863, 543680, 260598,   1306, 461042, 137527,
         294628, 374634, 439779, 517751, 286602, 418241,  67096, 245625, 411516,
         296457, 362784, 238363, 507455, 258262, 499626, 481229, 319118, 510740,
         230058, 402512, 372653,  83105, 398140, 538131, 137585, 295265, 198069,
         152263, 103080, 356339,  83636, 350655, 494942, 339255, 235273, 457007,
         287447,  33411, 263254, 379369,  84454, 343389, 278408, 343024, 406782,
         407633, 282809, 300775, 396011, 410717,  25376, 170043, 408125, 196552,
         126510, 259669, 264690, 350802, 170224]),
 'author': tensor([316872, 129736, 316873, 263227]),
 'field': tensor([11117, 28920, 26443,  6745]),
 'venue': tensor([6216, 4184, 3527, 4798]),
 'affiliation': tensor([ 903, 5136, 2587])}

In [30]:
sg=a[0]

In [37]:
# sampler = dgl.dataloading.MultiLayerFullNeighborSampler(4)
sampler = dgl.dataloading.NeighborSampler([4, 4, 4, 4])
dataloader = dgl.dataloading.DataLoader(
    g,
    {'paper':train_idx},
    sampler,
    batch_size=64,
    shuffle=True,
    drop_last=False,
    # num_workers=4,
)

In [38]:
dataloader

<dgl.dataloading.dataloader.DataLoader at 0x7f8f213e4bd0>

In [39]:
input_nodes, output_nodes, blocks = next(iter(dataloader))



In [40]:
for key, valud in input_nodes.items():
    print(key, valud.shape)

affiliation torch.Size([1456])
author torch.Size([27224])
field torch.Size([16016])
paper torch.Size([95813])
venue torch.Size([2276])


In [10]:
input_nodes

{'affiliation': tensor([  12, 1162,  401,  ..., 1976, 8583, 2821]),
 'author': tensor([ 39272, 166534, 166535,  ..., 471093, 471095, 471096]),
 'field': tensor([19352, 10896, 34017,  ..., 40603, 12309, 27933]),
 'paper': tensor([ 64303, 428584, 254819,  ..., 466368, 535494, 323348]),
 'venue': tensor([ 562, 5285, 3053,  ..., 1862, 3666,  360])}

In [11]:
g.ndata["feat"]

{'affiliation': tensor([[ 0.0029, -0.0018,  0.0022,  ..., -0.0012,  0.0005,  0.0403],
         [ 0.0063,  0.0010,  0.0018,  ..., -0.0010,  0.0011,  0.0296],
         [ 0.0037,  0.0021,  0.0017,  ..., -0.0016,  0.0004,  0.0286],
         ...,
         [ 0.0008, -0.0025,  0.0030,  ..., -0.0001, -0.0033,  0.0099],
         [ 0.0025, -0.0022,  0.0038,  ...,  0.0043, -0.0015,  0.0042],
         [ 0.0029, -0.0014, -0.0007,  ...,  0.0044,  0.0012,  0.0136]]),
 'author': tensor([[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  8.7267e-05,
          -1.3538e-04,  5.7543e-03],
         [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ..., -1.3289e-03,
           9.0451e-04,  9.6308e-03],
         [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ..., -1.7089e-04,
           3.0572e-03,  4.0321e-03],
         ...,
         [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
           0.0000e+00, -1.0000e+00],
         [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
           0.0000e+00, -1.0000e

In [79]:
labels = g.ndata["L1"]["author"]

In [80]:
lbl_dim = dims["L1"]

In [44]:
processed_labels = {}
for task_type in ["L1"]:
    processed_labels[task_type] = {}
    for node_type in ["author"]:
        labels = g.ndata[task_type][node_type]
        if labels.shape[1] != dims[task_type]:
            processed_labels[task_type][node_type] = torch.zeros(
                (labels.shape[0], dims[task_type])
            )
            num_indices = torch.count_nonzero(labels + 1, dim=1)
            for i in range(labels.shape[0]):
                indices = labels[i, : num_indices[i]].to(torch.int)
                processed_labels[task_type][node_type][i, indices] = 1 / num_indices[
                    i
                ].clamp(min=1)
        else:
            processed_labels[task_type][node_type] = labels

RuntimeError: shape mismatch: value tensor of shape [510189, 275] cannot be broadcast to indexing result of shape [668109]

In [66]:
def process_category(labels: torch.tensor, num_classes: int) -> torch.tensor:
    if labels.shape[1] != num_classes:
        processed_labels = torch.zeros((labels.shape[0], num_classes))
        num_indices = torch.count_nonzero(labels + 1, dim=1)
        for i in range(labels.shape[0]):
            indices = labels[i, : num_indices[i]].to(torch.int)
            processed_labels[i, indices] = 1 / num_indices[i].clamp(min=1)
    else:
        processed_labels = labels
    return processed_labels

In [43]:
lb = g.ndata["L1"]["author"]

In [49]:
num_indices

In [67]:
a = process_category(lb[:50], dims["L1"].item())

In [68]:
a

tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [1.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]])

In [34]:
lb[42]

tensor([19., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
        -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
        -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
        -1., -1., -1., -1., -1., -1., -1., -1.])

In [35]:
a[42][19]

tensor(1.)

In [3]:
g.adj(etype='paper-author')


SparseMatrix(indices=tensor([[387067, 134013, 134013,  ..., 102817, 304115, 529407],
                             [     1,      1,      2,  ..., 510171, 392963, 510188]]),
             values=tensor([1., 1., 1.,  ..., 1., 1., 1.]),
             shape=(544244, 510189), nnz=1091559)

In [6]:
import dgl.sparse