In [18]:
import os
import torch
from torch_geometric.data import HeteroData


filename = 'HeteroData_Learnings_v1.pt'
if os.path.exists('./'+filename):
    data = HeteroData.from_dict(torch.load('./'+filename))
    print('loading saved heterodata object')

loading saved heterodata object


In [19]:
# get size of the data on disk in gb
import os
size = os.path.getsize('./'+filename)
print('size of file on disk: ', size/1e9, 'gb')

size of file on disk:  1.790098703 gb


In [20]:
# sampler for Heterogeneous Graph Transformer

In [21]:
# for each node type, add a new edge type only consisting of self loops
# this is done to allow HGT to attend to the previous node representations
# for node_type in data.node_types:
#     data[node_type, 'self_loop', node_type] = torch.cat((torch.arange(data[node_type].num_nodes),torch.arange(data[node_type].num_nodes)), dim=0)

In [22]:
# split

In [23]:
data

HeteroData(
  courses_and_programs={
    TITLE=[55796],
    DESCRIPTION=[55796],
    x=[55796, 814],
  },
  qualifications={
    TITLE=[1242],
    DESCRIPTION=[1242],
    x=[1242, 785],
  },
  skills={
    TITLE=[138698],
    x=[138698, 772],
  },
  people={ x=[293444, 24] },
  jobs={
    TITLE=[55638],
    x=[55638, 773],
  },
  organizations={
    num_nodes=13613,
    x=[13613, 2],
  },
  (skills, qualification_skill, qualifications)={ edge_index=[2, 1596] },
  (skills, course_and_program_skill, courses_and_programs)={ edge_index=[2, 258099] },
  (courses_and_programs, course_qualification, qualifications)={ edge_index=[2, 2099] },
  (courses_and_programs, course_and_programs_student, people)={
    edge_index=[2, 553454],
    edge_attr=[553454, 3],
  },
  (jobs, job_student, people)={ edge_index=[2, 293444] },
  (people, supervisor_supervisee, people)={ edge_index=[2, 217922] },
  (people, organization_student, organizations)={ edge_index=[2, 292060] },
  (jobs, job_job, jobs)={
    

In [24]:
from torch_geometric import seed_everything
import torch_geometric.transforms as T


edge_types = []
rev_edge_types = []
for edge_type in data.edge_types:
    if edge_type[1].startswith('rev_'):
        rev_edge_types.append(edge_type)
    else:
        edge_types.append(edge_type)

transform = T.RandomLinkSplit(
    is_undirected=True,
    edge_types=edge_types,
    rev_edge_types=rev_edge_types,
    num_val=0.02,
    num_test=0.02,
    add_negative_train_samples=False, # only adds neg samples for val and test, neg train are added by LinkNeighborLoader. This means for each train batch, negs. are different, for val and train they stay the same
    neg_sampling_ratio=1.0,
    disjoint_train_ratio=0, #  training edges are shared for message passing and supervision
    )

seed_everything(14)
train_data, val_data, test_data = transform(data)



# train_data = add_self_loops(train_data)
# val_data = add_self_loops(val_data)
# test_data = add_self_loops(test_data)

# train_data
    

In [26]:
from torch_geometric.loader import LinkNeighborLoader
from torch_geometric.loader import HGTLoader
from torch_geometric.sampler import NegativeSampling

num_neighbors_linkloader = {}
for edge_type in train_data.edge_types:
    # if edge_type[1] == 'self_loop':
    #     num_neighbors[edge_type] = [0,0]
    num_neighbors[edge_type] = [0,0]
    # num_neighbors['qualifications', 'self_loops', 'qualifications'] = [1,0]
    # num_neighbors['qualifications', 'rev_qualification_skill', 'skills'] = [10,0]

negative_sampling = NegativeSampling(
        mode='binary',
        amount=10  # ratio, like Graphsage
        #weight=  # "Probabilities" of nodes to be sampled: Node degree follows power law distribution
        )
num_workers = 0
# delete edge_attr of every edge type
for edge_type in train_data.edge_types:
    del train_data[edge_type].edge_attr 

# delete all keys for every node type except 'x' (e.g. description and title)
for node_type in train_data.node_types:
    keys = list(train_data[node_type].keys())
    for key in keys:
        if key != 'x':
            del train_data[node_type][key]

linkNeighborLoader = LinkNeighborLoader(
        train_data,
        num_neighbors=num_neighbors_linkloader,
        edge_label_index=(('skills', 'qualification_skill', 'qualifications'), train_data['skills', 'qualification_skill', 'qualifications'].edge_label_index), # if (edge, None), None means all edges are considered
     
        neg_sampling=negative_sampling, # adds negative samples
        batch_size=64,
        shuffle=True,
        #drop_last=True,
        num_workers=num_workers,
        directed=True,  # contains only edges which are followed, False: contains full node induced subgraph
        #disjoint=True # sampled seed node creates its own, disjoint from the rest, subgraph, will add "batch vector" to loader output
        pin_memory=True, # faster data transfer to gpu
        #num_workers=2,
        #prefetch_factor=2
)

def get_hgt(data, )
    hgtLoader = HGTLoader(
        hetero_data,
        # Sample 512 nodes per type and per iteration for 4 iterations
        num_samples={key: [512] * 4 for key in hetero_data.node_types},
        # Use a batch size of 128 for sampling training nodes of type paper
        batch_size=128,
        input_nodes=('paper', hetero_data['paper'].train_mask),
    )


def add_self_loops(data):
    for node_type in data.node_types:
        data[node_type, 'self_loop', node_type].edge_index = torch.arange(data[node_type].num_nodes).repeat(2,1)
    return data 
# add a yield wrapper
def get_loader_with_selfloops(loader):
    for batch in loader:
        yield add_self_loops(batch)

In [27]:
#loader_with_selfloops = get_loader_with_selfloops(loader)

In [37]:
from torch_geometric.loader import HGTLoader
from torch_geometric.datasets import OGB_MAG

hetero_data = OGB_MAG(path)[0]

loader = HGTLoader(
    hetero_data,
    # Sample 512 nodes per type and per iteration for 4 iterations
    num_samples={key: [512] * 4 for key in hetero_data.node_types},
    # Use a batch size of 128 for sampling training nodes of type paper
    batch_size=128,
    input_nodes=('paper', hetero_data['paper'].train_mask),
)

sampled_hetero_data = next(iter(loader))
print(sampled_data.batch_size)

NameError: name 'path' is not defined

In [10]:
from torch_geometric.loader import HGTLoader
from torch_geometric.datasets import OGB_MAG

hetero_data = OGB_MAG('tests/')[0]

loader = HGTLoader(
    hetero_data,
    # Sample 512 nodes per type and per iteration for 4 iterations
    num_samples={key: [512] * 4 for key in hetero_data.node_types},
    # Use a batch size of 128 for sampling training nodes of type paper
    batch_size=736389,
    input_nodes=('paper', None),
)

sampled_hetero_data = next(iter(loader))
#print(sampled_hetero_data.batch_size)

In [None]:
631619

In [12]:
next(iter(loader))

HeteroData(
  paper={
    x=[736389, 128],
    year=[736389],
    y=[736389],
    train_mask=[736389],
    val_mask=[736389],
    test_mask=[736389],
    n_id=[736389],
    input_id=[736389],
    batch_size=736389,
  },
  author={
    num_nodes=2048,
    n_id=[2048],
  },
  institution={
    num_nodes=0,
    n_id=[0],
  },
  field_of_study={
    num_nodes=0,
    n_id=[0],
  },
  (author, affiliated_with, institution)={
    edge_index=[2, 0],
    e_id=[0],
  },
  (author, writes, paper)={
    edge_index=[2, 94946],
    e_id=[94946],
  },
  (paper, cites, paper)={
    edge_index=[2, 4710815],
    e_id=[4710815],
  },
  (paper, has_topic, field_of_study)={
    edge_index=[2, 0],
    e_id=[0],
  }
)