In [39]:
# Sampling
# 1. Sample using HGT Sampler as outlined in the paper, using pyg implementations
# 2. The sampling is adapted to link prediction, by first sampling random supervision edges of which the nodes create the supervision nodes
# a. Dataset is divided across multiple dimensions:
#   a.1. Split into Train, Val, Test split (96, 2, 2)
#   a.2. Training only: Edges are split into those which are used solely for message passing and those solely used for supervision (80, 20). 
#        Because an expressive model (HGT) is used, this prevents the model from memorizing supervision edges by their appearance as message passing edges
#   a.3. This means Training consists of 96%*80% Message passing Edges, 96%*20% supervision edges, Val contains 2% Supervision Edges, Test contains 2% supervison Edges
#   a.4. Validation and Test edges use the Training Message passing Edges as well.
# b. For mini-batch sampling in the training phase, first x random edges are sampled as supervision edges. 
#    For the nodes of these supervision edges, we apply batch-wise HGT Sampling. Due to implementation limitations, for each supervision entity type, the hgt sampling is separate. 
#    This limitation does not apply for sampled neighbor entity types
# during sampling, also the reverse edge of the supervision edge is removed to avoid data leakage


# HGT Sampler (See Paper for further reference)
# The probablity of a neighbor node s to be sampled depends on the normalized degree of all its edge types connecting it to all source nodes
# If neighbor node s is connected to a and b by edge type r, and a has 2 neighbors through edge type r and b has 1 neighbor (node s) through edge type r, 
# then the sampling probablity of s is (1/2+1)**2 / 2**2, if it were connected through other edge types to the nodes as well, those degrees would be added to the numerator and denominator.
# Nodes are sampled without replacement.
# This sampling strategy creates more dense mini-batches, because neighbor nodes which are connected to multiple source nodes and by multiple relationship types are sampled more frequently.
# Therefore, training is sped up since less node representations have to be computed. Furthermore, as stated in the paper, the sampling method allows to sample a 
# similar number of neighbors for each edge type, because high-count edge types and low-count edge types are weighted equally. For each neighbor node type T, a fixed number n of nodes is sampled.




In [40]:
import os
import torch
from torch_geometric.data import HeteroData


filename = 'HeteroData_Learnings_v1.pt'
if os.path.exists('./'+filename):
    data = HeteroData.from_dict(torch.load('./'+filename))
    print('loading saved heterodata object')

loading saved heterodata object


In [41]:
# get size of the data on disk in gb
import os
size = os.path.getsize('./'+filename)
print('size of file on disk: ', size/1e9, 'gb')

size of file on disk:  1.790098703 gb


In [42]:
# sampler for Heterogeneous Graph Transformer

In [43]:
# for each node type, add a new edge type only consisting of self loops
# this is done to allow HGT to attend to the previous node representations
# for node_type in data.node_types:
#     data[node_type, 'self_loop', node_type] = torch.cat((torch.arange(data[node_type].num_nodes),torch.arange(data[node_type].num_nodes)), dim=0)

In [44]:
# split

In [45]:
from torch_geometric import seed_everything
import torch_geometric.transforms as T
from torch_geometric.utils import sort_edge_index

edge_types = []
rev_edge_types = []
for edge_type in data.edge_types:
    if edge_type[1].startswith('rev_'):
        rev_edge_types.append(edge_type)
    else:
        edge_types.append(edge_type)

transform = T.RandomLinkSplit(
    is_undirected=True,
    edge_types=edge_types,
    rev_edge_types=rev_edge_types,
    num_val=0.02,
    num_test=0.02,
    add_negative_train_samples=False, # only adds neg samples for val and test, neg train are added by LinkNeighborLoader. This means for each train batch, negs. are different, for val and train they stay the same
    neg_sampling_ratio=1.0,
    disjoint_train_ratio=0.3, #  training edges are shared for message passing and supervision
    )

seed_everything(14)
# sort by col to speed up sampling later (we can sepcify is_sorted=True in link neighbor loader)
def sort_edges(data):
    for edge_type in data.edge_types:
        if 'edge_attr' in data[edge_type].keys():
            data[edge_type].edge_index, data[edge_type].edge_attr = sort_edge_index(data[edge_type].edge_index, data[edge_type].edge_attr, sort_by_row=False) 
        else:
            data[edge_type].edge_index = sort_edge_index(data[edge_type].edge_index, sort_by_row=False) 
    return data
        
train_data, val_data, test_data = transform(data)
train_data = sort_edges(train_data)
val_data = sort_edges(val_data)
test_data = sort_edges(test_data)


# train_data = add_self_loops(train_data)
# val_data = add_self_loops(val_data)
# test_data = add_self_loops(test_data)

# train_data
    

In [143]:
from torch_geometric.loader import LinkNeighborLoader
from torch_geometric.loader import HGTLoader
from torch_geometric.sampler import NegativeSampling


    # num_neighbors['qualifications', 'self_loops', 'qualifications'] = [1,0]
    # num_neighbors['qualifications', 'rev_qualification_skill', 'skills'] = [10,0]


num_workers = 0
# delete edge_attr of every edge type
for edge_type in train_data.edge_types:
    del train_data[edge_type].edge_attr 

# delete all keys for every node type except 'x' (e.g. description and title)
for node_type in train_data.node_types:
    keys = list(train_data[node_type].keys())
    for key in keys:
        if key != 'x':
            del train_data[node_type][key]


def get_hgt_linkloader(data, target_edge, batch_size, is_training):
    
    num_neighbors_linkloader = {}
    for edge_type in data.edge_types:
        num_neighbors_linkloader[edge_type] = [0,0]
    
    negative_sampling = NegativeSampling(
        mode='binary',
        amount=10  # ratio, like Graphsage
        #weight=  # "Probabilities" of nodes to be sampled: Node degree follows power law distribution
        )
    
    linkNeighborLoader = LinkNeighborLoader(
            data,
            num_neighbors=num_neighbors_linkloader,
            edge_label_index=(target_edge, data[target_edge].edge_label_index), # if (edge, None), None means all edges are considered
        
            neg_sampling=negative_sampling, # adds negative samples
            batch_size=batch_size,
            shuffle=is_training, #is_training
            #drop_last=True,
            num_workers=num_workers,
            directed=True,  # contains only edges which are followed, False: contains full node induced subgraph
            #disjoint=True # sampled seed node creates its own, disjoint from the rest, subgraph, will add "batch vector" to loader output
            pin_memory=True, # faster data transfer to gpu
            #num_workers=2,
            #prefetch_factor=2
            is_sorted = True
    )
   
    #num_neighbors_hgtloader = {}
    #for node_type in data.node_types:
    #    num_neighbors_hgtloader[node_type] = [5,5]
    num_neighbors_hgtloader = [batch_size,batch_size]
    # sample same amount of neighbors of each edge type
    def get_hgt(data, input_nodetype, input_mask):
        return next(iter(HGTLoader(
                data,
                # Sample 512 nodes per type and per iteration for 4 iterations
                num_samples=num_neighbors_hgtloader,
                batch_size=batch_size,
                input_nodes=(input_nodetype, input_mask),
            )))
        
    def remove_edge_label_and_index(data):
        for edge_type in data.edge_types:
            del data[edge_type].edge_label            
            del data[edge_type].edge_label_index
            
        return data
            
    def get_hgt_with_selfloops(loader):
        for batch in loader:
            print('aa', data[target_edge].edge_index)
            
            
            
            original_indices = batch[target_edge[0]].n_id[batch[target_edge].edge_label_index.flatten()]
            original_indices = torch.LongTensor(list(set(original_indices)))
            
            print(list(sorted(list(set(original_indices))))[:10])
            hgt_batch = get_hgt(data, target_edge[0], original_indices)
            hgt_batch = remove_edge_label_and_index(hgt_batch)
            # remove the supervision edges from edge_index and add them to edge_label_index, edge_label respectively
            keep_mask = ~ hgt_batch[target_edge].e_id.isin(batch[target_edge].e_id)
            if 'edge_attr' in hgt_batch[target_edge].keys():
                
            
            yield add_self_loops(hgt_batch)

    def get_hgt_2types_with_selfloops(loader):
        for batch in loader:
            batch = remove_edge_label_and_index(batch, target_edge)
            print('del', batch)
            original_indices1 = batch[target_edge[0]].n_id[batch[target_edge].edge_label_index[0,:]]
            original_indices2 = batch[target_edge[2]].n_id[batch[target_edge].edge_label_index[1,:]]
            
            hgt_batch1 = get_hgt(data, target_edge[0], original_indices1)
            hgt_batch2 = get_hgt(data, target_edge[0], original_indices2)
            yield (hgt_batch1, hgt_batch2)
            # merge batches
            #for edge_type in hgt_batch2.edge_types:
            #    hgt_batch1[edge_type] = hgt_batch1[edge_type] + hgt_batch2[edge_type]
            #yield add_self_loops(hgt_batch)
    

    
        
    if target_edge[0] == target_edge[2]:
        # same edge type, only need to sample once
        return get_hgt_with_selfloops(linkNeighborLoader)
    else:
        return get_hgt_2types_with_selfloops(linkNeighborLoader)
    



def add_self_loops(data):
    for node_type in data.node_types:
        data[node_type, 'self_loop', node_type].edge_index = torch.arange(data[node_type].num_nodes).repeat(2,1)
    return data 
# add a yield wrapper


In [55]:
negative_sampling = NegativeSampling(
        mode='binary',
        amount=10  # ratio, like Graphsage
        #weight=  # "Probabilities" of nodes to be sampled: Node degree follows power law distribution
        )
    
num_neighbors_linkloader = {}
for edge_type in train_data.edge_types:
        num_neighbors_linkloader[edge_type] = [4,2]
linkNeighborLoader = LinkNeighborLoader(
            train_data,
            num_neighbors=num_neighbors_linkloader,
            edge_label_index=(('jobs', 'job_job', 'jobs'), train_data[('jobs', 'job_job', 'jobs')].edge_label_index), # if (edge, None), None means all edges are considered
        
            neg_sampling=negative_sampling, # adds negative samples
            batch_size=64,
            shuffle=False,
            #drop_last=True,
            num_workers=0,
            directed=True,  # contains only edges which are followed, False: contains full node induced subgraph
            #disjoint=True # sampled seed node creates its own, disjoint from the rest, subgraph, will add "batch vector" to loader output
            pin_memory=True, # faster data transfer to gpu
            #num_workers=2,
            #prefetch_factor=2
    )

a = next(iter(linkNeighborLoader))

In [48]:
e =  ('courses_and_programs', 'course_and_programs_student', 'people')
a[e].edge_index[1].max(), a[e].edge_label_index[1].max()

(tensor(345), tensor(293417))

In [49]:
train_data[('skills', 'course_and_program_skill', 'courses_and_programs')]

{'edge_index': tensor([[ 20803, 112003,  38177,  ...,  64334, 121204, 136775],
        [     0,      0,      1,  ...,  55795,  55795,  55795]]), 'edge_label': tensor([1., 1., 1.,  ..., 1., 1., 1.]), 'edge_label_index': tensor([[104219,  74588,  46882,  ..., 134506,  15554, 133986],
        [ 21173,  39198,  34381,  ...,  21283,   1899,   4335]])}

In [145]:
loader1 = get_hgt_linkloader(train_data, ('jobs', 'job_job', 'jobs'), 128, is_training=True) 

In [146]:
a = next(iter(loader1))

aa tensor([[54625, 54632, 54633,  ..., 55603, 55605, 55606],
        [54624, 54624, 54624,  ..., 55617, 55617, 55617]])
[tensor(15), tensor(20), tensor(45), tensor(74), tensor(213), tensor(226), tensor(227), tensor(264), tensor(272), tensor(283)]


In [147]:
a

HeteroData(
  courses_and_programs={
    x=[128, 814],
    n_id=[128],
  },
  qualifications={
    x=[8, 785],
    n_id=[8],
  },
  skills={
    x=[256, 772],
    n_id=[256],
  },
  people={
    x=[243, 24],
    n_id=[243],
  },
  jobs={
    x=[384, 773],
    n_id=[384],
    input_id=[128],
    batch_size=128,
  },
  organizations={
    x=[33, 2],
    n_id=[33],
  },
  (skills, qualification_skill, qualifications)={
    edge_index=[2, 14],
    e_id=[14],
  },
  (skills, course_and_program_skill, courses_and_programs)={
    edge_index=[2, 129],
    e_id=[129],
  },
  (courses_and_programs, course_qualification, qualifications)={
    edge_index=[2, 0],
    e_id=[0],
  },
  (courses_and_programs, course_and_programs_student, people)={
    edge_index=[2, 108],
    e_id=[108],
  },
  (jobs, job_student, people)={
    edge_index=[2, 159],
    e_id=[159],
  },
  (people, supervisor_supervisee, people)={
    edge_index=[2, 115],
    e_id=[115],
  },
  (people, organization_student, organizatio

In [132]:
a

HeteroData(
  courses_and_programs={
    x=[128, 814],
    n_id=[128],
  },
  qualifications={
    x=[6, 785],
    n_id=[6],
  },
  skills={
    x=[256, 772],
    n_id=[256],
  },
  people={
    x=[130, 24],
    n_id=[130],
  },
  jobs={
    x=[384, 773],
    n_id=[384],
    input_id=[128],
    batch_size=128,
  },
  organizations={
    x=[15, 2],
    n_id=[15],
  },
  (skills, qualification_skill, qualifications)={
    edge_index=[2, 6],
    edge_label=[460],
    edge_label_index=[2, 460],
    e_id=[6],
  },
  (skills, course_and_program_skill, courses_and_programs)={
    edge_index=[2, 188],
    edge_label=[74333],
    edge_label_index=[2, 74333],
    e_id=[188],
  },
  (courses_and_programs, course_qualification, qualifications)={
    edge_index=[2, 0],
    edge_label=[605],
    edge_label_index=[2, 605],
    e_id=[0],
  },
  (courses_and_programs, course_and_programs_student, people)={
    edge_index=[2, 49],
    edge_label=[159394],
    edge_label_index=[2, 159394],
    e_id=[49],

In [117]:
loader = get_hgt_linkloader(train_data, ('skills', 'course_and_program_skill', 'courses_and_programs'), 512, True) 

In [118]:
u,i = next(iter(loader))

del HeteroData(
  courses_and_programs={
    x=[5353, 814],
    n_id=[5353],
    num_sampled_nodes=[3],
  },
  qualifications={
    x=[0, 785],
    n_id=[0],
    num_sampled_nodes=[3],
  },
  skills={
    x=[5469, 772],
    n_id=[5469],
    num_sampled_nodes=[3],
  },
  people={
    x=[0, 24],
    n_id=[0],
    num_sampled_nodes=[3],
  },
  jobs={
    x=[0, 773],
    n_id=[0],
    num_sampled_nodes=[3],
  },
  organizations={
    x=[0, 2],
    n_id=[0],
    num_sampled_nodes=[3],
  },
  (skills, qualification_skill, qualifications)={
    edge_index=[2, 0],
    e_id=[0],
    num_sampled_edges=[2],
  },
  (skills, course_and_program_skill, courses_and_programs)={
    edge_index=[2, 0],
    edge_label=[5632],
    edge_label_index=[2, 5632],
    e_id=[0],
    num_sampled_edges=[2],
    input_id=[512],
  },
  (courses_and_programs, course_qualification, qualifications)={
    edge_index=[2, 0],
    e_id=[0],
    num_sampled_edges=[2],
  },
  (courses_and_programs, course_and_programs_student

In [None]:
u['qualifications']

In [None]:
i['qualifications']

In [None]:
def merge(data1, data2):
    # merge hetero data objects, based on real 

In [122]:
u

HeteroData(
  courses_and_programs={
    x=[1024, 814],
    n_id=[1024],
  },
  qualifications={
    x=[38, 785],
    n_id=[38],
  },
  skills={
    x=[1024, 772],
    n_id=[1024],
    input_id=[512],
    batch_size=512,
  },
  people={
    x=[512, 24],
    n_id=[512],
  },
  jobs={
    x=[1024, 773],
    n_id=[1024],
  },
  organizations={
    x=[0, 2],
    n_id=[0],
  },
  (skills, qualification_skill, qualifications)={
    edge_index=[2, 116],
    edge_label=[460],
    edge_label_index=[2, 460],
    e_id=[116],
  },
  (skills, course_and_program_skill, courses_and_programs)={
    edge_index=[2, 5001],
    edge_label=[74333],
    edge_label_index=[2, 74333],
    e_id=[5001],
  },
  (courses_and_programs, course_qualification, qualifications)={
    edge_index=[2, 31],
    edge_label=[605],
    edge_label_index=[2, 605],
    e_id=[31],
  },
  (courses_and_programs, course_and_programs_student, people)={
    edge_index=[2, 293],
    edge_label=[159394],
    edge_label_index=[2, 159394],

In [121]:
i

HeteroData(
  courses_and_programs={
    x=[543, 814],
    n_id=[543],
  },
  qualifications={
    x=[4, 785],
    n_id=[4],
  },
  skills={
    x=[1024, 772],
    n_id=[1024],
    input_id=[512],
    batch_size=512,
  },
  people={
    x=[512, 24],
    n_id=[512],
  },
  jobs={
    x=[1024, 773],
    n_id=[1024],
  },
  organizations={
    x=[0, 2],
    n_id=[0],
  },
  (skills, qualification_skill, qualifications)={
    edge_index=[2, 16],
    edge_label=[460],
    edge_label_index=[2, 460],
    e_id=[16],
  },
  (skills, course_and_program_skill, courses_and_programs)={
    edge_index=[2, 3119],
    edge_label=[74333],
    edge_label_index=[2, 74333],
    e_id=[3119],
  },
  (courses_and_programs, course_qualification, qualifications)={
    edge_index=[2, 3],
    edge_label=[605],
    edge_label_index=[2, 605],
    e_id=[3],
  },
  (courses_and_programs, course_and_programs_student, people)={
    edge_index=[2, 320],
    edge_label=[159394],
    edge_label_index=[2, 159394],
    e_i

In [None]:
input_nodetype = ('jobs', 'job_job', 'jobs')
seed_everything(14)
next(iter(HGTLoader(
                train_data,
                # Sample 512 nodes per type and per iteration for 4 iterations
                num_samples=[10,10],
                batch_size=64,
                input_nodes=('jobs', None),
)))