In [1]:
# Sampling
# 1. Sample using HGT Sampler as outlined in the paper, using pyg implementations
# 2. The sampling is adapted to link prediction, by first sampling random supervision edges of which the nodes create the supervision nodes
# a. Dataset is divided across multiple dimensions:
#   a.1. Split into Train, Val, Test split (96, 2, 2)
#   a.2. Training only: Edges are split into those which are used solely for message passing and those solely used for supervision (80, 20). 
#        Because an expressive model (HGT) is used, this prevents the model from memorizing supervision edges by their appearance as message passing edges
#   a.3. This means Training consists of 96%*80% Message passing Edges, 96%*20% supervision edges, Val contains 2% Supervision Edges, Test contains 2% supervision Edges
#   a.4. Validation and Test edges use the Training Message passing Edges as well.
# b. For mini-batch sampling in the training phase, first x random edges are sampled as supervision edges. 
#    For the nodes of these supervision edges, we apply batch-wise HGT Sampling. Due to implementation limitations, for each supervision entity type, the hgt sampling is separate. 
#    This limitation does not apply for sampled neighbor entity types
# during sampling, also the reverse edge of the supervision edge is removed to avoid data leakage


# HGT Sampler (See Paper for further reference)
# The probability of a neighbor node s to be sampled depends on the normalized degree of all its edge types connecting it to all source nodes
# If neighbor node s is connected to a and b by edge type r, and a has 2 neighbors through edge type r and b has 1 neighbor (node s) through edge type r, 
# then the sampling probability of s is (1/2+1)**2 / 2**2, if it were connected through other edge types to the nodes as well, those degrees would be added to the numerator and denominator.
# Nodes are sampled without replacement.
# This sampling strategy creates more dense mini-batches, because neighbor nodes which are connected to multiple source nodes and by multiple relationship types are sampled more frequently.
# Therefore, training is sped up since fewer node representations have to be computed. Furthermore, as stated in the paper, the sampling method allows sampling a 
# similar number of neighbors for each edge type, because high-count edge types and low-count edge types are weighted equally. For each neighbor node type T, a fixed number n of nodes is sampled.




In [2]:
import os
import torch
from torch_geometric.data import HeteroData


filename = 'HeteroData_Learnings_v1.pt'
# Fail fast with a clear message: every later cell needs `data`, and silently skipping
# the load would only surface as an unrelated NameError further down the notebook.
if not os.path.exists('./'+filename):
    raise FileNotFoundError('./'+filename+' not found; the saved HeteroData object is required by the cells below')
# NOTE: torch.load unpickles the file, so only load files that come from a trusted source.
data = HeteroData.from_dict(torch.load('./'+filename))
print('loading saved heterodata object')

  from .autonotebook import tqdm as notebook_tqdm


loading saved heterodata object


In [3]:
# Report how much space the serialized graph takes on disk (decimal gigabytes).
import os
size = os.path.getsize('./' + filename)
size_gb = size / 1e9
print('size of file on disk: ', size_gb, 'gb')

size of file on disk:  2.27811871 gb


In [4]:
# sampler for Heterogeneous Graph Transformer

In [5]:
# for each node type, add a new edge type only consisting of self loops
# this is done to allow HGT to attend to the previous node representations
# for node_type in data.node_types:
#     data[node_type, 'self_loop', node_type] = torch.cat((torch.arange(data[node_type].num_nodes),torch.arange(data[node_type].num_nodes)), dim=0)

In [6]:
# split

In [7]:
from torch_geometric import seed_everything
import torch_geometric.transforms as T
from torch_geometric.utils import sort_edge_index

def _pair_edge_types_with_reverse(all_edge_types):
    """Split edge types into forward types and their index-aligned reverse types.

    A reverse type is recognised by the naming convention used in this notebook:
    (dst, 'rev_' + relation, src) is the reverse of (src, relation, dst).
    RandomLinkSplit pairs edge_types[i] with rev_edge_types[i], so the reverse list
    is built by lookup (None when a forward type has no reverse) instead of relying
    on the iteration order of the edge types.
    """
    known = set(all_edge_types)
    forward, reverse = [], []
    for src_type, relation, dst_type in all_edge_types:
        if relation.startswith('rev_'):
            continue
        candidate = (dst_type, 'rev_' + relation, src_type)
        forward.append((src_type, relation, dst_type))
        reverse.append(candidate if candidate in known else None)
    return forward, reverse


edge_types, rev_edge_types = _pair_edge_types_with_reverse(data.edge_types)

# Link-level split of the edges listed in `edge_types`: 2% validation, 2% test, the remaining 96% training.
transform = T.RandomLinkSplit(
    is_undirected=True,
    edge_types=edge_types,
    rev_edge_types=rev_edge_types,
    num_val=0.02,
    num_test=0.02,
    add_negative_train_samples=False, # only adds neg samples for val and test; train negatives are added per batch by LinkNeighborLoader. So train negatives differ between batches, while val and test negatives stay the same
    neg_sampling_ratio=1.0,
    # A non-zero ratio means training edges are NOT shared between message passing and supervision:
    # this fraction of the training edges is held out as supervision-only edges.
    # NOTE(review): the notebook header describes an 80/20 split, the value here is 0.3 - confirm which is intended.
    disjoint_train_ratio=0.3,
    )

seed_everything(14)
# sort by col to speed up sampling later (we can sepcify is_sorted=True in link neighbor loader)
def sort_edges(data):
    """Sort the edges of every edge type by destination (column) index, in place.

    Edge attributes, where an edge type has them, are permuted together with the
    edge index. The same `data` object is returned for convenience.
    """
    for etype in data.edge_types:
        store = data[etype]
        if 'edge_attr' not in store.keys():
            store.edge_index = sort_edge_index(store.edge_index, sort_by_row=False)
            continue
        # Sorting index and attributes in one call keeps them aligned.
        sorted_index, sorted_attr = sort_edge_index(store.edge_index, store.edge_attr, sort_by_row=False)
        store.edge_index = sorted_index
        store.edge_attr = sorted_attr
    return data
        
# Apply the link split, then column-sort the edges of each resulting split.
train_data, val_data, test_data = (sort_edges(split) for split in transform(data))


# train_data = add_self_loops(train_data)
# val_data = add_self_loops(val_data)
# test_data = add_self_loops(test_data)

# train_data
    

In [26]:


from torch_geometric.loader import LinkNeighborLoader
from torch_geometric.loader import HGTLoader
from torch_geometric.sampler import NegativeSampling
from copy import deepcopy

    # num_neighbors['qualifications', 'self_loops', 'qualifications'] = [1,0]
    # num_neighbors['qualifications', 'rev_qualification_skill', 'skills'] = [10,0]


num_workers = 0

# Drop the edge attributes of every edge type in the training split.
for etype in train_data.edge_types:
    del train_data[etype].edge_attr

# Keep only the feature matrix 'x' on every node type (removes e.g. description and title).
for ntype in train_data.node_types:
    node_store = train_data[ntype]
    for extra_key in [k for k in node_store.keys() if k != 'x']:
        del node_store[extra_key]


def get_hgt_linkloader(data, target_edge, batch_size, is_training, sampling_mode, neg_ratio, num_neighbors_hgtloader):
    """Return a generator of HGT-sampled mini-batches for link prediction on ``target_edge``.

    Supervision edges (plus negatives) are drawn by a ``LinkNeighborLoader`` that samples
    no neighbours itself (``num_neighbors=[0]``). The end nodes of each supervision batch
    are then used as seed nodes of an ``HGTLoader``, and the supervision edges are
    re-indexed to the node positions of the HGT batch.

    Args:
        data: heterogeneous graph whose ``target_edge`` store has ``edge_label_index``.
            In ``'triplet'`` mode ``data[target_edge].edge_label`` is set to ``None`` in place.
        target_edge: ``(src_type, relation, dst_type)`` of the supervision edges.
        batch_size: number of supervision edges per batch (passed to ``LinkNeighborLoader``).
        is_training: passed as ``shuffle`` to ``LinkNeighborLoader``.
        sampling_mode: ``'binary'`` or ``'triplet'`` negative sampling.
        neg_ratio: ``amount`` of the ``NegativeSampling`` configuration.
        num_neighbors_hgtloader: passed as ``num_samples`` to ``HGTLoader``.

    Returns:
        A one-shot generator (it can be iterated only once). If source and destination
        node type are equal it yields
        ``(hgt_batch, edge_label_index, edge_label, input_id)``; otherwise it yields
        ``(hgt_batch_src, hgt_batch_dst, edge_label_index, edge_label, input_id)``, where
        row 0 of ``edge_label_index`` refers to ``hgt_batch_src[src_type]`` and row 1 to
        ``hgt_batch_dst[dst_type]``. In ``'triplet'`` mode the positive edges come first,
        followed by the negative edges, and ``edge_label`` holds ones then zeros.

    Raises:
        Exception: while iterating, if ``sampling_mode`` is neither ``'binary'`` nor ``'triplet'``.

    Note:
        Reads the module-level ``num_workers`` setting.
    """
    src_type, dst_type = target_edge[0], target_edge[2]

    # The link loader only selects supervision edges; neighbourhoods come from HGTLoader below.
    num_neighbors_linkloader = [0]

    negative_sampling = NegativeSampling(
        mode=sampling_mode,  # binary or triplet
        amount=neg_ratio,  # number of negatives per positive supervision edge
    )

    if sampling_mode == 'triplet':
        # Kept from the original implementation; note that this modifies the caller's `data`.
        data[target_edge].edge_label = None

    linkNeighborLoader = LinkNeighborLoader(
        data,
        num_neighbors=num_neighbors_linkloader,
        edge_label_index=(target_edge, data[target_edge].edge_label_index),
        neg_sampling=negative_sampling,  # adds negative samples
        batch_size=batch_size,
        shuffle=is_training,
        num_workers=num_workers,
        # False: the batch contains the full node-induced subgraph, so reverse edges stay available
        directed=False,
        pin_memory=True,  # faster data transfer to gpu
        is_sorted=False,
    )

    def get_hgt(data, input_nodetype, input_mask):
        """Run a single HGT sampling pass seeded with the global node ids in ``input_mask``."""
        return next(iter(HGTLoader(
            data,
            num_samples=num_neighbors_hgtloader,
            batch_size=input_mask.shape[0],  # one batch that contains every seed node
            input_nodes=(input_nodetype, input_mask),
        )))

    def add_self_loops(data):
        """Add a ``'self_loop'`` edge type (i -> i for every node) to each node type of ``data``."""
        for node_type in data.node_types:
            data[node_type, 'self_loop', node_type].edge_index = torch.arange(data[node_type].num_nodes).repeat(2, 1)
        return data

    def supervision_edges(batch):
        """Return ``(src, dst, edge_label)`` of the supervision edges, indexed locally to ``batch``."""
        if sampling_mode == 'triplet':
            src_pos = batch[src_type].src_index
            dst_pos = batch[dst_type].dst_pos_index
            dst_neg = batch[dst_type].dst_neg_index
            if dst_neg.dim() == 1:
                # one negative per source node; bring it to the general [num_src, amount] layout
                dst_neg = dst_neg.unsqueeze(1)
            # Every source node is paired with its positive and with each of its negatives,
            # so the source ids have to be repeated once per negative.
            src_local = torch.cat((src_pos, src_pos.repeat_interleave(dst_neg.shape[1])))
            dst_local = torch.cat((dst_pos, dst_neg.reshape(-1)))
            edge_label = torch.cat((torch.ones(dst_pos.shape[0]), torch.zeros(dst_neg.numel())))
        elif sampling_mode == 'binary':
            label_index = batch[target_edge].edge_label_index
            src_local, dst_local = label_index[0, :], label_index[1, :]
            edge_label = batch[target_edge].edge_label
        else:
            raise Exception('binary or triplet sampling mode')
        return src_local, dst_local, edge_label

    def to_local(n_id, global_ids):
        """Position of each entry of ``global_ids`` inside ``n_id``.

        Relies on every id occurring exactly once in ``n_id``; the ids passed in here were
        seed nodes of the HGT batch that ``n_id`` belongs to.
        """
        return (n_id.unsqueeze(0) == global_ids.unsqueeze(1)).nonzero()[:, 1]

    def get_hgt_with_selfloops(loader):
        """Source and destination share a node type: one HGT batch covers both end nodes."""
        for batch in loader:
            src_local, dst_local, edge_label = supervision_edges(batch)
            link_n_id = batch[src_type].n_id

            # global ids of all nodes that take part in a supervision edge
            seed_nodes = link_n_id[torch.cat((src_local, dst_local)).unique()]
            hgt_batch = get_hgt(data, src_type, seed_nodes)

            # The supervision edges are not part of the sampled message passing edges,
            # they are only re-indexed to the node order of the HGT batch.
            hgt_n_id = hgt_batch[src_type].n_id
            local_edge_label_index = torch.stack((
                to_local(hgt_n_id, link_n_id[src_local]),
                to_local(hgt_n_id, link_n_id[dst_local]),
            ), dim=0)

            yield add_self_loops(hgt_batch), local_edge_label_index, edge_label, batch[target_edge].input_id

    def get_hgt_2types_with_selfloops(loader):
        """Source and destination differ in node type: each side gets its own HGT batch."""
        for batch in loader:
            src_local, dst_local, edge_label = supervision_edges(batch)
            src_n_id = batch[src_type].n_id
            dst_n_id = batch[dst_type].n_id

            # batch the start and end supervision nodes separately
            hgt_batch1 = get_hgt(data, src_type, src_n_id[src_local.unique()])
            hgt_batch2 = get_hgt(data, dst_type, dst_n_id[dst_local.unique()])

            # row 0 indexes hgt_batch1[src_type], row 1 indexes hgt_batch2[dst_type]
            local_edge_label_index = torch.stack((
                to_local(hgt_batch1[src_type].n_id, src_n_id[src_local]),
                to_local(hgt_batch2[dst_type].n_id, dst_n_id[dst_local]),
            ), dim=0)

            yield add_self_loops(hgt_batch1), add_self_loops(hgt_batch2), local_edge_label_index, edge_label, batch[target_edge].input_id

    if src_type == dst_type:
        # same node type on both ends, only need to sample once
        return get_hgt_with_selfloops(linkNeighborLoader)
    return get_hgt_2types_with_selfloops(linkNeighborLoader)


In [28]:
# Smoke test: pull a single binary-mode batch for an edge type whose end nodes share one node type.
input_edgetype = ('jobs', 'job_job', 'jobs')
loader = get_hgt_linkloader(
    train_data,
    input_edgetype,
    batch_size=4,
    is_training=True,
    sampling_mode='binary',
    neg_ratio=1,
    num_neighbors_hgtloader=[10],
)
first_batch = next(iter(loader))
minibatch, edge_label_index, edge_label, input_edge_ids = first_batch

#input_nodetype = ('skills', 'qualification_skill', 'qualifications')
#loader = get_hgt_linkloader(train_data, input_nodetype, 8, True, 'triplet', 1, [10])
#minibatchpart1, minibatchpart2, edge_label_index, edge_label, input_edge_id = next(iter(loader))
#input_edge_id


In [None]:
# Toy example: which positions of y hold a value that also occurs in x?
x = torch.tensor([34265, 234234, 235325, 32434, 546546])
y = torch.tensor([34265, 234234, 546546, 34265, 34265])
y_in_x = torch.isin(y, x)
torch.argwhere(y_in_x)


In [None]:
# Overlap between message passing edges (edge_index) and supervision edges (edge_label_index)
# of the skill/qualification edge type; each edge is compared via the string form of its [src, dst] pair.
qs_store = train_data[('skills', 'qualification_skill', 'qualifications')]
message_edges = {str(pair) for pair in qs_store.edge_index.T.numpy().tolist()}
label_edges = {str(pair) for pair in qs_store.edge_label_index.T.numpy().tolist()}
message_edges.intersection(label_edges)

In [None]:
import torch

# Toy example: look up, for every value of tensor2, its position inside tensor1.
tensor1 = torch.tensor([1, 2, 3, 4])
tensor2 = torch.tensor([2, 2, 3, 1])

# searchsorted needs a sorted sequence; `indices` remembers where each sorted value came from
sorted_tensor1, indices = torch.sort(tensor1)

# insertion points of the queries in the sorted sequence
sorted_indices = torch.searchsorted(sorted_tensor1, tensor2)

# translate positions in the sorted sequence back to positions in the unsorted tensor1
result = indices[sorted_indices]

print(result)  # prints tensor([1, 1, 2, 0])

In [None]:
# Toy example of the broadcast lookup: position in `a` of every element of `b` (and of `c`).
a = torch.tensor([13, 152, 1223])
b = torch.tensor([152, 13, 1223, 1223, 13, 13])
c = torch.tensor([152, 13, 13, 1223, 13, 13])

# comparing a as a row with b as a column gives a [len(b), len(a)] match matrix;
# column 1 of nonzero() is the matching position in `a`
(a[None, :] == b[:, None]).nonzero()[:, 1]
(a[None, :] == c[:, None]).nonzero()[:, 1]

In [None]:
c.unsqueeze(1).shape

In [None]:
b.unsqueeze(1)

In [None]:
train_data[('skills', 'qualification_skill', 'qualifications')].edge_label_index[:,344]

In [None]:
# Translate the batch-local end points of the edges in `a` to the ids stored in `n_id`.
# NOTE(review): `input_nodetype` is only assigned in a commented-out line of the testing cell,
# and `a` has to be a sampled batch here (an earlier cell binds `a` to a plain tensor) -
# this cell does not run in notebook order; confirm whether it is still needed.
u = a[input_nodetype[0]].n_id[a[input_nodetype].edge_index[0,:]]
v = a[input_nodetype[2]].n_id[a[input_nodetype].edge_index[1,:]]
a[input_nodetype]
a[input_nodetype]

In [None]:
a,b, edge_label_index, edge_label  = next(iter(loader)) # 229512 304

In [None]:
train_data['skills', 'qualification_skill', 'qualifications'].edge_index[:,820]

In [None]:
a['skills', 'qualification_skill', 'qualifications']

In [None]:
a

In [None]:
a['qualifications'].dst_pos_index

In [None]:
a['qualifications'].dst_neg_index

In [None]:
a['qualifications'].n_id[46]

In [None]:
# Stand-alone LinkNeighborLoader with binary negative sampling, used to inspect raw link batches.
data = train_data
# Written out explicitly: `input_nodetype` is only assigned in a commented-out line of the
# testing cell, so referring to it raises a NameError on a fresh kernel.
target_edge = ('skills', 'qualification_skill', 'qualifications')

# sample no neighbours for any edge type, the loader is only used to pick supervision edges
num_neighbors_linkloader = {edge_type: [0, 0] for edge_type in data.edge_types}

negative_sampling = NegativeSampling(
    mode='binary',  # binary or triplet
    amount=1,  # negatives per positive edge
)

linkNeighborLoader = LinkNeighborLoader(
    data,
    num_neighbors=num_neighbors_linkloader,
    edge_label_index=(target_edge, data[target_edge].edge_label_index),
    neg_sampling=negative_sampling,  # adds negative samples
    batch_size=28,
    shuffle=True,
    num_workers=0,
    # False: the batch contains the full node-induced subgraph, so reverse edges stay available
    directed=False,
    pin_memory=True,  # faster data transfer to gpu
    is_sorted=False,
)

In [None]:
a = next(iter(linkNeighborLoader))

In [None]:
a

In [None]:
train_data['skills', 'qualification_skill', 'qualifications'].edge_index[:,202]


In [None]:
train_data['skills', 'qualification_skill', 'qualifications'].edge_index[:,214]

In [None]:
set(a['qualifications'].dst_neg_index.numpy()).intersection(set(a['qualifications'].dst_pos_index.numpy()))

In [None]:
a['qualifications'].x.shape

In [None]:
a['qualifications'].dst_neg_index.max(), a['qualifications'].dst_pos_index.max()

In [None]:
a['qualifications'].n_id.shape

In [None]:
a['qualifications']

In [None]:
a['qualifications'].n_id[45]

In [None]:
a['skills']

In [None]:
a['skills'].n_id[11]

In [None]:
ax = train_data['skills', 'qualification_skill', 'qualifications'].edge_index
ax[0,ax[1]==881]


In [None]:
a

In [None]:
# Stand-alone LinkNeighborLoader with triplet negative sampling, used to inspect raw link batches.
data = train_data
target_edge = ('skills', 'qualification_skill', 'qualifications')

# no neighbour sampling for any edge type
num_neighbors_linkloader = {etype: [0, 0] for etype in data.edge_types}

# one negative destination per positive edge
negative_sampling = NegativeSampling(mode='triplet', amount=1)

# the labels of the target edge type are reset before the loader is built
data[target_edge].edge_label = None

linkNeighborLoader = LinkNeighborLoader(
    data,
    num_neighbors=num_neighbors_linkloader,
    edge_label_index=(target_edge, data[target_edge].edge_label_index),
    neg_sampling=negative_sampling,
    batch_size=32,
    shuffle=True,
    num_workers=0,
    directed=False,
    pin_memory=True,
    is_sorted=True,
)

In [None]:
a = next(iter(linkNeighborLoader))

In [None]:
a['qualifications'].n_id[a['qualifications'].dst_pos_index[0]], a['skills'].n_id[a['skills'].src_index[0]]

In [None]:
a['skills', 'qualification_skill', 'qualifications'].input_id

In [None]:
data[target_edge].edge_label_index[:,221]