In [1]:
# Mean Reciprocal Rank

In [2]:
%load_ext autoreload
%autoreload 2

In [2]:
! pip install torch==2.1.0  torchvision==0.16.0 torchtext==0.16.0 torchaudio==2.1.0 --index-url https://download.pytorch.org/whl/cu121
#! pip install pyg_lib torch_scatter torch_sparse torch_cluster -f https://data.pyg.org/whl/torch-2.1.0+${CUDA}.html # torch_spline_conv
! pip install torch_geometric
! pip install pyg_lib torch_scatter torch_sparse torch_cluster torch_spline_conv -f https://data.pyg.org/whl/torch-2.1.0+cu121.html
#! pip install torch_sparse -f https://data.pyg.org/whl/torch-2.1.0+${CUDA}.html
#! pip install torch_scatter -f https://data.pyg.org/whl/torch-2.1.0+${CUDA}.html
#! pip install pyg_lib -f https://data.pyg.org/whl/torch-2.1.0+${CUDA}.html
! pip install sentence-transformers
! pip install torcheval
! pip install matplotlib
! pip install pandas
! pip install tensorboard
! pip install weaviate-client

! pip install -U pip setuptools wheel
! pip install -U spacy
! python -m spacy download en_core_web_sm

HGT

In [None]:
from torch_geometric.nn import HGTConv, Linear
import torch 

class HGT(torch.nn.Module):
    def __init__(self, hidden_channels, out_channels, num_heads, num_layers, node_types, data_metadata):
        super().__init__()

        self.lin_dict = torch.nn.ModuleDict()
        for node_type in node_types:
            self.lin_dict[node_type] = Linear(-1, hidden_channels)

        self.convs = torch.nn.ModuleList()
        for _ in range(num_layers):
            conv = HGTConv(hidden_channels, hidden_channels, data_metadata,
                           num_heads, group='sum')
            self.convs.append(conv)

        self.lin = Linear(hidden_channels, out_channels)

    def forward(self, x_dict, edge_index_dict):
        x_dict = {
            node_type: self.lin_dict[node_type](x).relu_()
            for node_type, x in x_dict.items()
        }

        for conv in self.convs:
            x_dict = conv(x_dict, edge_index_dict)

        return x_dict
    
if __name__ == '__main__':
    model = HGT(hidden_channels=64, out_channels=4, num_heads=2, num_layers=1, node_types=data.node_types, data_metadata=data.metadata())

TransE

In [None]:
from torch_geometric.nn.kge import TransE

import math

import torch
import torch.nn.functional as F
from torch import Tensor

from torch_geometric.nn.kge import KGEModel

# adapted and taken from https://github.com/pyg-team/pytorch_geometric/blob/master/torch_geometric/nn/kge/transe.py

class TransE(KGEModel):
    r"""The TransE model from the `"Translating Embeddings for Modeling
    Multi-Relational Data" <https://proceedings.neurips.cc/paper/2013/file/
    1cecc7a77928ca8133fa24680a88d2f9-Paper.pdf>`_ paper.

    :class:`TransE` models relations as a translation from head to tail
    entities such that

    .. math::
        \mathbf{e}_h + \mathbf{e}_r \approx \mathbf{e}_t,

    resulting in the scoring function:

    .. math::
        d(h, r, t) = - {\| \mathbf{e}_h + \mathbf{e}_r - \mathbf{e}_t \|}_p

    .. note::

        For an example of using the :class:`TransE` model, see
        `examples/kge_fb15k_237.py
        <https://github.com/pyg-team/pytorch_geometric/blob/master/examples/
        kge_fb15k_237.py>`_.

    Args:
        num_nodes (int): The number of nodes/entities in the graph.
        num_relations (int): The number of relations in the graph.
        hidden_channels (int): The hidden embedding size.
        margin (int, optional): The margin of the ranking loss.
            (default: :obj:`1.0`)
        p_norm (int, optional): The order embedding and distance normalization.
            (default: :obj:`1.0`)
        sparse (bool, optional): If set to :obj:`True`, gradients w.r.t. to the
            embedding matrices will be sparse. (default: :obj:`False`)
    """
    def __init__(
        self,
        num_nodes: int,
        num_relations: int,
        hidden_channels: int,
        margin: float = 1.0,
        p_norm: float = 1.0,
        sparse: bool = False,
    ):
        super().__init__(num_nodes, num_relations, hidden_channels, sparse)

        self.p_norm = p_norm
        self.margin = margin

        self.reset_parameters()

    def reset_parameters(self):
        bound = 6. / math.sqrt(self.hidden_channels)
        torch.nn.init.uniform_(self.node_emb.weight, -bound, bound)
        torch.nn.init.uniform_(self.rel_emb.weight, -bound, bound)
        F.normalize(self.rel_emb.weight.data, p=self.p_norm, dim=-1,
                    out=self.rel_emb.weight.data)

    def forward(
        self,
        head_embeddings: Tensor,
        rel_type,
        tail_embeddings: Tensor,
    ) -> Tensor:
        #head = self.node_emb(head_index)
        rel = self.rel_emb(rel_type)  # Amos: only learn the relation embeddings, others are learned with GNN
        #tail = self.node_emb(tail_index)

        head = F.normalize(head_embeddings, p=self.p_norm, dim=-1)
        tail = F.normalize(tail_embeddings, p=self.p_norm, dim=-1)

        # Calculate *negative* TransE norm:
        return -((head + rel) - tail).norm(p=self.p_norm, dim=-1)

    
    def get_embedding(self,
                      embedding,
                      rel_type,
                        have_head_or_tail
                      ):
        rel = self.rel_emb(rel_type)
        embedding = F.normalize(embedding, p=self.p_norm, dim=-1)
        if have_head_or_tail == 'head':
            return embedding + rel
        else:
            return embedding - rel
    
    
    def loss(
        self,
        head_embeddings: Tensor,
        rel_type: Tensor,
        tail_embeddings: Tensor,
        labels: Tensor, # labels 0 or 1
    ) -> Tensor:
        pos_mask = labels == 1
        neg_mask = labels == 0
        
        pos_score = self(head_embeddings[pos_mask], rel_type, tail_embeddings[pos_mask])
        neg_score = self(head_embeddings[neg_mask], rel_type, tail_embeddings[neg_mask])

        return F.margin_ranking_loss(
            pos_score,
            neg_score,
            target=torch.ones_like(pos_score), # 1 for similarity, -1 for dissimilarity
            margin=self.margin,
        )

In [None]:
# GraphSampler 
# COMMAND ----------
# Sampling
# 1. Sample using HGT Sampler as outlined in the paper, using pyg implementations
# 2. The sampling is adapted to link prediction, by first sampling random supervision edges of which the nodes create the supervision nodes
# a. Dataset is divided across multiple dimensions:
#   a.1. Split into Train, Val, Test split (96, 2, 2)
#   a.2. Training only: Edges are split into those which are used solely for message passing and those solely used for supervision (80, 20). 
#        Because an expressive model (HGT) is used, this prevents the model from memorizing supervision edges by their appearance as message passing edges
#   a.3. This means Training consists of 96%*80% Message passing Edges, 96%*20% supervision edges, Val contains 2% Supervision Edges, Test contains 2% supervison Edges
#   a.4. Validation and Test edges use the Training Message passing Edges as well.
# b. For mini-batch sampling in the training phase, first x random edges are sampled as supervision edges. 
#    For the nodes of these supervision edges, we apply batch-wise HGT Sampling. Due to implementation limitations, for each supervision entity type, the hgt sampling is separate. 
#    This limitation does not apply for sampled neighbor entity types
# during sampling, also the reverse edge of the supervision edge is removed to avoid data leakage


# HGT Sampler (See Paper for further reference)
# The probablity of a neighbor node s to be sampled depends on the normalized degree of all its edge types connecting it to all source nodes
# If neighbor node s is connected to a and b by edge type r, and a has 2 neighbors through edge type r and b has 1 neighbor (node s) through edge type r, 
# then the sampling probablity of s is (1/2+1)**2 / 2**2, if it were connected through other edge types to the nodes as well, those degrees would be added to the numerator and denominator.
# Nodes are sampled without replacement.
# This sampling strategy creates more dense mini-batches, because neighbor nodes which are connected to multiple source nodes and by multiple relationship types are sampled more frequently.
# Therefore, training is sped up since less node representations have to be computed. Furthermore, as stated in the paper, the sampling method allows to sample a 
# similar number of neighbors for each edge type, because high-count edge types and low-count edge types are weighted equally. For each neighbor node type T, a fixed number n of nodes is sampled.





import os
import torch
from torch_geometric.data import HeteroData
from tqdm.auto import tqdm
import numpy as np
def add_reverse_edge_original_attributes_and_label_inplace(original_edge, reverse_edge):
    # add edge label and index and edge attr to reverse edge 
    if 'edge_attr' in original_edge:
        reverse_edge['edge_attr'] = original_edge['edge_attr']
    reverse_edge['edge_label'] = original_edge['edge_label']
    reverse_edge['edge_label_index'] = original_edge['edge_label_index'].index_select(0, torch.LongTensor([1, 0]))
    for key in original_edge.keys():
        if key not in ['edge_index', 'edge_attr', 'edge_label', 'edge_label_index']:
            reverse_edge[key] = original_edge[key]
                
    return reverse_edge

def get_datasets(get_edge_attr=False, filename=None, filter_top_k=False, top_k=50, remove_text_attr=True):
    if filename is None:
        filename = 'HeteroData_Learnings_normalized_triangles_withadditionaldata_v1.pt'
    size = os.path.getsize(filename)
    print('size of dataset on disk: ', size/1e9, 'gb')

    if os.path.exists(filename):
        data = HeteroData.from_dict(torch.load(filename))
        print('loading saved heterodata object')


    
    def top_k_mask(scores, indices, top_k ):
        # Make sure we are using the GPU
        scores = scores.cuda()
        indices = indices.cuda()
        
        # Create an empty mask with the same shape as scores
        mask = torch.zeros_like(scores, dtype=torch.bool)
        # Get the unique indices and their counts
        unique_indices, counts = torch.unique(indices, return_counts=True)
    
        # Indices where count > top_k
        large_indices = unique_indices[counts > top_k]
    
        # Set mask for indices where count <= top_k
        mask[~torch.isin(indices,large_indices)] = True
        # For indices where count > 50, we only keep top 50 scores
        for idx in tqdm(large_indices):
            idx_mask = (indices == idx)
            values, idxs = scores[idx_mask].topk(top_k)
            a = mask[idx_mask]
            a[idxs] = True
            mask[idx_mask] = a
            
        return mask.cpu()

    
   
    if filter_top_k:
        print('for skill job edges keep top k edges per job, k is ',top_k)
        e = ('skills', 'job_skill', 'jobs')
        rev_e = (e[2],'rev_'+e[1],e[0])
        cache_dir = 'cache'
        if not os.path.exists(cache_dir):
            os.makedirs(cache_dir)
        
        mask_path = os.path.join(cache_dir, f'mask{top_k}.pt') 
        
        if os.path.isfile(mask_path):
            mask = torch.load(mask_path)
        else:
            mask = top_k_mask(data[e].edge_attr.squeeze(1), data[e].edge_index[1,:], top_k)
            torch.save(mask, mask_path) 
        
        data[e].edge_attr = data[e].edge_attr[mask]
        data[rev_e].edge_attr = data[rev_e].edge_attr[mask]
        data[e].edge_index = data[e].edge_index[:,mask]
        data[rev_e].edge_index = data[rev_e].edge_index[:,mask]
        print('keep',torch.sum(mask), 'of total',mask.shape[0])
    
    
    from torch_geometric import seed_everything
    import torch_geometric.transforms as T
    from torch_geometric.utils import sort_edge_index


    
    edge_types = []
    rev_edge_types = []
    for edge_type in data.edge_types:
        if edge_type[1].startswith('rev_'):
            rev_edge_types.append(edge_type)
        else:
            edge_types.append(edge_type)

    transform = T.RandomLinkSplit(
        is_undirected=True,
        edge_types=edge_types,
        rev_edge_types=rev_edge_types,
        num_val=0.02,
        num_test=0.05,
        add_negative_train_samples=False, # only adds neg samples for val and test, neg train are added by LinkNeighborLoader. This means for each train batch, negs. are different, for val and train they stay the same
        neg_sampling_ratio=1.0,
        disjoint_train_ratio=0.3, #  training edges are shared for message passing and supervision
        )

    seed_everything(14)
    # sort by col to speed up sampling later (we can sepcify is_sorted=True in link neighbor loader)
    # we actually dont use the sort, because it seems to mess up things, but have not checked if everything works without sorting, so we leave it here
    def sort_edges(data):
        for edge_type in data.edge_types:
            if 'edge_attr' in data[edge_type].keys():
                data[edge_type].edge_index, data[edge_type].edge_attr = sort_edge_index(data[edge_type].edge_index, data[edge_type].edge_attr, sort_by_row=False) 
            else:
                data[edge_type].edge_index = sort_edge_index(data[edge_type].edge_index, sort_by_row=False) 
        return data
    

    def preprocess(data):
        if not get_edge_attr:
            # delete edge_attr of every edge type
            for edge_type in data.edge_types:
                del data[edge_type].edge_attr 

        # delete all keys for every node type except 'x' (e.g. description and title)
        for node_type in data.node_types:
            keys = list(data[node_type].keys())
            for key in keys:
                if key != 'x':
                    del data[node_type][key]
        return data
    
    
    # change all types to float32 and normalize the triangle columns
    for node_type in data.node_types:
        for i in range(data[node_type].x.shape[1]):
            if data[node_type].x[:,i].max()>5:
                #normalize
                print('normalizing column ', i, ' of node type ', node_type)
                data[node_type].x[:,i] = data[node_type].x[:,i]/data[node_type].x[:,i].max()
        
        data[node_type].x = data[node_type].x.to(torch.float32)
        
    
    
    train_data, val_data, test_data = transform(data)
    #train_data = sort_edges(train_data)
    #val_data = sort_edges(val_data)
    #test_data = sort_edges(test_data)
    if remove_text_attr:
        train_data = preprocess(train_data)
        val_data = preprocess(val_data)
        test_data = preprocess(test_data)
    
    return train_data, val_data, test_data




from torch_geometric.loader import LinkNeighborLoader
from torch_geometric.loader import HGTLoader
from torch_geometric.sampler import NegativeSampling

def get_hgt_linkloader(data, target_edge, batch_size, is_training, sampling_mode, neg_ratio, num_neighbors_hgtloader, num_workers, prefetch_factor, pin_memory):
    # first sample some edges in linkNeighborLoader
    # use the nodes of the sampled edges to sample from hgt loader
    
    
    num_neighbors_linkloader = [0]
    #for edge_type in data.edge_types:
    #    num_neighbors_linkloader[edge_type] = [0,0]
    
    negative_sampling = NegativeSampling(
        mode=sampling_mode, # binary or triplet
        amount=neg_ratio  # ratio, like Graphsage # 10
        #weight=  # "Probabilities" of nodes to be sampled: Node degree follows power law distribution
        )
    
    if sampling_mode == 'triplet':
        data[target_edge].edge_label = None
        

    linkNeighborLoader = LinkNeighborLoader(
            data,
            num_neighbors=num_neighbors_linkloader,
            edge_label_index=(target_edge, data[target_edge].edge_label_index), # if (edge, None), None means all edges are considered
        
            neg_sampling=negative_sampling, # adds negative samples
            batch_size=batch_size,
            shuffle=is_training, #is_training
            subgraph_type='directional', # contains only sampled edges
            #drop_last=True,
            num_workers=num_workers,
            #disjoint=True # sampled seed node creates its own, disjoint from the rest, subgraph, will add "batch vector" to loader output
        
            #num_workers=2,
            #prefetch_factor=2
            is_sorted = False,
            pin_memory=pin_memory,
            prefetch_factor=prefetch_factor,
    )
   
   
    def get_hgt(data, input_nodetype, input_mask):
        return next(iter(HGTLoader(
                data,
                # Sample 512 nodes per type and per iteration for 4 iterations
                num_samples=num_neighbors_hgtloader,
                batch_size=input_mask.shape[0],
                input_nodes=(input_nodetype, input_mask),
                num_workers=num_workers,
                pin_memory=pin_memory,
                prefetch_factor=prefetch_factor,
            )))
        
    
    def add_self_loops(data):
        for node_type in data.node_types:
            data[node_type, 'self_loop', node_type].edge_index = torch.arange(data[node_type].num_nodes).repeat(2,1)
        return data 

            
    def get_hgt_with_selfloops(loader):
        
        
        for batch in loader:   
            if sampling_mode=='triplet':      
                # original edge_label_index from the whole data object
                unmapped_batchids = torch.cat((batch[target_edge[0]].src_index,batch[target_edge[2]].dst_pos_index, batch[target_edge[2]].dst_neg_index.flatten()))
                original_node_ids = batch[target_edge[0]].n_id[unmapped_batchids]
                original_edge_label_nodes = torch.LongTensor(original_node_ids.unique())

                #remapping or sorting is not needed, since nodes are sorted, also in the htg batch, the edges will be the same
                src = batch[target_edge[0]].src_index.unsqueeze(0)
                src_total = src
                for i in range(neg_ratio):
                    src_total = torch.cat((src_total,src), dim=1)
                dst = torch.cat((batch[target_edge[2]].dst_pos_index, batch[target_edge[2]].dst_neg_index.flatten()),dim=0).unsqueeze(0)
                
                local_edge_label_index = torch.cat((src_total, dst),dim=0)
                edge_label = torch.cat((torch.ones(batch[target_edge[2]].dst_pos_index.shape[0]), torch.zeros(batch[target_edge[2]].dst_neg_index.flatten().shape[0])))
                
            elif sampling_mode=='binary':
                unmapped_batchids = batch[target_edge].edge_label_index.flatten()
                original_node_ids = batch[target_edge[0]].n_id[unmapped_batchids]
                original_edge_label_nodes = torch.LongTensor(original_node_ids.unique())

            else:
                raise Exception('binary or triplet sampling mode')
                
                
            hgt_batch = get_hgt(data, target_edge[0], original_edge_label_nodes) # 0,1,3,4,5,6,7,8,9,
          
            if sampling_mode=='triplet':
                
                # return message passing edges, and supervision edges/labels, ignore labels/label_indices in the message passing edges
                yield add_self_loops(hgt_batch), local_edge_label_index, edge_label, batch[target_edge].input_id, original_node_ids
            else: # sampling_mode=='binary':
                # return message passing edges, and supervision edges/labels, ignore labels/label_indices in the message passing edges, as well as original edge indices
                yield add_self_loops(hgt_batch), batch[target_edge].edge_label_index, batch[target_edge].edge_label, batch[target_edge].input_id, original_node_ids
    
    def get_hgt_2types_with_selfloops(loader):
        for batch in loader:
            if sampling_mode=='triplet':   
                original_src_nodes = batch[target_edge[0]].n_id[batch[target_edge[0]].src_index]
                original_edge_label_nodes_class1 = torch.LongTensor(original_src_nodes.unique())
                
                original_dst_nodes = batch[target_edge[2]].n_id[torch.cat((batch[target_edge[2]].dst_pos_index, batch[target_edge[2]].dst_neg_index.flatten()))]
                original_edge_label_nodes_class2 = torch.LongTensor(original_dst_nodes.unique())
                
                
                src = batch[target_edge[0]].src_index.unsqueeze(0)
                src_total = src
                for i in range(neg_ratio):
                    src_total = torch.cat((src_total,src), dim=1)
                
                dst = torch.cat((batch[target_edge[2]].dst_pos_index, batch[target_edge[2]].dst_neg_index.flatten()),dim=0).unsqueeze(0)
                
                local_edge_label_index = torch.cat((src_total, dst),dim=0)
                edge_label = torch.cat((torch.ones(batch[target_edge[2]].dst_pos_index.shape[0]), torch.zeros(batch[target_edge[2]].dst_neg_index.flatten().shape[0])))

            elif sampling_mode=='binary':
                original_src_nodes = batch[target_edge[0]].n_id[batch[target_edge].edge_label_index[0,:]]
                original_edge_label_nodes_class1 = original_src_nodes.unique()
                original_dst_nodes = batch[target_edge[2]].n_id[batch[target_edge].edge_label_index[1,:]]
                original_edge_label_nodes_class2 = original_dst_nodes.unique()

            else:
                raise Exception('binary or triplet sampling mode')

            # batch the start and end supervision nodes separately
            hgt_batch1 = get_hgt(data, target_edge[0], original_edge_label_nodes_class1)
            hgt_batch2 = get_hgt(data, target_edge[2], original_edge_label_nodes_class2)
            
            
            # ** We dont need to remove any edges ** since the supervision edges wont be sampled by hgt
            if sampling_mode=='triplet':
                yield add_self_loops(hgt_batch1), add_self_loops(hgt_batch2), local_edge_label_index, edge_label, batch[target_edge].input_id, original_src_nodes, original_dst_nodes
            else: # sampling_mode=='binary':
                # we can access the corresponding nodes of edge_label_index[0,:] in hgt_batch1[target_edge[0]], those of [1,:] in hgt_batch2...
                yield add_self_loops(hgt_batch1), add_self_loops(hgt_batch2), batch[target_edge].edge_label_index, batch[target_edge].edge_label, batch[target_edge].input_id, original_src_nodes, original_dst_nodes

        
    if target_edge[0] == target_edge[2]:
        # same edge type, only need to sample once
        return get_hgt_with_selfloops(linkNeighborLoader)
    else:
        return get_hgt_2types_with_selfloops(linkNeighborLoader)


import random

def get_minibatch_count(data, batch_size):
    batches = []
    for edge_type in data.edge_types:
        if edge_type[1].startswith('rev_'):
            continue
        batches.extend([edge_type for _ in range((data[edge_type].edge_label_index.shape[1]+batch_size)//batch_size)])
        
    return len(batches)

def get_single_minibatch_count(data, batch_size, edge_type):
    return (data[edge_type].edge_label_index.shape[1]+batch_size)//batch_size

def uniform_hgt_sampler(data, batch_size, is_training, sampling_mode, neg_sampling_ratio, num_neighbors, num_workers, prefetch_factor, pin_memory):

    # return batches from all edgetypes with each "edge" being drawn uniformly at random (but we translate the probabilities to batches), last batches of each edge type may be smaller than batch_size
    batches = []
    loaders = {}
    # only the non-reverse edge types for now
    for edge_type in data.edge_types:
        if edge_type[1].startswith('rev_'):
            continue
        batches.extend([edge_type for _ in range((data[edge_type].edge_label_index.shape[1]+batch_size)//batch_size)])
        loaders[edge_type]=get_hgt_linkloader(data, edge_type, batch_size, is_training, sampling_mode, neg_sampling_ratio, num_neighbors, num_workers, prefetch_factor, pin_memory)
        
    random.seed(14)
    random.shuffle(batches)
    # set a random random seed again (may affect creating the loaders later for a second epoch)
    random.seed()
    
    print('total batches:', len(batches))
    
    for target_edge_type in batches:
        if target_edge_type[0] == target_edge_type[2]:
            same_nodetype = True
        else:
            same_nodetype = False
        try:
            minibatch = next(loaders[target_edge_type])
        except StopIteration: # "reinit" iterator
            loaders[target_edge_type] = get_hgt_linkloader(data, target_edge_type, batch_size, is_training, sampling_mode, neg_sampling_ratio, num_neighbors, num_workers, prefetch_factor, pin_memory)#iter(loaders[target_edge_type])
            minibatch = next(loaders[target_edge_type])
            pass

        yield same_nodetype, target_edge_type, minibatch
        
def sampler_for_init(data, batch_size, is_training, sampling_mode, neg_sampling_ratio, num_neighbors, num_workers, prefetch_factor, pin_memory):
    batchcount=[]
    batches=[]
    loaders = {}
    for edge_type in data.edge_types:
        if edge_type[1].startswith('rev_'):
            continue
        batchcount.extend([edge_type for _ in range((data[edge_type].edge_label_index.shape[1]+batch_size)//batch_size)])
        batches.append(edge_type)
        loaders[edge_type]=get_hgt_linkloader(data, edge_type, batch_size, is_training, sampling_mode, neg_sampling_ratio, num_neighbors, num_workers, prefetch_factor, pin_memory)
        
    print('total batches:', len(batchcount))

    
    # set a random random seed again (may affect creating the loaders later for a second epoch)
    random.seed()
    
    for target_edge_type in batches:
        if target_edge_type[0] == target_edge_type[2]:
            same_nodetype = True
        else:
            same_nodetype = False
        try:
            minibatch = next(loaders[target_edge_type])
        except StopIteration: # "reinit" iterator
            loaders[target_edge_type] = get_hgt_linkloader(data, target_edge_type, batch_size, is_training, sampling_mode, neg_sampling_ratio, num_neighbors, num_workers, prefetch_factor, pin_memory)#iter(loaders[target_edge_type])
            minibatch = next(loaders[target_edge_type])
            pass
            
        yield same_nodetype, target_edge_type, minibatch

def equal_edgeweight_hgt_sampler(data, batch_size, is_training, sampling_mode, neg_sampling_ratio, num_neighbors, num_workers, prefetch_factor, pin_memory):
    batchcount=[]
    batches=[]
    loaders = {}
    for edge_type in data.edge_types:
        if edge_type[1].startswith('rev_'):
            continue
        batchcount.extend([edge_type for _ in range((data[edge_type].edge_label_index.shape[1]+batch_size)//batch_size)])
        batches.append(edge_type)
        loaders[edge_type]=get_hgt_linkloader(data, edge_type, batch_size, is_training, sampling_mode, neg_sampling_ratio, num_neighbors, num_workers, prefetch_factor, pin_memory)
        
    print('total batches:', len(batchcount))

    batches = random.choices(batches, k=len(batchcount)) 
    # set a random random seed again (may affect creating the loaders later for a second epoch)
    random.seed()
    
   
    
    for target_edge_type in batches:
        if target_edge_type[0] == target_edge_type[2]:
            same_nodetype = True
        else:
            same_nodetype = False
        try:
            minibatch = next(loaders[target_edge_type])
        except StopIteration: # "reinit" iterator
            loaders[target_edge_type] = get_hgt_linkloader(data, target_edge_type, batch_size, is_training, sampling_mode, neg_sampling_ratio, num_neighbors, num_workers, prefetch_factor, pin_memory)#iter(loaders[target_edge_type])
            minibatch = next(loaders[target_edge_type])
            pass
            
        yield same_nodetype, target_edge_type, minibatch

In [3]:
#from graph_sampler import get_datasets, equal_edgeweight_hgt_sampler, get_minibatch_count, add_reverse_edge_original_attributes_and_label_inplace, get_hgt_linkloader, get_single_minibatch_count, sampler_for_init

train_data, val_data, test_data = get_datasets(get_edge_attr=False, filename=ROOT_FOLDER+'HeteroData_Learnings_normalized_triangles_withadditionaldata_v1.pt', filter_top_k=True, top_k=50)


size of dataset on disk:  2.279761238 gb
loading saved heterodata object
for skill job edges keep top k edges per job, k is  50
keep tensor(1208056) of total 16289586


In [4]:
import torch
from models.TransE import TransE
from models.DistMult import DistMult
from models.HGT import HGT
import torch_geometric
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class Model(torch.nn.Module):
    def __init__(self, gnn : torch.nn.Module, head , node_types, edge_types, ggn_output_dim, pnorm=1):
        super().__init__()
        # edge_type onehot lookup table with keys
        # node_type onehot lookup table with keys
        self.node_type_embedding = torch.nn.Embedding(len(node_types), ggn_output_dim) # hidden channels should be the output dim of gnn
        
        self.edge_types = edge_types
        for edge_type in edge_types:
            if edge_type[1].startswith('rev_'):
                self.edge_types.remove(edge_type)
        
        # create edge to int mapping
        self.edgeindex_lookup = {edge_type:torch.tensor(i)  for i, edge_type in enumerate(edge_types)}
            
        # hidden channels should be the output dim of gnn
        if head=='TransE': 
            self.head = TransE(len(node_types), len(edge_types) , ggn_output_dim, p_norm= pnorm, margin=0.5)  # KGE head with loss function
        elif head=='DistMult':
            self.head = DistMult(len(node_types), len(edge_types) , ggn_output_dim, p_norm= pnorm, margin=0.5)  # KGE head with loss function
        else:
            raise NotImplementedError
        
        self.gnn = gnn
        
    

    def forward(self, hetero_data1, target_edge_type, edge_label_index, edge_label, hetero_data2=None, get_head_fn='loss'):
        
        if hetero_data2 is not None:
            assert target_edge_type[0] != target_edge_type[2], 'when passing two data objects, the edge type has to contain two different node types'
            head_embeddings = self.gnn(hetero_data1.x_dict, hetero_data1.edge_index_dict)[target_edge_type[0]][edge_label_index[0,:]]
            tail_embeddings = self.gnn(hetero_data2.x_dict, hetero_data2.edge_index_dict)[target_edge_type[2]][edge_label_index[1,:]]
        else:
            assert target_edge_type[0] == target_edge_type[2], 'when passing one data object, the edge type has to contain the same node types'


            embeddings = self.gnn(hetero_data1.x_dict, hetero_data1.edge_index_dict)
            head_embeddings = embeddings[target_edge_type[0]][edge_label_index[0,:]]
            tail_embeddings = embeddings[target_edge_type[2]][edge_label_index[1,:]]

        
        edgeindex = self.edgeindex_lookup[target_edge_type]
        if get_head_fn=='loss':
            loss = self.head.loss(head_embeddings, edgeindex.to(device), tail_embeddings, edge_label)
            return loss
        elif get_head_fn=='forward':
            return self.head.forward(head_embeddings, edgeindex.to(device), tail_embeddings)
    
        
metadata = train_data.metadata()
# add selfloops
for node_type in train_data.node_types:
    metadata[1].append((node_type, 'self_loop', node_type))    
    
out_channels = 256
hidden_channels = 256
num_heads = 8
num_layers = 2
pnorm = 2
head = 'TransE'
gnn = HGT(hidden_channels=out_channels, out_channels=out_channels, num_heads=num_heads, num_layers=num_layers, node_types=train_data.node_types, data_metadata=metadata)

model = Model(gnn, head=head, node_types=metadata[0], edge_types=metadata[1], ggn_output_dim=out_channels, pnorm=pnorm)
#torch_geometric.compile(model, dynamic=True)
model.to(device)


In [None]:
# init model
from tqdm.auto import tqdm
from datetime import datetime
batch_size = 32
num_node_types = len(train_data.node_types)
print('num_node_types', num_node_types)
one_hop_neighbors = (20 * batch_size)//num_node_types # per relationship type
two_hop_neighbors = (20 * 8 * batch_size)//num_node_types # per relationship type
three_hop_neighbors = (20 * 8 * 3 * batch_size)//num_node_types # per relationship type
num_neighbors = [one_hop_neighbors, two_hop_neighbors] # three_hop_neighbors
# num_neighbors [36, 363, 1454]

print('num_neighbors', num_neighbors)
print('avg_num_neighbors', [num_neighbors[0]/batch_size,num_neighbors[1]/batch_size,  num_neighbors[2]/batch_size if len(num_neighbors)==3 else 0 ])

sampler_for_init = equal_edgeweight_hgt_sampler(train_data, batch_size, True, 'triplet', 1, num_neighbors, num_workers=0, prefetch_factor=None, pin_memory=True) 



In [None]:
# have to init until all node types are present
model.eval()
for i, (same_nodetype, target_edge_type, minibatch) in tqdm(enumerate(sampler_for_init)):

    # batching is different depending on if node types in edge are same or different
    print(target_edge_type)
    if same_nodetype:
        
        minibatch, edge_label_index, edge_label, input_edge_ids, global_node_ids = minibatch
        #print(minibatch['jobs'].x.device, edge_label_index.device, edge_label.device)
        loss = model(minibatch.to(device), target_edge_type, edge_label_index.to(device), edge_label.to(device))
        #loss, pos, neg = model(minibatch, target_edge_type, edge_label_index, edge_label)
    else:
        try:
            minibatchpart1, minibatchpart2, edge_label_index, edge_label, input_edge_id, global_src_ids, global_dst_ids = minibatch
        except ValueError as err:
            print('value error', err)
            continue # for skill qual edges sometimes for some reason only 5 instead of 7 elements returned
        #print(minibatchpart1['jobs'].device, minibatchpart2['jobs'].device, edge_label_index.device, edge_label.device)
        loss = model(minibatchpart1.to(device), target_edge_type, edge_label_index.to(device), edge_label.to(device), minibatchpart2.to(device))

In [None]:
# get parameters form statedcit
model.load_state_dict = torch.load(ROOT_FOLDER+'models/learningall_hgt_20231104_123858_margin05_pnorm2_llr0.0002_bs32_neighbors_106_853_head_TransE_hiddenchannels_256_outchannels_256_numheads_8_numlayers_2/')

In [21]:
# evaluate model on test set
# init dimensions of model by training it
from tqdm.auto import tqdm
from datetime import datetime
import numpy as np 

def evaluate(model, n_negatives, model_folder, on='test'):
    num_neighbors = [int(x) for x in model_folder.split('neighbors_')[1].split('head')[0].strip('_').split('_')]
    
    model.to(device)
    mrrs = []
   
    
    #test_sampler = get_hgt_linkloader(test_data, input_edgetype, 1, False, 'triplet', n_negatives, num_neighbors, num_workers=0, prefetch_factor=None, pin_memory=True)
    



    # test data
    train_data_text, val_data_text, test_data_text = get_datasets(get_edge_attr=False, filename=ROOT_FOLDER+'HeteroData_Learnings_normalized_triangles_withadditionaldata_v1.pt', filter_top_k=True, top_k=50, remove_text_attr=False)
    input_edgetype = ('people', 'rev_course_and_programs_student', 'courses_and_programs')
    if on=='test':
        add_reverse_edge_original_attributes_and_label_inplace(test_data['courses_and_programs', 'course_and_programs_student', 'people'], reverse_edge=test_data[input_edgetype] )
        test_sampler = get_hgt_linkloader(test_data, input_edgetype, 1, False, 'triplet', n_negatives, num_neighbors, num_workers=0, prefetch_factor=None, pin_memory=True)
    elif on =='train':
        add_reverse_edge_original_attributes_and_label_inplace(train_data['courses_and_programs', 'course_and_programs_student', 'people'], reverse_edge=train_data[input_edgetype] )
        test_sampler = get_hgt_linkloader(train_data, input_edgetype, 1, False, 'triplet', n_negatives, num_neighbors, num_workers=0, prefetch_factor=None, pin_memory=True)
    # test data
    model.eval()
    mrr_per_edge_type = {}
    rank_per_edge_type = {}
    best_mrr, best_differences, best_src_nodes, best_dst_nodes = 0, None, None, None
    for i, (same_nodetype, target_edge_type, minibatch) in tqdm(enumerate(sampler_for_init)):
        if i==1000:
            break
        
        print(target_edge_type)
        if same_nodetype:
            minibatch, edge_label_index, edge_label, input_edge_ids, global_node_ids = minibatch
            #print(minibatch['jobs'].x.device, edge_label_index.device, edge_label.device)
            differences = model(minibatch.to(device), target_edge_type, edge_label_index.to(device), edge_label.to(device))
            #loss, pos, neg = model(minibatch, target_edge_type, edge_label_index, edge_label)
        else:
            try:
                minibatchpart1, minibatchpart2, edge_label_index, edge_label, input_edge_id, global_src_ids, global_dst_ids = minibatch
            except ValueError as err:
                print('value error', err)
                continue # for skill qual edges sometimes for some reason only 5 instead of 7 elements returned
            #print(minibatchpart1['jobs'].device, minibatchpart2['jobs'].device, edge_label_index.device, edge_label.device)
            differences = model(minibatchpart1.to(device), target_edge_type, edge_label_index.to(device), edge_label.to(device), minibatchpart2.to(device))
        #loss.backward()

        # define mrr with differences and labels
        # get rank of positive edge from tensor, positive edge is first in batch

        differences = -1* differences.cpu().detach().numpy()
        edge_label = edge_label.cpu().detach().numpy()
        rank = (differences < differences[0]).sum()

        # reciprocal
        mrr = 1/(rank+1)
        if mrr > best_mrr:
            best_mrr = mrr
            best_differences = differences
            best_src_nodes = global_src_nodes
            best_dst_nodes = global_dst_nodes
            print('new best mrr', best_mrr)
            
    
        mrrs.append(mrr)
        mrr_per_edge_type[target_edge_type] = mrr_per_edge_type.get(target_edge_type, []) + [mrr]
        rank_per_edge_type[target_edge_type] = rank_per_edge_type.get(target_edge_type, []) + [rank]
        print('mrr',target_edge_type,mrr)
        print(rank)
        
    print('mean mrr',np.mean(mrrs))
    print('mean rank',1/np.mean(mrrs))
    # mean rank
    model.to('cpu')


In [30]:
torch.cat((test_data['courses_and_programs', 'course_and_programs_student', 'people'].edge_index[0,:].unique(),test_data['courses_and_programs', 'course_and_programs_student', 'people'].edge_label_index[0,:].unique())).unique().shape

torch.Size([32494])

In [57]:
a = set(test_data['courses_and_programs', 'course_and_programs_student', 'people'].edge_label_index[0,:].unique().tolist())
a2 = set(train_data['courses_and_programs', 'course_and_programs_student', 'people'].edge_label_index[0,:].unique().tolist())

In [42]:
b = set(torch.cat((train_data['courses_and_programs', 'course_and_programs_student', 'people'].edge_index[0,:].unique(),train_data['courses_and_programs', 'course_and_programs_student', 'people'].edge_label_index[0,:].unique())).tolist())

In [47]:
len(a.intersection(b))

9335

In [62]:
# evaluate(model_3layereuclid, 10000, model_folder3layereuclid, on='test')
# evaluate(model_2layereuclid, 10000, model_folder2layereuclid, on='test')
evaluate(model_2layerp1, 10000, model_folder2layerp1, on='test')

size of dataset on disk:  2.279761238 gb
loading saved heterodata object
for skill job edges keep top k edges per job, k is  50
keep tensor(1208056) of total 16289586


1it [00:06,  6.70s/it]

new best mrr 0.0024752475247524753


3it [00:12,  3.57s/it]

new best mrr 0.05


4it [00:14,  3.11s/it]

new best mrr 1.0


1000it [45:27,  2.73s/it]


mean mrr 0.17139854320945072
mean rank 5.834355305914065


In [None]:
evaluate(model_3layereuclid, 10000, model_folder3layereuclid, on='train')
evaluate(model_2layereuclid, 10000, model_folder2layereuclid, on='train')
evaluate(model_2layerp1, 10000, model_folder2layerp1, on='train')

size of dataset on disk:  2.279761238 gb
loading saved heterodata object
for skill job edges keep top k edges per job, k is  50
keep tensor(1208056) of total 16289586


1it [00:15, 15.19s/it]

new best mrr 8.13206473123526e-05


2it [00:23, 10.85s/it]

new best mrr 0.000500751126690035


4it [00:36,  8.10s/it]

new best mrr 0.2


8it [01:00,  6.52s/it]

new best mrr 1.0


200it [23:41,  7.11s/it]


mean mrr 0.13983406037281088
mean rank 7.151333497245987
size of dataset on disk:  2.279761238 gb
loading saved heterodata object
for skill job edges keep top k edges per job, k is  50
keep tensor(1208056) of total 16289586


1it [00:09,  9.78s/it]

new best mrr 0.0001484560570071259


2it [00:13,  6.03s/it]

new best mrr 0.005494505494505495


4it [00:18,  3.63s/it]

new best mrr 0.16666666666666666


8it [00:27,  2.44s/it]

new best mrr 0.3333333333333333


31it [01:25,  2.68s/it]

new best mrr 1.0


200it [09:34,  2.87s/it]


mean mrr 0.08684513895969696
mean rank 11.514749264942502
size of dataset on disk:  2.279761238 gb
loading saved heterodata object
for skill job edges keep top k edges per job, k is  50
keep tensor(1208056) of total 16289586


1it [00:05,  5.00s/it]

new best mrr 0.00016452780519907864


2it [00:07,  3.52s/it]

new best mrr 0.02127659574468085


4it [00:12,  2.73s/it]

new best mrr 0.1111111111111111


8it [00:20,  2.25s/it]

new best mrr 1.0


200it [10:11,  3.06s/it]


mean mrr 0.11901925778350261
mean rank 8.402001647657823


In [None]:
evaluate(modele4, 50000, model_foldere4)

In [None]:
evaluate(modele4euclidean, 50000, model_foldere4euclidean)

In [None]:
evaluate(modele5, 50000, model_foldere5, on='train')

In [None]:
evaluate(modele4, 50000, model_foldere4, on='train')

In [None]:
evaluate(modele4euclidean, 30000, model_foldere4euclidean, on='train')