In [1]:
# Mean Reciprocal Rank

In [2]:
%load_ext autoreload
%autoreload 2

In [2]:
! pip install torch==2.1.0  torchvision==0.16.0 torchtext==0.16.0 torchaudio==2.1.0 --index-url https://download.pytorch.org/whl/cu121
#! pip install pyg_lib torch_scatter torch_sparse torch_cluster -f https://data.pyg.org/whl/torch-2.1.0+${CUDA}.html # torch_spline_conv
! pip install torch_geometric
! pip install pyg_lib torch_scatter torch_sparse torch_cluster torch_spline_conv -f https://data.pyg.org/whl/torch-2.1.0+cu121.html
#! pip install torch_sparse -f https://data.pyg.org/whl/torch-2.1.0+${CUDA}.html
#! pip install torch_scatter -f https://data.pyg.org/whl/torch-2.1.0+${CUDA}.html
#! pip install pyg_lib -f https://data.pyg.org/whl/torch-2.1.0+${CUDA}.html
! pip install sentence-transformers
! pip install torcheval
! pip install matplotlib
! pip install pandas
! pip install tensorboard
! pip install weaviate-client

! pip install -U pip setuptools wheel
! pip install -U spacy
! python -m spacy download en_core_web_sm

In [3]:
from graph_sampler import get_datasets, equal_edgeweight_hgt_sampler, get_minibatch_count, add_reverse_edge_original_attributes_and_label_inplace, get_hgt_linkloader, get_single_minibatch_count, sampler_for_init

train_data, val_data, test_data = get_datasets(get_edge_attr=False, filename=ROOT_FOLDER+'HeteroData_Learnings_normalized_triangles_withadditionaldata_v1.pt', filter_top_k=True, top_k=50)


size of dataset on disk:  2.279761238 gb
loading saved heterodata object
for skill job edges keep top k edges per job, k is  50
keep tensor(1208056) of total 16289586


In [4]:
import torch
from models.TransE import TransE
from models.DistMult import DistMult
from models.HGT import HGT
import torch_geometric
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class Model(torch.nn.Module):
    def __init__(self, gnn : torch.nn.Module, head , node_types, edge_types, ggn_output_dim, pnorm=1):
        super().__init__()
        # edge_type onehot lookup table with keys
        # node_type onehot lookup table with keys
        self.node_type_embedding = torch.nn.Embedding(len(node_types), ggn_output_dim) # hidden channels should be the output dim of gnn
        
        self.edge_types = edge_types
        for edge_type in edge_types:
            if edge_type[1].startswith('rev_'):
                self.edge_types.remove(edge_type)
        
        # create edge to int mapping
        self.edgeindex_lookup = {edge_type:torch.tensor(i)  for i, edge_type in enumerate(edge_types)}
            
        # hidden channels should be the output dim of gnn
        if head=='TransE': 
            self.head = TransE(len(node_types), len(edge_types) , ggn_output_dim, p_norm= pnorm, margin=0.5)  # KGE head with loss function
        elif head=='DistMult':
            self.head = DistMult(len(node_types), len(edge_types) , ggn_output_dim, p_norm= pnorm, margin=0.5)  # KGE head with loss function
        else:
            raise NotImplementedError
        
        self.gnn = gnn
        
    

    def forward(self, hetero_data1, target_edge_type, edge_label_index, edge_label, hetero_data2=None, get_head_fn='loss'):
        
        if hetero_data2 is not None:
            assert target_edge_type[0] != target_edge_type[2], 'when passing two data objects, the edge type has to contain two different node types'
            head_embeddings = self.gnn(hetero_data1.x_dict, hetero_data1.edge_index_dict)[target_edge_type[0]][edge_label_index[0,:]]
            tail_embeddings = self.gnn(hetero_data2.x_dict, hetero_data2.edge_index_dict)[target_edge_type[2]][edge_label_index[1,:]]
        else:
            assert target_edge_type[0] == target_edge_type[2], 'when passing one data object, the edge type has to contain the same node types'


            embeddings = self.gnn(hetero_data1.x_dict, hetero_data1.edge_index_dict)
            head_embeddings = embeddings[target_edge_type[0]][edge_label_index[0,:]]
            tail_embeddings = embeddings[target_edge_type[2]][edge_label_index[1,:]]

        
        edgeindex = self.edgeindex_lookup[target_edge_type]
        if get_head_fn=='loss':
            loss = self.head.loss(head_embeddings, edgeindex.to(device), tail_embeddings, edge_label)
            return loss
        elif get_head_fn=='forward':
            return self.head.forward(head_embeddings, edgeindex.to(device), tail_embeddings)
    
        
metadata = train_data.metadata()
# add selfloops
for node_type in train_data.node_types:
    metadata[1].append((node_type, 'self_loop', node_type))    
    
out_channels = 256
hidden_channels = 256
num_heads = 8
num_layers = 2
pnorm = 2
head = 'TransE'
gnn = HGT(hidden_channels=out_channels, out_channels=out_channels, num_heads=num_heads, num_layers=num_layers, node_types=train_data.node_types, data_metadata=metadata)

model = Model(gnn, head=head, node_types=metadata[0], edge_types=metadata[1], ggn_output_dim=out_channels, pnorm=pnorm)
#torch_geometric.compile(model, dynamic=True)
model.to(device)


In [None]:
# init model
from tqdm.auto import tqdm
from datetime import datetime
batch_size = 32
num_node_types = len(train_data.node_types)
print('num_node_types', num_node_types)
one_hop_neighbors = (20 * batch_size)//num_node_types # per relationship type
two_hop_neighbors = (20 * 8 * batch_size)//num_node_types # per relationship type
three_hop_neighbors = (20 * 8 * 3 * batch_size)//num_node_types # per relationship type
num_neighbors = [one_hop_neighbors, two_hop_neighbors] # three_hop_neighbors
# num_neighbors [36, 363, 1454]

print('num_neighbors', num_neighbors)
print('avg_num_neighbors', [num_neighbors[0]/batch_size,num_neighbors[1]/batch_size,  num_neighbors[2]/batch_size if len(num_neighbors)==3 else 0 ])

sampler_for_init = equal_edgeweight_hgt_sampler(train_data, batch_size, True, 'triplet', 1, num_neighbors, num_workers=0, prefetch_factor=None, pin_memory=True) 



In [None]:
# have to init until all node types are present
model.eval()
for i, (same_nodetype, target_edge_type, minibatch) in tqdm(enumerate(sampler_for_init)):

    # batching is different depending on if node types in edge are same or different
    print(target_edge_type)
    if same_nodetype:
        
        minibatch, edge_label_index, edge_label, input_edge_ids, global_node_ids = minibatch
        #print(minibatch['jobs'].x.device, edge_label_index.device, edge_label.device)
        loss = model(minibatch.to(device), target_edge_type, edge_label_index.to(device), edge_label.to(device))
        #loss, pos, neg = model(minibatch, target_edge_type, edge_label_index, edge_label)
    else:
        try:
            minibatchpart1, minibatchpart2, edge_label_index, edge_label, input_edge_id, global_src_ids, global_dst_ids = minibatch
        except ValueError as err:
            print('value error', err)
            continue # for skill qual edges sometimes for some reason only 5 instead of 7 elements returned
        #print(minibatchpart1['jobs'].device, minibatchpart2['jobs'].device, edge_label_index.device, edge_label.device)
        loss = model(minibatchpart1.to(device), target_edge_type, edge_label_index.to(device), edge_label.to(device), minibatchpart2.to(device))

In [None]:
# get parameters form statedcit
model.load_state_dict = torch.load(ROOT_FOLDER+'models/learningall_hgt_20231104_123858_margin05_pnorm2_llr0.0002_bs32_neighbors_106_853_head_TransE_hiddenchannels_256_outchannels_256_numheads_8_numlayers_2/')

In [21]:
# evaluate model on test set
# init dimensions of model by training it
from tqdm.auto import tqdm
from datetime import datetime
import numpy as np 

def evaluate(model, n_negatives, model_folder, on='test'):
    num_neighbors = [int(x) for x in model_folder.split('neighbors_')[1].split('head')[0].strip('_').split('_')]
    
    model.to(device)
    mrrs = []
   
    
    #test_sampler = get_hgt_linkloader(test_data, input_edgetype, 1, False, 'triplet', n_negatives, num_neighbors, num_workers=0, prefetch_factor=None, pin_memory=True)
    



    # test data
    train_data_text, val_data_text, test_data_text = get_datasets(get_edge_attr=False, filename=ROOT_FOLDER+'HeteroData_Learnings_normalized_triangles_withadditionaldata_v1.pt', filter_top_k=True, top_k=50, remove_text_attr=False)
    input_edgetype = ('people', 'rev_course_and_programs_student', 'courses_and_programs')
    if on=='test':
        add_reverse_edge_original_attributes_and_label_inplace(test_data['courses_and_programs', 'course_and_programs_student', 'people'], reverse_edge=test_data[input_edgetype] )
        test_sampler = get_hgt_linkloader(test_data, input_edgetype, 1, False, 'triplet', n_negatives, num_neighbors, num_workers=0, prefetch_factor=None, pin_memory=True)
    elif on =='train':
        add_reverse_edge_original_attributes_and_label_inplace(train_data['courses_and_programs', 'course_and_programs_student', 'people'], reverse_edge=train_data[input_edgetype] )
        test_sampler = get_hgt_linkloader(train_data, input_edgetype, 1, False, 'triplet', n_negatives, num_neighbors, num_workers=0, prefetch_factor=None, pin_memory=True)
    # test data
    model.eval()
    mrr_per_edge_type = {}
    rank_per_edge_type = {}
    best_mrr, best_differences, best_src_nodes, best_dst_nodes = 0, None, None, None
    for i, (same_nodetype, target_edge_type, minibatch) in tqdm(enumerate(sampler_for_init)):
        if i==1000:
            break
        
        print(target_edge_type)
        if same_nodetype:
            minibatch, edge_label_index, edge_label, input_edge_ids, global_node_ids = minibatch
            #print(minibatch['jobs'].x.device, edge_label_index.device, edge_label.device)
            differences = model(minibatch.to(device), target_edge_type, edge_label_index.to(device), edge_label.to(device))
            #loss, pos, neg = model(minibatch, target_edge_type, edge_label_index, edge_label)
        else:
            try:
                minibatchpart1, minibatchpart2, edge_label_index, edge_label, input_edge_id, global_src_ids, global_dst_ids = minibatch
            except ValueError as err:
                print('value error', err)
                continue # for skill qual edges sometimes for some reason only 5 instead of 7 elements returned
            #print(minibatchpart1['jobs'].device, minibatchpart2['jobs'].device, edge_label_index.device, edge_label.device)
            differences = model(minibatchpart1.to(device), target_edge_type, edge_label_index.to(device), edge_label.to(device), minibatchpart2.to(device))
        #loss.backward()

        # define mrr with differences and labels
        # get rank of positive edge from tensor, positive edge is first in batch

        differences = -1* differences.cpu().detach().numpy()
        edge_label = edge_label.cpu().detach().numpy()
        rank = (differences < differences[0]).sum()

        # reciprocal
        mrr = 1/(rank+1)
        if mrr > best_mrr:
            best_mrr = mrr
            best_differences = differences
            best_src_nodes = global_src_nodes
            best_dst_nodes = global_dst_nodes
            print('new best mrr', best_mrr)
            
    
        mrrs.append(mrr)
        mrr_per_edge_type[target_edge_type] = mrr_per_edge_type.get(target_edge_type, []) + [mrr]
        rank_per_edge_type[target_edge_type] = rank_per_edge_type.get(target_edge_type, []) + [rank]
        print('mrr',target_edge_type,mrr)
        print(rank)
        
    print('mean mrr',np.mean(mrrs))
    print('mean rank',1/np.mean(mrrs))
    # mean rank
    model.to('cpu')


In [30]:
torch.cat((test_data['courses_and_programs', 'course_and_programs_student', 'people'].edge_index[0,:].unique(),test_data['courses_and_programs', 'course_and_programs_student', 'people'].edge_label_index[0,:].unique())).unique().shape

torch.Size([32494])

In [57]:
a = set(test_data['courses_and_programs', 'course_and_programs_student', 'people'].edge_label_index[0,:].unique().tolist())
a2 = set(train_data['courses_and_programs', 'course_and_programs_student', 'people'].edge_label_index[0,:].unique().tolist())

In [42]:
b = set(torch.cat((train_data['courses_and_programs', 'course_and_programs_student', 'people'].edge_index[0,:].unique(),train_data['courses_and_programs', 'course_and_programs_student', 'people'].edge_label_index[0,:].unique())).tolist())

In [47]:
len(a.intersection(b))

9335

In [62]:
# evaluate(model_3layereuclid, 10000, model_folder3layereuclid, on='test')
# evaluate(model_2layereuclid, 10000, model_folder2layereuclid, on='test')
evaluate(model_2layerp1, 10000, model_folder2layerp1, on='test')

size of dataset on disk:  2.279761238 gb
loading saved heterodata object
for skill job edges keep top k edges per job, k is  50
keep tensor(1208056) of total 16289586


1it [00:06,  6.70s/it]

new best mrr 0.0024752475247524753


3it [00:12,  3.57s/it]

new best mrr 0.05


4it [00:14,  3.11s/it]

new best mrr 1.0


1000it [45:27,  2.73s/it]


mean mrr 0.17139854320945072
mean rank 5.834355305914065


In [None]:
evaluate(model_3layereuclid, 10000, model_folder3layereuclid, on='train')
evaluate(model_2layereuclid, 10000, model_folder2layereuclid, on='train')
evaluate(model_2layerp1, 10000, model_folder2layerp1, on='train')

size of dataset on disk:  2.279761238 gb
loading saved heterodata object
for skill job edges keep top k edges per job, k is  50
keep tensor(1208056) of total 16289586


1it [00:15, 15.19s/it]

new best mrr 8.13206473123526e-05


2it [00:23, 10.85s/it]

new best mrr 0.000500751126690035


4it [00:36,  8.10s/it]

new best mrr 0.2


8it [01:00,  6.52s/it]

new best mrr 1.0


200it [23:41,  7.11s/it]


mean mrr 0.13983406037281088
mean rank 7.151333497245987
size of dataset on disk:  2.279761238 gb
loading saved heterodata object
for skill job edges keep top k edges per job, k is  50
keep tensor(1208056) of total 16289586


1it [00:09,  9.78s/it]

new best mrr 0.0001484560570071259


2it [00:13,  6.03s/it]

new best mrr 0.005494505494505495


4it [00:18,  3.63s/it]

new best mrr 0.16666666666666666


8it [00:27,  2.44s/it]

new best mrr 0.3333333333333333


31it [01:25,  2.68s/it]

new best mrr 1.0


200it [09:34,  2.87s/it]


mean mrr 0.08684513895969696
mean rank 11.514749264942502
size of dataset on disk:  2.279761238 gb
loading saved heterodata object
for skill job edges keep top k edges per job, k is  50
keep tensor(1208056) of total 16289586


1it [00:05,  5.00s/it]

new best mrr 0.00016452780519907864


2it [00:07,  3.52s/it]

new best mrr 0.02127659574468085


4it [00:12,  2.73s/it]

new best mrr 0.1111111111111111


8it [00:20,  2.25s/it]

new best mrr 1.0


200it [10:11,  3.06s/it]


mean mrr 0.11901925778350261
mean rank 8.402001647657823


In [None]:
evaluate(modele4, 50000, model_foldere4)

In [None]:
evaluate(modele4euclidean, 50000, model_foldere4euclidean)

In [None]:
evaluate(modele5, 50000, model_foldere5, on='train')

In [None]:
evaluate(modele4, 50000, model_foldere4, on='train')

In [None]:
evaluate(modele4euclidean, 30000, model_foldere4euclidean, on='train')