# Dataset loading from txt


In [1]:
from pathlib import Path

In [2]:
# Relative locations of the WN18RR train / test / validation split files.
_train_path = Path("WN18RR/train.txt")
_test_path = Path("WN18RR/test.txt")
_valid_path = Path("WN18RR/valid.txt")

In [3]:
def load_dataset(path:Path) -> list[tuple]:
    """
    Parse a tab-separated triple file into a list of (head, relation, tail) tuples.

    :param path: file holding one ``head<TAB>relation<TAB>tail`` triple per line.
    :returns: list of ``(head, relation, tail)`` string tuples, in file order.
    :raises ValueError: if a non-blank line does not consist of exactly three
        tab-separated fields.
    """
    datalist = []
    # Explicit encoding so parsing does not depend on the platform default.
    with open(path, "r", encoding="utf-8") as f:
        for line_no, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline at EOF) carry no triple.
                continue
            fields = line.split("\t")
            if len(fields) != 3:
                raise ValueError(
                    f"{path}:{line_no}: expected 3 tab-separated fields, got {len(fields)}"
                )
            datalist.append(tuple(fields))

    return datalist

In [4]:
# Parse every split file into an in-memory list of (head, relation, tail) tuples.
train_dataset = load_dataset(path=_train_path)
test_dataset = load_dataset(path=_test_path)
valid_dataset = load_dataset(path=_valid_path)

# Data Processing and Visualisation
Understanding spread of data, edge types, nodes, etc.

In [5]:
import pandas as pd
import matplotlib.pyplot as plt

In [6]:
def split_entity_relation(dataset: list) -> tuple[list, list]:
    """
    Collect the unique entities and relations that occur in a list of triples.

    Both results are sorted so that the position of every entity / relation is
    deterministic, which is needed when they are later used to initialise
    deep learning models.

    :param dataset: list of ``(head, relation, tail)`` tuples.
    :returns: ``(entities, relations)`` - two sorted, duplicate-free lists;
        entities are the union of all heads and all tails.
    """
    entity_set, relation_set = set(), set()
    # Single pass over the triples instead of one pass per set.
    for head, relation, tail in dataset:
        entity_set.add(head)
        entity_set.add(tail)
        relation_set.add(relation)
    return sorted(entity_set), sorted(relation_set)

# Sorted unique entities and relation types of each split.
entities_train, relation_train = split_entity_relation(dataset=train_dataset)
entities_valid, relation_valid = split_entity_relation(dataset=valid_dataset)
entities_test, relation_test = split_entity_relation(dataset=test_dataset)

In [7]:
print(f"number of unique nodes in train: {len(entities_train)}")
print(f"number of unique nodes in valid: {len(entities_valid)}")
print(f"number of unique nodes in test : {len(entities_test)}")

# Build each node set once; the overlap figures below reuse them.
_nodes_train = set(entities_train)
_nodes_valid = set(entities_valid)
_nodes_test = set(entities_test)

# Validation nodes shared with train, and the share of validation nodes train never saw.
_shared_train_valid = len(_nodes_train & _nodes_valid)
_unseen_valid_pct = (len(_nodes_valid) - _shared_train_valid) / len(_nodes_valid) * 100
print(f"number of intersecting nodes in train + valid: {_shared_train_valid}"
      f"\t\tunseen percentage: "
      f"{_unseen_valid_pct:.2f}%")

# Same figures for the test split.
_shared_train_test = len(_nodes_train & _nodes_test)
_unseen_test_pct = (len(entities_test) - _shared_train_test) / len(_nodes_test) * 100
print(f"number of intersecting nodes in train + test : {_shared_train_test}"
      f"\t\tunseen percentage: "
      f"{_unseen_test_pct:.2f}%")

_shared_valid_test = len(_nodes_valid & _nodes_test)
print(f"number of intersecting nodes in valid + test : {_shared_valid_test}")


number of unique nodes in train: 40559
number of unique nodes in valid: 5173
number of unique nodes in test : 5323
number of intersecting nodes in train + valid: 4975		unseen percentage: 3.83%
number of intersecting nodes in train + test : 5114		unseen percentage: 3.93%
number of intersecting nodes in valid + test : 1026


1. Overlap ensures inference is possible
If the validation/test nodes had no overlap with the training nodes, we would have no trained embeddings for them, so inference on those nodes would be impossible.


2.	Spread ensures challenge:
The few unseen nodes prevent the task from being trivial memorization. They push the model to rely on neighborhood structure, not just node IDs.

In [8]:
# Number of distinct relation (edge) types per split.
_n_relations_train = len(set(relation_train))
_n_relations_valid = len(set(relation_valid))
_n_relations_test = len(set(relation_test))

print(f"number of unique edges (train): {_n_relations_train}")
print(f"number of unique edges (valid): {_n_relations_valid}")
print(f"number of unique edges (test) : {_n_relations_test} ")

number of unique edges (train): 11
number of unique edges (valid): 11
number of unique edges (test) : 11 


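This is what we want, as no model can make inferences about an edge type it has never seen during training.

Hence, having the same edge types in every split ensures proper inference. (Note that the counts above only show that the number of edge types matches, not that the types themselves are identical.)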

In [9]:
import random
# random.choice only ever picks an existing element. Indexing with
# random.randint(0, len(seq)) is unsafe: randint includes its upper bound,
# so it can return len(seq) and raise IndexError.
_random_sample_entity = random.choice(entities_train)
print(f"sample of entity: {_random_sample_entity}")
print(f"type of entity  : {type(_random_sample_entity)}")

print("\n")
_random_sample_relation = random.choice(relation_train)
print(f"sample of relation: {_random_sample_relation}")
print(f"type of relation  : {type(_random_sample_relation)}")
print("\nWe want to map these strings to integer indexes for training as lightGCN expects node IDs as continuous integers.")

sample of entity: 01858441
type of entity  : <class 'str'>


sample of relation: _also_see
type of relation  : <class 'str'>

We want to map these strings to integer indexes for training as lightGCN expects node IDs as continuous integers.


In [30]:
def map_entity_values(unmapped_values: list) -> dict[str, int]:
    """
    Assign a consecutive integer index (0, 1, 2, ...) to every distinct value.

    Indices follow first-occurrence order. Repeated values are collapsed before
    indexing, so the indices always form the contiguous range
    ``0 .. len(result) - 1`` (without this, a repeated value would leave a gap).

    :param unmapped_values: values (e.g. entity IDs) to index.
    :returns: dict mapping each distinct value to its index.
    """
    # dict.fromkeys de-duplicates while keeping first-occurrence order.
    distinct_values = dict.fromkeys(unmapped_values)
    return {val: i for i, val in enumerate(distinct_values)}

print("Mapping of entities to consecutive values")
# Index the union of all three splits so that every entity gets exactly one ID.
all_entities = sorted(set(entities_test) | set(entities_train) | set(entities_valid))
all_entities_mapped = map_entity_values(unmapped_values=all_entities)
print(len(all_entities_mapped))


Mapping of entities to consecutive values
40943


# Prepare Data to Pass into LightGCN

```LightGCN``` expects 2 types of data when used on a heterogeneous graph:


```x_dict```   : the dictionary of trainable embeddings, keyed by node type


```edge_index_dict```  : which is the ```{('entity', relation, 'entity'): tensor[[src],[dst]]}``` 
 

We also add an inverse relation type for each of the 11 relations that already exist.
This allows information to be passed along paths where it originally could not flow.
An edge A -> B only goes one way, so without the inverse edge B -> A the information flowing in the opposite direction would be missed.

In [44]:
from torch_geometric.data import HeteroData
import torch
from collections import defaultdict
from typing import List, Tuple, Dict

# Graph container; the per-relation edge indices are attached to it further below.
data = HeteroData()

# One 'entity' node per entry of the entity -> index mapping.
data['entity'].num_nodes = len(all_entities_mapped)

# Build one edge-index tensor per relation type from the (h, r, t) triples.
def build_edge_index_dict(
        dataset: List[Tuple[str, str, str]],  # list of (h, r, t) triples
        mapped_entity: Dict[str, int],
        add_reverse: bool = True
) -> Dict[Tuple[str, str, str], torch.Tensor]:
    """
    Group triples by relation and turn each group into an edge-index tensor.

    :param dataset: list of ``(head, relation, tail)`` triples.
    :param mapped_entity: maps every head / tail occurring in ``dataset`` to its
        integer node index.
    :param add_reverse: if True, every triple additionally contributes a
        ``(tail, head)`` edge stored under the relation name ``relation + '_inv'``.
    :returns: dict keyed by ``('entity', relation, 'entity')``; each value is a
        ``torch.long`` tensor of shape ``[2, num_edges]`` whose first row holds
        the source indices and whose second row holds the destination indices.
    :raises KeyError: if a head or tail is missing from ``mapped_entity``.
    """
    relation_dict = defaultdict(list)
    for h, r, t in dataset:
        src, dst = mapped_entity[h], mapped_entity[t]
        # forward edge -> relation : (src, dst)
        relation_dict[r].append((src, dst))

        # reverse edge -> relation_inv : (dst, src)
        if add_reverse:
            relation_dict[r + '_inv'].append((dst, src))

    edge_index_dict = {}
    for r, pairs in relation_dict.items():
        # Node indices must be integers: torch.Tensor(list) would silently
        # create float32 values, so the dtype is requested explicitly.
        # pairs is [num_edges, 2]; transpose to the [2, num_edges] layout.
        edge_index = torch.tensor(pairs, dtype=torch.long).t().contiguous()
        edge_index_dict[('entity', r, 'entity')] = edge_index

    return edge_index_dict
        

# Edge indices are built from the training triples only, including inverse relations.
edge_index_dict = build_edge_index_dict(
    dataset=train_dataset,
    mapped_entity=all_entities_mapped,
    add_reverse=True,
)

# Register every ('entity', relation, 'entity') edge type on the graph object.
for rel, edge_index in edge_index_dict.items():
    data[rel].edge_index = edge_index

In [39]:
data

HeteroData(
  entity={ num_nodes=40943 },
  (entity, _hypernym, entity)={ edge_index=[2, 34796] },
  (entity, _hypernym_inv, entity)={ edge_index=[2, 34796] },
  (entity, _derivationally_related_form, entity)={ edge_index=[2, 29715] },
  (entity, _derivationally_related_form_inv, entity)={ edge_index=[2, 29715] },
  (entity, _instance_hypernym, entity)={ edge_index=[2, 2921] },
  (entity, _instance_hypernym_inv, entity)={ edge_index=[2, 2921] },
  (entity, _also_see, entity)={ edge_index=[2, 1299] },
  (entity, _also_see_inv, entity)={ edge_index=[2, 1299] },
  (entity, _member_meronym, entity)={ edge_index=[2, 7402] },
  (entity, _member_meronym_inv, entity)={ edge_index=[2, 7402] },
  (entity, _synset_domain_topic_of, entity)={ edge_index=[2, 3116] },
  (entity, _synset_domain_topic_of_inv, entity)={ edge_index=[2, 3116] },
  (entity, _has_part, entity)={ edge_index=[2, 4816] },
  (entity, _has_part_inv, entity)={ edge_index=[2, 4816] },
  (entity, _member_of_domain_usage, entity)={ 

Now that we have created and prepared the data that will be used for training,
we will create a trainable embedding for each node.

In [45]:
import torch
import torch.nn as nn
# Width of every trainable node embedding.
FEATURE_SIZE = 64

# One learnable row per indexed entity.
embedding_matrix = nn.Embedding(
    num_embeddings=len(all_entities_mapped),
    embedding_dim=FEATURE_SIZE,
)
# Xavier-uniform re-initialisation, done in place; as the cell's last
# expression, the returned weight is what gets displayed.
nn.init.xavier_uniform_(embedding_matrix.weight)

Parameter containing:
tensor([[-0.0091,  0.0020, -0.0089,  ..., -0.0081, -0.0003,  0.0105],
        [ 0.0114, -0.0030,  0.0113,  ...,  0.0075, -0.0035,  0.0006],
        [-0.0097,  0.0097,  0.0074,  ...,  0.0043,  0.0011,  0.0040],
        ...,
        [-0.0121,  0.0009, -0.0045,  ...,  0.0053, -0.0114, -0.0097],
        [-0.0005, -0.0017, -0.0017,  ..., -0.0057,  0.0111,  0.0085],
        [ 0.0114,  0.0028,  0.0108,  ...,  0.0002,  0.0118, -0.0076]],
       requires_grad=True)

# LightGCN Model definition

In [20]:
import torch
import torch.nn as nn
from torch_geometric.nn import LGConv, to_hetero

class LightGCN(nn.Module):
    def __init__(self):
        super().__init__()
        
