In [None]:
import pandas as pd
import pykeen
import numpy as np
from pykeen.triples import TriplesFactory
from pykeen.pipeline import pipeline
from sklearn.metrics.pairwise import cosine_similarity
import concurrent.futures
from math import log
import re
import gensim
import nltk
import string
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')
import transformers
from transformers import BertTokenizer, BertModel
from sentence_transformers import SentenceTransformer
import torch
from torch import nn
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import torch.nn.functional as F
from torch.utils.data import DataLoader
import requests
import time
import requests, tempfile
from io import BytesIO
from transformers import pipeline
import ast

INFO:pykeen.utils:Using opt_einsum
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [None]:
# Load the DBpedia triples: tab-separated rows named head / relation / tail.
df = pd.read_csv(
    './dbpedia_irish_places_triples_v2.tsv',
    sep="\t",
    names=['head', 'relation', 'tail'],
)

In [None]:
# Preview the first rows to sanity-check the parsed head / relation / tail columns.
df.head()

Unnamed: 0,head,relation,tail
0,"http://dbpedia.org/resource/Holywell,_Swords",rdfs:label,"Holywell, Swords"
1,"http://dbpedia.org/resource/Holywell,_Swords",dbo:abstract,Holywell (Irish: Tobar Naofa) is a neighbourho...
2,"http://dbpedia.org/resource/Holywell,_Swords",dbo:country,http://dbpedia.org/resource/Republic_of_Ireland
3,"http://dbpedia.org/resource/Holywell,_Swords",dcterms:subject,http://dbpedia.org/resource/Category:Neighbour...
4,http://dbpedia.org/resource/Category:Neighbour...,skos:broader,"http://dbpedia.org/resource/Category:Swords,_D..."


In [None]:
# Distinct values of the 'place' column; these are the entities we want embeddings for.
populated_places = (
    pd.read_csv('Irish_populated_places.csv')
    ['place']
    .unique()
)

In [None]:
def generate_embeddings(triples_file, target_entities, model_name='RotatE',
                        embedding_dim=100, num_epochs=300, learning_rate=0.1,
                        random_seed=None):
    """Train a PyKEEN model on the sub-graph around ``target_entities`` and return their embeddings.

    The triples file is read as tab-separated ``head, relation, tail`` rows.
    The training set is the de-duplicated union of:

    * triples in which a target entity is the head or the tail,
    * triples touching a category reached through ``dcterms:subject`` from
      those triples, or a ``skos:broader`` parent of such a category,
    * ``rdf:type`` triples whose head appears as a tail of the direct triples.

    Args:
        triples_file: Path to the TSV file of triples.
        target_entities: Iterable of entity identifiers to embed.
        model_name: PyKEEN model name passed to the pipeline.
        embedding_dim: Entity embedding dimension handed to the model.
        num_epochs: Number of training epochs.
        learning_rate: Optimizer learning rate.
        random_seed: Optional seed forwarded to the PyKEEN pipeline so that a
            run can be reproduced; ``None`` keeps PyKEEN's default behaviour.

    Returns:
        A dict mapping every target entity known to the trained model to its
        embedding as a numpy array (complex-valued for the RotatE run shown in
        this notebook), or ``None`` when no triple matches the targets.
    """
    df = pd.read_csv(triples_file, sep="\t", names=['head', 'relation', 'tail'])

    # Triples where a target entity is on either side.
    direct_triples = df[df['head'].isin(target_entities) | df['tail'].isin(target_entities)]

    # Categories of those triples, plus one level of parent categories.
    is_subject = direct_triples['relation'] == 'dcterms:subject'
    subjects = set(direct_triples.loc[is_subject, 'tail'])
    is_broader = df['head'].isin(subjects) & (df['relation'] == 'skos:broader')
    broader_subjects = set(df.loc[is_broader, 'tail'])

    all_subjects = subjects | broader_subjects
    subject_triples = df[df['head'].isin(all_subjects) | df['tail'].isin(all_subjects)]
    type_relations = df[(df['relation'] == 'rdf:type') & (df['head'].isin(direct_triples['tail']))]

    combined_triples = pd.concat([direct_triples, subject_triples, type_relations]).drop_duplicates()

    if combined_triples.empty:
        print("No relevant triples found for the given entities.")
        return None

    # NOTE(review): literal-valued triples (e.g. rdfs:label, dbo:abstract) are
    # kept, so their text becomes an "entity" in the model - confirm intended.
    triples_array = combined_triples[['head', 'relation', 'tail']].values
    # Only the count is reported; the raw triples array is no longer dumped.
    print(f"Number of related triples found: {len(triples_array)}")

    triples_factory = TriplesFactory.from_labeled_triples(triples_array)

    model_kwargs = dict(embedding_dim=embedding_dim)
    if model_name == 'TransH':
        # NOTE(review): PyKEEN documents `relation_dim` for TransR/TransD;
        # verify that TransH accepts it before relying on this branch.
        model_kwargs['relation_dim'] = 50  # TransH requires relation_dim

    # The same factory is used for training, validation and testing, so the
    # evaluation PyKEEN runs here is on training data; only the learned
    # embeddings are consumed below.
    result = pykeen.pipeline.pipeline(
        model=model_name,
        training=triples_factory,
        validation=triples_factory,
        testing=triples_factory,
        training_kwargs=dict(num_epochs=num_epochs, use_tqdm_batch=False),
        optimizer_kwargs=dict(lr=learning_rate),
        model_kwargs=model_kwargs,
        random_seed=random_seed,
    )

    model = result.model

    # Full entity embedding matrix, one row per entity id.
    entity_embeddings = model.entity_representations[0](indices=None).detach().cpu().numpy()

    entity_embedding_dict = {}
    for entity in target_entities:
        entity_id = triples_factory.entity_to_id.get(entity)
        if entity_id is not None:
            entity_embedding = entity_embeddings[entity_id]
            entity_embedding_dict[entity] = entity_embedding
            print(f"Embedding for {entity}: {entity_embedding[:5]}...")
        else:
            print(f"Entity {entity} not found in the trained model.")

    return entity_embedding_dict


# Train RotatE on the triples around the populated places and keep their embeddings.
entity_embeddings = generate_embeddings(
    "dbpedia_irish_places_triples_v2.tsv",
    populated_places,
    model_name='RotatE',
)

INFO:pykeen.pipeline.api:Using device: None
INFO:pykeen.nn.representation:Inferred unique=False for Embedding()
INFO:pykeen.nn.representation:Inferred unique=False for Embedding()


Number of related triples found: 2825
[['http://dbpedia.org/resource/Holywell,_Swords' 'rdfs:label'
  'Holywell, Swords']
 ['http://dbpedia.org/resource/Holywell,_Swords' 'dbo:abstract'
  "Holywell (Irish: Tobar Naofa) is a neighbourhood near Swords, Dublin, Ireland. Developed and marketed in phases and under various names, Feltrim Hall, Gorse Hill, Abbey Stone, Holywell and The Meadows, all the street names in the community include the common name Holywell e.g. 'Holywell Drive'. It is the eastern part of the census town of Kinsealy–Drinan, separated from the western part by the M1 motorway."]
 ['http://dbpedia.org/resource/Holywell,_Swords' 'dbo:country'
  'http://dbpedia.org/resource/Republic_of_Ireland']
 ...
 ['http://dbpedia.org/resource/City_status_in_Ireland' 'rdf:type'
  'http://dbpedia.org/ontology/Place']
 ['http://dbpedia.org/resource/Capital_city' 'rdf:type'
  'http://dbpedia.org/ontology/City']
 ['http://dbpedia.org/resource/Capital_city' 'rdf:type'
  'http://dbpedia.org/o



Training epochs on cpu:   0%|          | 0/300 [00:00<?, ?epoch/s]



Evaluating on cpu:   0%|          | 0.00/2.83k [00:00<?, ?triple/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 37.87s seconds


Embedding for http://dbpedia.org/resource/Holywell,_Swords: [ 1.999136  +2.0390997j  0.84975123+2.4695225j  1.22292   +1.6114016j
 -3.976648  +1.3584534j  0.71216094+1.1501954j]...
Embedding for http://dbpedia.org/resource/Leyny: [ 4.182246 +4.1992292j   3.3957686+2.2061894j  -0.9555179-3.0078902j
  3.0175092+0.66954017j -2.1532474-4.3345103j ]...
Embedding for http://dbpedia.org/resource/County_Cavan: [ 0.02907994-0.02442312j  1.6273282 -0.28416654j  2.1518743 +2.1167235j
 -2.216592  -1.6800848j   3.5918205 +1.4456336j ]...
Embedding for http://dbpedia.org/resource/County_Cork: [-1.9195461-2.0918083j   1.1118373-0.21020867j  2.9342036+1.8075879j
 -5.074683 +1.1959394j   1.5442896+2.4558487j ]...
Embedding for http://dbpedia.org/resource/County_Galway: [-4.067013 +0.48516747j  5.7840443-3.7350833j   3.023813 +1.2288835j
 -4.2155676-2.8274267j   2.7771816+1.7734277j ]...
Embedding for http://dbpedia.org/resource/County_Kerry: [-6.15366   +2.6632297j  3.3320742 -2.1827073j  0.49367145-1.

In [None]:
def compute_similar_places(entity_repr, top_k=10):
    """Return, for every entity, its most cosine-similar other entities.

    Args:
        entity_repr: Mapping from entity to a 1-D floating-point torch tensor;
            all tensors must have the same length.
        top_k: Maximum number of neighbours to return per entity.

    Returns:
        Dict mapping each entity to a list of at most ``top_k`` other
        entities, ordered from most to least similar. With fewer than
        ``top_k + 1`` entities the list simply contains every other entity.
    """
    entities = list(entity_repr.keys())
    if not entities or top_k <= 0:
        # Nothing to rank (torch.stack would raise on an empty list).
        return {e: [] for e in entities}

    all_embeddings = torch.stack([entity_repr[e] for e in entities])  # [N, D]

    # Cosine similarity as a product of unit vectors: this yields the [N, N]
    # matrix directly instead of broadcasting [N, 1, D] against [1, N, D].
    unit = F.normalize(all_embeddings, p=2, dim=-1, eps=1e-8)
    sims = unit @ unit.transpose(0, 1)

    # +1 leaves room for the entity itself; clamp so topk never asks for more
    # columns than exist.
    k = min(top_k + 1, len(entities))
    top_indices = torch.topk(sims, k=k, dim=-1).indices.tolist()

    similar_places_dict = {}
    for i, e in enumerate(entities):
        similar_places_dict[e] = [entities[j] for j in top_indices[i] if j != i][:top_k]
    return similar_places_dict

In [None]:
def _as_real_tensor(vec):
    """Convert one embedding to a float32 tensor without losing information.

    Complex-valued embeddings (such as the RotatE vectors printed above) are
    laid out as ``[real parts, imaginary parts]``. Casting them straight to
    float32 would keep only the real part and discard half of each vector.
    Real-valued embeddings are converted unchanged.
    """
    arr = np.asarray(vec)
    if np.iscomplexobj(arr):
        arr = np.concatenate([arr.real, arr.imag], axis=-1)
    return torch.tensor(arr, dtype=torch.float32)


# entity -> float32 tensor, ready for cosine-similarity ranking.
entity_embeddings_tensor = {
    k: _as_real_tensor(v)
    for k, v in entity_embeddings.items()
}

  k: torch.tensor(v, dtype=torch.float32)


In [None]:
# Alias for the entity -> tensor mapping; this is the name passed to compute_similar_places.
entity_repr = entity_embeddings_tensor

In [None]:
# Ground truth: for each first entity, collect the list of second entities paired with it.
ground_truth = pd.read_csv("./ground_truth.csv")
y_true_dict = (
    ground_truth
    .groupby('geographical_entity_1')['geographical_entity_2']
    .apply(list)
    .to_dict()
)

In [None]:
# Ten nearest neighbours (by cosine similarity) for every embedded entity.
similar_places = compute_similar_places(
    entity_repr,
    top_k=10,
)

In [None]:
def evluation(k, y_true_dict, similar_places_dict):
    """Score top-``k`` neighbour lists against ground truth and print the averages.

    For every key of ``y_true_dict`` the first ``k`` predictions are compared
    with the set of relevant items, and MAP, NDCG, recall, precision, hit rate
    and MRR are averaged over all places that could be scored.

    A place is counted as invalid (and skipped) when it has no predictions,
    fewer than ``k`` predictions, or an empty ground-truth list.

    Args:
        k: Cut-off rank.
        y_true_dict: Mapping place -> iterable of relevant places. Duplicate
            entries are collapsed so they do not inflate the denominators.
        similar_places_dict: Mapping place -> ranked list of predicted places.

    Returns:
        Dict with the averaged metrics in percent under the keys ``"MAP"``,
        ``"NDCG"``, ``"Recall"``, ``"Precision"``, ``"Hits"`` and ``"MRR"``
        (``nan`` when no place could be scored), plus ``"invalid_places"``
        holding the number of skipped places. The same figures are printed.
    """
    invalid_places = []
    precisions, recalls, ndcgs, hits, map_scores, mrrs = [], [], [], [], [], []

    for pid, relevant_items in y_true_dict.items():
        relevant = set(relevant_items)
        predictions = similar_places_dict.get(pid)
        # An empty ground-truth set would divide by zero below.
        if predictions is None or len(predictions) < k or not relevant:
            invalid_places.append(pid)
            continue

        pred_list = predictions[:k]
        if len(pred_list) == 0:
            continue

        # DCG with binary gains and a log2 position discount.
        dcg = 0.0
        hit_num = 0.0
        for i, p in enumerate(pred_list):
            if p in relevant:
                dcg += 1. / (log(i + 2) / log(2))
                hit_num += 1

        # Ideal DCG: every slot filled with a relevant item, as far as possible.
        idcg = 0.0
        for i in range(min(len(relevant), len(pred_list))):
            idcg += 1. / (log(i + 2) / log(2))

        ndcg = dcg / idcg
        recall = hit_num / len(relevant)
        precision = hit_num / len(pred_list)
        hit = 1.0 if hit_num > 0.0 else 0.0

        # Average precision; a prediction repeated later in the list is only
        # credited the first time it appears.
        num_hits = 0.0
        score = 0.0
        for i, p in enumerate(pred_list):
            if p in relevant and p not in pred_list[:i]:
                num_hits += 1.0
                score += num_hits / (i + 1.0)
        map_score = score / min(len(relevant), k)

        # Reciprocal rank of the first relevant prediction (0 when none).
        rr = 0.0
        for i, p in enumerate(pred_list):
            if p in relevant:
                rr = 1.0 / (i + 1.0)
                break

        ndcgs.append(ndcg)
        recalls.append(recall)
        precisions.append(precision)
        hits.append(hit)
        map_scores.append(map_score)
        mrrs.append(rr)

    def _mean_pct(values):
        # np.mean([]) warns and yields nan; produce the nan without the warning.
        return float(np.mean(values)) * 100 if values else float('nan')

    avg_precision = _mean_pct(precisions)
    avg_recall = _mean_pct(recalls)
    avg_ndcg = _mean_pct(ndcgs)
    avg_hit = _mean_pct(hits)
    avg_map = _mean_pct(map_scores)
    avg_mrr = _mean_pct(mrrs)

    print("invalid places:", str(len(invalid_places)))
    print('MAP={:.3f} | NDCG={:.3f} |  Recall={:.3f} | Precision={:.3f} | Hits={:.3f} | MRR={:.3f}'.format(
            avg_map, avg_ndcg, avg_recall, avg_precision, avg_hit, avg_mrr))

    return {
        "MAP": avg_map,
        "NDCG": avg_ndcg,
        "Recall": avg_recall,
        "Precision": avg_precision,
        "Hits": avg_hit,
        "MRR": avg_mrr,
        "invalid_places": len(invalid_places),
    }

In [None]:
# KG embedding: RotatE
evluation(
    k=10,
    y_true_dict=y_true_dict,
    similar_places_dict=similar_places,
)

invalid places: 0
MAP=22.508 | NDCG=37.191 |  Recall=17.088 | Precision=33.345 | Hits=94.198 | MRR=66.128


In [None]:
#kg embedding Tucker
# Train TuckER and rebuild the neighbour lists in this cell. Previously the cell
# re-scored `similar_places`, which on a fresh Restart & Run All still holds the
# RotatE neighbours, so it reported RotatE numbers under a Tucker label.
tucker_embeddings = generate_embeddings(
    "dbpedia_irish_places_triples_v2.tsv",
    populated_places,
    model_name='TuckER',
)
tucker_repr = {
    k: torch.tensor(v, dtype=torch.float32)
    for k, v in tucker_embeddings.items()
}
tucker_similar_places = compute_similar_places(tucker_repr, top_k=10)
evluation(k=10, y_true_dict=y_true_dict, similar_places_dict=tucker_similar_places)

invalid places: 0
MAP=21.487 | NDCG=34.600 |  Recall=15.855 | Precision=31.058 | Hits=93.174 | MRR=63.278
