In [None]:
!pip install pykeen

Collecting pykeen
  Downloading pykeen-1.10.2-py3-none-any.whl.metadata (83 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/83.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m83.8/83.8 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting dataclasses-json (from pykeen)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting click-default-group (from pykeen)
  Downloading click_default_group-1.2.4-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting optuna>=2.0.0 (from pykeen)
  Downloading optuna-4.0.0-py3-none-any.whl.metadata (16 kB)
Collecting more-click (from pykeen)
  Downloading more_click-0.1.2-py3-none-any.whl.metadata (4.3 kB)
Collecting pystow>=0.4.3 (from pykeen)
  Downloading pystow-0.5.6-py3-none-any.whl.metadata (8.9 kB)
Collecting docdata (from pykeen)
  Downloading docdata-0.0.4-py3-none-any.whl.metadata (13 kB)
Collecting class-resolver>0.4.2 (from pykeen)

In [None]:
!pip install SPARQLWrapper

Collecting SPARQLWrapper
  Downloading SPARQLWrapper-2.0.0-py3-none-any.whl.metadata (2.0 kB)
Collecting rdflib>=6.1.1 (from SPARQLWrapper)
  Downloading rdflib-7.0.0-py3-none-any.whl.metadata (11 kB)
Collecting isodate<0.7.0,>=0.6.0 (from rdflib>=6.1.1->SPARQLWrapper)
  Downloading isodate-0.6.1-py2.py3-none-any.whl.metadata (9.6 kB)
Downloading SPARQLWrapper-2.0.0-py3-none-any.whl (28 kB)
Downloading rdflib-7.0.0-py3-none-any.whl (531 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m531.9/531.9 kB[0m [31m27.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading isodate-0.6.1-py2.py3-none-any.whl (41 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.7/41.7 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: isodate, rdflib, SPARQLWrapper
Successfully installed SPARQLWrapper-2.0.0 isodate-0.6.1 rdflib-7.0.0


In [None]:
import pandas as pd
import pykeen
import numpy as np
from pykeen.triples import TriplesFactory
from pykeen.pipeline import pipeline
from sklearn.metrics.pairwise import cosine_similarity
from SPARQLWrapper import SPARQLWrapper, JSON
import concurrent.futures
from math import log

INFO:pykeen.utils:Using opt_einsum


In [None]:
# Load the related-project pairs; later cells read its 'project' and 'subject'
# columns (as the project list and as the evaluation ground truth).
# NOTE(review): the file name is spelled 'releted' — confirm it matches the file on disk.
projects_has_related_projects = pd.read_csv('./projects_has_releted_projects.csv')

In [None]:
# Every project id mentioned on either side of a related-project pair.
# An id can appear in both columns, so deduplicate the concatenation as well;
# pd.unique keeps first-occurrence order, so the iteration order of distinct
# ids is unchanged while each project is fetched from the endpoint only once.
projects = pd.unique(
    np.concatenate([
        pd.unique(projects_has_related_projects['project']),
        pd.unique(projects_has_related_projects['subject']),
    ])
)

In [None]:
def get_projects_attributes(project_id):
  """Fetch the triples whose subject is ``project_id`` from the SPARQL endpoint.

  Rows whose object is linked to the subject through ``arkp:hasCollaborator``,
  ``arkp:hasOwner`` or ``arkp:hasReadOnlyUser`` are removed by the query's
  ``NOT EXISTS`` filter.

  Args:
    project_id: IRI of the subject; it is interpolated verbatim between angle
      brackets in the query, so it must be a trusted, well-formed IRI.

  Returns:
    A pandas DataFrame with ``subject``, ``predicate`` and ``object`` columns
    holding the ``value`` field of each binding. When the endpoint returns no
    bindings, an empty DataFrame with those three columns is returned.
  """
  # NOTE(review): the arkc/arkp prefix IRIs here differ from the ones used in
  # get_projects_text — confirm which namespace the endpoint data uses.
  sparql_query = """
      PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
      PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
      PREFIX arkc: <https://xxx.xxx.ie/Ontologies/ARKCube#>
      PREFIX arkp: <https://xxx.xxx.ie/Ontologies/ARKPlatform#>

      SELECT DISTINCT ?subject ?predicate ?object
      WHERE {
          ?subject ?predicate ?object .
          FILTER (?subject = <%(subject)s> && NOT EXISTS {
              ?subject (arkp:hasCollaborator | arkp:hasOwner | arkp:hasReadOnlyUser) ?object .
          })
      }
  """

  sparql = SPARQLWrapper("https://stagingark.xxx.ie/fuseki/ark_data/query")
  sparql.setQuery(sparql_query % {'subject': project_id})
  sparql.setReturnFormat(JSON)

  results = sparql.query().convert()
  bindings = results["results"]["bindings"]

  columns = ['subject', 'predicate', 'object']
  if not bindings:
    # An empty binding list would give a column-less frame and a KeyError
    # below; hand back an empty frame with the expected schema instead.
    return pd.DataFrame(columns=columns)

  df = pd.DataFrame(bindings)

  # Each cell is a SPARQL JSON term dict; keep only its 'value' entry.
  for column in columns:
    df[column] = df[column].apply(lambda x: x['value'])
  return df

In [None]:
def get_projects_text(project_id):
  """Fetch the triples of a project's analysis and questionnaire resources.

  The query only matches when ``project_id`` is typed ``arkc:Project``. It is a
  UNION of five branches, each of which binds a resource to ``?subject`` and
  returns all of that resource's triples:

  1. the project analyses themselves,
  2. their potential-loss / projected-gain / reliability / gain-achieved values,
  3. their outcome / mechanism / context values,
  4. answers to questions reached via ``hasQuestionnaire/^isQuestionOf``,
  5. answers to questions reached via ``hasQuestionnaire`` then ``hasQuestion``.

  Branches 4 and 5 additionally require the answer to have a
  ``arkc:textualDescription``.

  Args:
    project_id: IRI of the project; it is interpolated verbatim between angle
      brackets in the query, so it must be a trusted, well-formed IRI.

  Returns:
    A pandas DataFrame with ``subject``, ``predicate`` and ``object`` columns
    holding the ``value`` field of each binding. When the endpoint returns no
    bindings (for example because the id is not an ``arkc:Project``), an empty
    DataFrame with those three columns is returned.
  """
  sparql_query = """
      PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
      PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
      PREFIX arkc: <https://openark.xxx.ie/Ontologies/ARKCube#>
      PREFIX arkp: <https://openark.xxx.ie/Ontologies/ARKPlatform#>

  SELECT DISTINCT
  ?subject
  ?predicate
  ?object
  WHERE {
    <%(subject)s> a arkc:Project .
    BIND(<%(subject)s> AS ?project)
    {
      # ?pa does not have text or concept
      ?project arkc:hasProjectAnalysis ?range .
      ?range a ?field .
      ?range arkc:describesStage ?stage .
      BIND(?range AS ?subject)
      ?subject ?predicate ?object .
      OPTIONAL {
        ?range arkc:textualDescription ?textualDescription .
      }

      OPTIONAL {
        ?range arkc:hasRelatedConcept ?concept .
      }
    } UNION
    {
      ?project arkc:hasProjectAnalysis ?pa .
      ?pa arkc:describesStage ?stage .
      ?pa (arkc:hasPotentialLoss | arkc:hasProjectedGain | arkc:hasProjectedReliabilityOfGain | arkc:hasReliabilityOfGain | arkc:hasGainAchieved) ?range .
      ?range a ?field .


      BIND(?range AS ?subject)


      ?subject ?predicate ?object .
      OPTIONAL {
      ?range arkc:textualDescription ?textualDescription .
      }

      OPTIONAL {
        ?range arkc:hasRelatedConcept ?concept .
      }
    } UNION
    {
      ?project arkc:hasProjectAnalysis ?pa .
      ?pa arkc:describesStage ?stage .
      ?pa (arkc:hasOutcome | arkc:hasMechanism | arkc:hasContext) ?range .
      ?range a ?field .


      BIND(?range AS ?subject)


      ?subject ?predicate ?object .
      OPTIONAL {
        ?range arkc:textualDescription ?textualDescription .
      }

      OPTIONAL {
        ?range arkc:hasRelatedConcept ?concept .
      }
    } UNION
    {
      ?project arkc:hasQuestionnaire/^arkc:isQuestionOf ?question .
      ?question arkc:describesStage ?stage .
      ?question arkc:hasAspect ?aspect .
      ?question arkc:hasDimension ?dimension .
      ?question arkc:hasAnswer ?range .
      ?range a ?field .
      BIND(?range AS ?subject)
      ?subject ?predicate ?object .
      ?range arkc:textualDescription ?textualDescription .


      OPTIONAL {
        ?range arkc:hasRelatedConcept ?concept .
      }


    } UNION
    {
      ?project arkc:hasQuestionnaire ?questionnaire .
      ?questionnaire arkc:hasQuestion ?question .
      ?question arkc:describesStage ?stage .
      ?question arkc:hasAspect ?aspect .
      ?question arkc:hasDimension ?dimension .
      ?question arkc:hasAnswer ?range .
      ?range a ?field .


      BIND(?range AS ?subject)


      ?subject ?predicate ?object .
      ?range arkc:textualDescription ?textualDescription .


      OPTIONAL {
        ?range arkc:hasRelatedConcept ?concept .
      }


    }
  }
  """

  sparql = SPARQLWrapper("https://stagingark.xxx.ie/fuseki/ark_data/query")
  sparql.setQuery(sparql_query % {'subject': project_id})
  sparql.setReturnFormat(JSON)

  results = sparql.query().convert()
  bindings = results["results"]["bindings"]

  columns = ['subject', 'predicate', 'object']
  if not bindings:
    # An empty binding list would give a column-less frame and a KeyError
    # below; hand back an empty frame with the expected schema instead.
    return pd.DataFrame(columns=columns)

  df = pd.DataFrame(bindings)

  # Each cell is a SPARQL JSON term dict; keep only its 'value' entry.
  for column in columns:
    df[column] = df[column].apply(lambda x: x['value'])

  return df

In [None]:
def extract_entities_embeddings(df, embedding_dim=3, num_epochs=100, learning_rate=0.1, random_seed=None):
  """Train a TransE model on the triples in ``df`` and return its entity embeddings.

  The same triples factory is used for training, validation and testing, so
  the evaluation PyKEEN runs at the end of the pipeline is on the training
  data; only the learned embeddings are used here.

  Args:
    df: DataFrame with ``subject``, ``predicate`` and ``object`` columns of
      labels.
    embedding_dim: Dimension of the TransE embeddings (default 3).
    num_epochs: Number of training epochs (default 100).
    learning_rate: Optimizer learning rate (default 0.1).
    random_seed: Seed forwarded to the PyKEEN pipeline. ``None`` (default)
      lets PyKEEN choose one, so repeated runs give different embeddings;
      pass an int for reproducible results.

  Returns:
    A NumPy array with the model's first entity representation evaluated for
    all entities (``indices=None``), detached and moved to the CPU.
    Presumably one row per entity in the factory's id order — confirm against
    the PyKEEN representation API.
  """
  triples_factory = TriplesFactory.from_labeled_triples(
    triples=df[['subject', 'predicate', 'object']].values,
  )

  result = pipeline(
    model='TransE',
    loss="softplus",
    training=triples_factory,
    testing=triples_factory,
    validation=triples_factory,
    model_kwargs=dict(embedding_dim=embedding_dim),
    optimizer_kwargs=dict(lr=learning_rate),
    training_kwargs=dict(num_epochs=num_epochs, use_tqdm_batch=False),
    random_seed=random_seed,
  )

  entity_embeddings = result.model.entity_representations[0](indices=None).detach().cpu().numpy()

  return entity_embeddings

In [None]:
# Query the endpoint once per project id and keep the resulting triple frames,
# keyed by project id.
project_dfs = {item: get_projects_text(item) for item in projects}

In [None]:
# Train one model per project and keep its entity-embedding matrix.
projects_embeddings = {}
for key, value in project_dfs.items():
  # dropna() returns a new frame, so the DataFrames stored in project_dfs are
  # not mutated and re-running this cell starts from the same inputs.
  projects_embeddings[key] = extract_entities_embeddings(value.dropna(axis=0, how='any'))

INFO:pykeen.pipeline.api:Using device: None


Training epochs on cuda:0:   0%|          | 0/100 [00:00<?, ?epoch/s]

Evaluating on cuda:0:   0%|          | 0.00/487 [00:00<?, ?triple/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 0.14s seconds
INFO:pykeen.pipeline.api:Using device: None


Training epochs on cuda:0:   0%|          | 0/100 [00:00<?, ?epoch/s]

Evaluating on cuda:0:   0%|          | 0.00/791 [00:00<?, ?triple/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 0.15s seconds
INFO:pykeen.pipeline.api:Using device: None


Training epochs on cuda:0:   0%|          | 0/100 [00:00<?, ?epoch/s]

Evaluating on cuda:0:   0%|          | 0.00/883 [00:00<?, ?triple/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 0.09s seconds
INFO:pykeen.pipeline.api:Using device: None


Training epochs on cuda:0:   0%|          | 0/100 [00:00<?, ?epoch/s]

Evaluating on cuda:0:   0%|          | 0.00/523 [00:00<?, ?triple/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 0.09s seconds
INFO:pykeen.pipeline.api:Using device: None


Training epochs on cuda:0:   0%|          | 0/100 [00:00<?, ?epoch/s]

Evaluating on cuda:0:   0%|          | 0.00/398 [00:00<?, ?triple/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 0.09s seconds
INFO:pykeen.pipeline.api:Using device: None


Training epochs on cuda:0:   0%|          | 0/100 [00:00<?, ?epoch/s]

Evaluating on cuda:0:   0%|          | 0.00/903 [00:00<?, ?triple/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 0.12s seconds
INFO:pykeen.pipeline.api:Using device: None


Training epochs on cuda:0:   0%|          | 0/100 [00:00<?, ?epoch/s]

Evaluating on cuda:0:   0%|          | 0.00/517 [00:00<?, ?triple/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 0.09s seconds
INFO:pykeen.pipeline.api:Using device: None


Training epochs on cuda:0:   0%|          | 0/100 [00:00<?, ?epoch/s]

Evaluating on cuda:0:   0%|          | 0.00/318 [00:00<?, ?triple/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 0.08s seconds


In [None]:
def embeddings_similarity_score(embeddings_A, embeddings_B):
  """Cosine similarity between two embedding arrays of possibly different size.

  Both arrays are flattened to a single row; the shorter row is right-padded
  with zeros to the length of the longer one before the cosine similarity is
  computed.

  Returns:
    The first row of the 1x1 similarity matrix, i.e. a length-1 array.
  """
  row_a = embeddings_A.flatten().reshape(1, -1)
  row_b = embeddings_B.flatten().reshape(1, -1)

  # Positive gap: A is longer, so B is padded; otherwise A is padded (by zero
  # columns when both already have the same length).
  width_gap = row_a.shape[1] - row_b.shape[1]
  if width_gap > 0:
    row_b = np.pad(row_b, ((0, 0), (0, width_gap)), mode='constant')
  else:
    row_a = np.pad(row_a, ((0, 0), (0, -width_gap)), mode='constant')

  return cosine_similarity(row_a, row_b)[0]

In [None]:
# Embedding matrices to compare, keyed by project id.
elements = projects_embeddings


def similarity_score_caculate_pairs(pair):
    """Return ``(key1, key2, score)`` for one ordered pair of project ids."""
    first, second = pair
    score = embeddings_similarity_score(elements[first], elements[second])[0]
    return (first, second, score)


# Every ordered pair of distinct projects; both (a, b) and (b, a) are kept so
# that each project appears as 'project1' against all the others.
pairs = [
    (left, right)
    for left in elements
    for right in elements
    if left != right
]

with concurrent.futures.ThreadPoolExecutor() as pool:
    results = list(pool.map(similarity_score_caculate_pairs, pairs))

df_results = pd.DataFrame(results, columns=['project1', 'project2', 'Similarity_Score'])

In [None]:
# Order rows by project1, and within each project1 by descending similarity, so
# that the first rows of every project are its best-ranked candidates.
# A two-key sort replaces the previous identity groupby().apply(), which relied
# on apply operating on the grouping column (deprecated in pandas 2.2+).
results = (
    df_results
    .sort_values(by=["project1", "Similarity_Score"], ascending=[True, False])
    .reset_index(drop=True)
)

In [None]:
# Persist the ranked similarity table (without the index) so the evaluation
# cells can reload it from disk.
results.to_csv("result_transE.csv", index=False)

In [None]:
# Ground truth: for every project, the list of its 'subject' values from the
# related-projects table.
y_true_dict = (
    projects_has_related_projects
    .groupby('project')['subject']
    .apply(list)
    .to_dict()
)

In [None]:
# Exclusive lower bound on Similarity_Score: only rows scoring strictly above
# this value are kept as predictions.
threshold = 0

In [None]:
# Reload the ranked similarity table written by the earlier to_csv cell.
results = pd.read_csv('./result_transE.csv')

In [None]:
# Predicted candidates per project: project2 values whose score exceeds the
# threshold, in the row order of `results` (the evaluation takes the first k).
results_dict = (
    results.loc[results['Similarity_Score'] > threshold]
    .groupby('project1')['project2']
    .apply(list)
    .to_dict()
)

In [None]:
def evluation(k, y_true_dict, similar_projects_dict):
  """Print top-k ranking metrics of predicted similar projects against ground truth.

  For every project in ``y_true_dict`` the first ``k`` predictions are compared
  with the relevant list, and MAP, NDCG, recall, precision and hit rate are
  averaged over the scored projects and printed as percentages.

  A project is counted as invalid (and not scored) when it has fewer than ``k``
  predictions, including when it is missing from ``similar_projects_dict``
  altogether, or when its relevant list is empty. For the former the number of
  available predictions is printed, as before.

  Args:
    k: Number of top predictions to evaluate per project.
    y_true_dict: Maps a project id to the list of its relevant project ids.
    similar_projects_dict: Maps a project id to its ranked list of predicted
      project ids, best first.

  Returns:
    None; the results are printed. Metrics print as ``nan`` when no project
    could be scored.
  """
  invalid_projects = []
  precisions, recalls, ndcgs, hits, map_scores = [], [], [], [], []

  for pid, rel_set in y_true_dict.items():
      # .get() avoids the KeyError the old lookup raised for projects that
      # have no predictions at all.
      candidates = similar_projects_dict.get(pid, [])
      if len(candidates) < k:
          print(len(candidates))
          invalid_projects.append(pid)
          continue

      pred_list = candidates[:k]
      if len(pred_list) == 0:
          # Only reachable when k == 0: nothing to score.
          continue
      if len(rel_set) == 0:
          # Recall and NDCG would divide by zero without relevant items.
          invalid_projects.append(pid)
          continue

      # DCG with binary relevance: a hit at rank i (0-based) is worth 1 / log2(i + 2).
      dcg = 0.0
      hit_num = 0.0
      for i, p in enumerate(pred_list):
          if p in rel_set:
              dcg += 1. / (log(i + 2) / log(2))
              hit_num += 1

      # Ideal DCG: all relevant items ranked first, capped at the list length.
      idcg = 0.0
      for i in range(min(len(rel_set), len(pred_list))):
          idcg += 1. / (log(i + 2) / log(2))

      ndcg = dcg / idcg
      recall = hit_num / len(rel_set)
      precision = hit_num / len(pred_list)
      hit = 1.0 if hit_num > 0.0 else 0.0

      # Average precision at k; a repeated prediction is only credited once.
      num_hits = 0.0
      score = 0.0
      for i, p in enumerate(pred_list):
          if p in rel_set and p not in pred_list[:i]:
              num_hits += 1.0
              score += num_hits / (i + 1.0)
      map_score = score / min(len(rel_set), k)

      ndcgs.append(ndcg)
      recalls.append(recall)
      precisions.append(precision)
      hits.append(hit)
      map_scores.append(map_score)

  def _mean_percent(values):
      # np.mean([]) warns and returns nan; return nan directly instead.
      return np.mean(values) * 100 if values else float('nan')

  avg_precision = _mean_percent(precisions)
  avg_recall = _mean_percent(recalls)
  avg_ndcg = _mean_percent(ndcgs)
  avg_hit = _mean_percent(hits)
  avg_map = _mean_percent(map_scores)

  print("invalid projects:", str(len(invalid_projects)))
  print('MAP={:.3f} | NDCG={:.3f} |  Recall={:.3f} | Precision={:.3f} | Hits={:.3f}'.format(
          avg_map, avg_ndcg, avg_recall, avg_precision, avg_hit))

In [None]:
# Score only the single best-ranked candidate of every project (k = 1).
evluation(k=1, y_true_dict=y_true_dict, similar_projects_dict=results_dict)

invalid projects: 0
MAP=50.000 | NDCG=50.000 |  Recall=37.500 | Precision=50.000 | Hits=50.000
