In [None]:
%pip install torch --extra-index-url https://download.pytorch.org/whl/cu128

In [None]:
import torch
import pykeen
import pandas as pd
from pykeen import predict
from tqdm import tqdm
from pykeen.datasets import Nations
from pykeen.pipeline import pipeline
from pykeen.triples import TriplesFactory

In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [3]:
file_path = "data/all-triples.tsv"
tf = TriplesFactory.from_path(file_path)
training, testing = tf.split()

result = pipeline(
    training=training,
    testing=testing,
    model="TransE",
    model_kwargs=dict(
        embedding_dim=128,
    ),
    training_kwargs=dict(num_epochs=20),
    optimizer_kwargs=dict(
        lr=0.01,
    ),
    negative_sampler_kwargs=dict(
        num_negs_per_pos=1,
    ),
    random_seed=2025,
    device=device,
)

using automatically assigned random_state=595865306
Training epochs on cuda:0: 100%|██████████| 20/20 [09:59<00:00, 29.97s/epoch, loss=0.827, prev_loss=0.848]
Evaluating on cuda:0: 100%|██████████| 125k/125k [49:44<00:00, 41.9triple/s]    
INFO:pykeen.evaluation.evaluator:Evaluation took 3089.84s seconds


In [None]:
evaluation = result.metric_results.to_dict()
evaluation["both"]["realistic"]

{'arithmetic_mean_rank': 46402.50390625,
 'adjusted_arithmetic_mean_rank_index': 0.5973290901397814,
 'harmonic_mean_rank': 56.25301742553711,
 'inverse_arithmetic_mean_rank': 2.1550560632022098e-05,
 'z_arithmetic_mean_rank': 516.8924445406119,
 'median_absolute_deviation': 23518.51953125,
 'standard_deviation': 62948.8046875,
 'inverse_geometric_mean_rank': 0.00010474642476765439,
 'adjusted_inverse_harmonic_mean_rank': 0.01772143769065738,
 'adjusted_geometric_mean_rank_index': 0.8871474310432702,
 'adjusted_arithmetic_mean_rank': 0.4026760934198168,
 'count': 250368.0,
 'geometric_mean_rank': 9546.865234375,
 'variance': 3962552064.0,
 'median_rank': 16897.0,
 'inverse_median_rank': 5.9182104450883344e-05,
 'z_inverse_harmonic_mean_rank': 3309.2226051967723,
 'inverse_harmonic_mean_rank': 0.01777682453393936,
 'z_geometric_mean_rank': 443.01781981447516,
 'hits_at_1': 0.010200984151329244,
 'hits_at_3': 0.016539653629856852,
 'hits_at_5': 0.0218478399795501,
 'hits_at_10': 0.032456

In [5]:
model_name = "models/TransE-citations"
result.save_to_directory(model_name)

INFO:pykeen.triples.triples_factory:Stored TriplesFactory(num_entities=233568, num_relations=31, create_inverse_triples=False, num_triples=500732, path="C:\Users\mirxm\Storage\Work\MDS\S2\SDM\MDS-SDM-KnowledgeGraphs\data\all-triples.tsv") to file:///C:/Users/mirxm/Storage/Work/MDS/S2/SDM/MDS-SDM-KnowledgeGraphs/models/TransE-citations/training_triples
INFO:pykeen.pipeline.api:Saved to directory: C:\Users\mirxm\Storage\Work\MDS\S2\SDM\MDS-SDM-KnowledgeGraphs\models\TransE-citations


In [47]:
p = "http://localhost:7200/academia-sdm#"
paper = f"<{p}d5db5bea38363c0d5cdee5800d4ad8f8ace7e223>"
paperCites = f"<{p}paperCites>"
entity_embeddings = result.model.entity_representations[0]
relation_embeddings = result.model.relation_representations[0]
sub_id = result.training.entity_to_id[paper]
sub_rep = entity_embeddings(indices=torch.as_tensor([sub_id], device=device))
rel_id = result.training.relation_to_id[paperCites]
rel_rep = relation_embeddings(indices=torch.as_tensor([rel_id], device=device))
pred_paper = sub_rep + rel_rep
pred_paper

tensor([[-0.7020, -0.8736, -0.5925,  0.7052,  0.8225,  0.3836,  0.8972,  0.5763,
         -0.5136, -0.8015, -0.6995,  0.6513,  0.6877, -0.5894, -0.4934,  0.5679,
         -0.4491, -0.7458,  0.4608,  0.4312,  0.8349,  0.5733,  0.6251, -0.4017,
          0.4085,  0.9795,  0.8753, -0.9154, -0.7960,  0.7691, -0.6825,  0.9012,
          0.5441,  0.3967, -0.7976, -0.3305, -0.5742,  0.6769,  0.8904, -0.6991,
         -0.6508,  0.7800, -0.6128,  0.8578,  0.7282, -0.6481,  0.6588,  0.3019,
          0.5706,  0.5211, -0.5264,  0.7440,  0.2343,  0.5904, -0.6826, -0.3179,
          0.8127, -0.2189, -0.5577, -0.8598, -0.8841,  0.6710,  0.7654, -0.7284,
          0.4767,  0.5447, -0.6710, -0.6777,  0.6289, -0.5766,  0.5180,  0.1861,
         -0.4970,  0.7441,  0.8478,  0.7972,  0.4674,  0.6009, -0.7228, -0.7455,
          0.6057,  0.7872, -0.5417, -0.8778, -0.4841,  0.6332, -0.5395,  0.5828,
          0.4850,  0.6832, -0.5006,  0.5851, -0.5848,  0.4399, -0.5171,  0.5813,
         -0.8746, -0.6425,  

In [48]:
writesPaper = f"<{p}writesPaper>"
writes_id = result.training.relation_to_id[writesPaper]
writes_rep = relation_embeddings(indices=torch.as_tensor([writes_id], device=device))
pred_auth = pred_paper + writes_rep
pred_auth

tensor([[ 0.2924, -1.3298, -1.6635,  1.7594,  1.7414, -0.1195,  1.6844,  1.2653,
         -0.9800, -1.3469, -1.5663, -0.1245,  0.2189, -1.5981, -0.0238,  1.3253,
          0.3497, -1.4278, -0.5327, -0.1061,  1.7843,  0.0371,  0.0336,  0.1283,
          1.1998,  1.8891,  0.0280, -0.2929, -1.8597,  0.2589, -1.2185,  1.5007,
          1.1800, -0.4292, -1.6534,  0.1060, -0.9417,  1.6023,  1.4339, -1.6423,
         -0.1597,  1.2568, -0.9730,  1.4017,  1.4420, -0.0043,  1.4725, -0.3619,
          0.1293,  0.1423,  0.0130,  0.3503, -0.4665, -0.2198, -0.3074, -1.1988,
          0.2273,  0.4109, -0.4733, -1.2696, -1.4892, -0.1301,  1.6104, -1.6864,
          1.5754,  0.0726,  0.0929,  0.2118,  1.4135,  0.0440,  0.8931, -0.6286,
          0.3418,  0.3278,  1.4843,  1.7535, -0.3342,  1.3228,  0.2171, -1.3634,
          1.5368,  0.0378, -0.0444, -1.4668,  0.5159, -0.0210,  0.4502,  0.1435,
         -0.4131,  0.0848, -0.9556, -0.2609, -1.2237, -0.0265, -1.4569, -0.1677,
         -0.2719, -0.0621,  

In [63]:
with open("data/all-authors.tsv", "r") as f:
    authors = set(f.read().splitlines())

In [64]:
best_auth = None
best_auth_dist = -1.0
for auth, auth_id in tqdm(result.training.entity_to_id.items()):
    if auth not in authors:
        continue
    auth_rep = entity_embeddings(indices=torch.as_tensor([auth_id], device=device))
    dist = torch.cdist(pred_auth, auth_rep)
    if dist < best_auth_dist or best_auth is None:
        best_auth = auth
        best_auth_dist = dist

print(f"Best author: {best_auth} with distance {best_auth_dist.item()}")

100%|██████████| 233568/233568 [00:03<00:00, 61895.77it/s] 

Best author: <http://localhost:7200/academia-sdm#47030051> with distance 10.815679550170898



