# Pykeen test used for KGE embedding

In [1]:
import os
import pandas as pd
import numpy as np
from datetime import datetime

import torch
from pykeen.triples import TriplesFactory
from pykeen.pipeline import pipeline
from pathlib import Path

from utils import load_IDMapping, load_train_data, print_pykeen_metrics, save_embeddings

source_dir = "../data/dataset"
train_res = "./tmp"

current_time = datetime.now().strftime('%Y%m%d-%H%M%S')
print(f"Current time: {current_time}")

columns = ['head', 'relation', 'tail']
entity2id, relation2id = load_IDMapping(source_dir)
support_triplet = pd.read_csv(source_dir + "/support_triplet.tsv", sep='\t', names=columns)
print(f" num_support_triplets: {len(support_triplet)}")

train_data, test_data, train_data_id, test_data_id = load_train_data(source_dir, entity2id, relation2id, 0)
train_data_pos = train_data[train_data['label'] == 1]
train_data_pos = train_data_pos[columns]

emb_graph = pd.concat([train_data_pos, support_triplet], ignore_index=True)

# Create a TriplesFactory from the DataFrame
emb_triplet = TriplesFactory.from_labeled_triples(emb_graph[columns].values, entity_to_id=entity2id, relation_to_id=relation2id, create_inverse_triples=False)
emb_trainning, emb_testing, emb_validation = emb_triplet.split([0.8, 0.1, 0.1])
print(f" num_emb_triplets: {emb_triplet.num_triples}")
print(f" emb_train_triplets: {emb_trainning.num_triples}, emb_test_triplets: {emb_testing.num_triples}, emb_valid_triplets: {emb_validation.num_triples}")

test_data_tf = TriplesFactory.from_labeled_triples(test_data[columns].values, entity_to_id=entity2id, relation_to_id=relation2id, create_inverse_triples=False)

# Create a model

  from .autonotebook import tqdm as notebook_tqdm


Current time: 20250110-164143
 num_entity: 945552
 num_relation: 126
 num_support_triplets: 20930149
Load data from ../data/dataset and 1 fold:
 num_train_triples: 1879648
 num_test_triples: 469912


using automatically assigned random_state=1455997829


 num_emb_triplets: 22809797
 emb_train_triplets: 18247837, emb_test_triplets: 2280980, emb_valid_triplets: 2280980


In [2]:
model = 'CompGCN'
proj_name = f"{model}_{current_time}"

pipeline_result = pipeline(
    random_seed=42,
    model=model,
    model_kwargs=dict(
        embedding_dim=100,
    ),

    training=emb_trainning,
    testing=emb_testing,
    validation=emb_validation,

    training_loop='sLCWA',
    training_kwargs=dict(
        num_epochs=10,
        batch_size=10000,
        use_tqdm_batch=False,
        checkpoint_name=f'{proj_name}_checkpoint.pt',
        checkpoint_directory='./tmp',
        checkpoint_frequency=3,
        checkpoint_on_failure=True,
        # sampler="schlichtkrull",
    ),
    
    # optimizer='Adam',
    # optimizer_kwargs=dict(
    #     lr=0.01,
    # ),

    negative_sampler='bernoulli',
    negative_sampler_kwargs=dict(
        num_negs_per_pos=1,
    ),

    evaluator='RankBasedEvaluator',
    evaluator_kwargs=dict(
        filtered=True,
    ),
    
    stopper='early',
    stopper_kwargs=dict(
        frequency=3,
        patience=2,
        relative_delta=0.005,
        best_model_path=Path(f'./tmp/{proj_name}_best-model-weights.pt'),
    ),

    device=torch.device("cuda:0"),
    # result_tracker='tensorboard',
    # result_tracker_kwargs=dict(
    #     experiment_path=f'tb_logs/{proj_name}',
    # ),
)
pipeline_result.save_to_directory(f'tmp/{proj_name}')
my_pykeen_model = pipeline_result.model
# print_pykeen_metrics(pipeline_result, test_data_tf, test_data, emb_trainning, emb_testing, emb_validation)

save_embeddings(my_pykeen_model, f'tmp/{proj_name}')

INFO:pykeen.training.training_loop:=> no checkpoint found at 'tmp/CompGCN_20250109-170658_checkpoint.pt'. Creating a new file.
INFO:pykeen.triples.triples_factory:Creating inverse triples.
Training epochs on cuda:0:   0%|          | 0/10 [00:00<?, ?epoch/s]INFO:pykeen.triples.triples_factory:Creating inverse triples.
INFO:pykeen.training.training_loop:Dropping last (incomplete) batch each epoch (1/364 (0.27%) batches).
Training epochs on cuda:0:  20%|██        | 2/10 [04:49<13:11, 98.90s/epoch, loss=0.113, prev_loss=0.201] INFO:pykeen.evaluation.evaluator:Evaluation took 151.05s seconds
INFO:pykeen.stoppers.early_stopping:New best result at epoch 3: 0.0951712459475229. Saved model weights to tmp/CompGCN_20250109-170658_best-model-weights.pt
INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 3.
INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 3.
Training epochs on cuda:0:  50%|█████     | 5/10 [11:43<09:41, 116.35s/epoch,

In [4]:
import torch
import numpy as np
from pykeen.constants import PYKEEN_CHECKPOINTS
from pykeen.pipeline import pipeline
from pykeen.triples import TriplesFactory

# proj_name = "TransR_20250109-160042"
checkpoint = torch.load(PYKEEN_CHECKPOINTS.joinpath(f"/home/worker/users/ZC/KnowledgeGraph/TarKG_reason/pykeen_model/tmp/{proj_name}_checkpoint.pt"))
# checkpoint_ent_emb = checkpoint['model_state_dict']['entity_representations.0._embeddings.weight'].detach().cpu().numpy()
# checkpoint_rel_emb = checkpoint['model_state_dict']['relation_representations.0._embeddings.weight'].detach().cpu().numpy()
# bestModel = torch.load(PYKEEN_CHECKPOINTS.joinpath(f"/home/worker/users/ZC/KnowledgeGraph/TarKG_reason/pykeen_model/tmp/{proj_name}_best-model-weights.pt"))
ent_emb = np.load(f"/home/worker/users/ZC/KnowledgeGraph/TarKG_reason/pykeen_model/tmp/{proj_name}/entity_embedding.npy")
rel_emb = np.load(f"/home/worker/users/ZC/KnowledgeGraph/TarKG_reason/pykeen_model/tmp/{proj_name}/relation_embedding.npy")
# print(checkpoint_ent_emb.shape, ent_emb.shape, checkpoint_rel_emb.shape, rel_emb.shape)
# print(np.sum(np.abs(ent_emb - checkpoint_ent_emb)))
checkpoint

{'epoch': 10,
 'loss': [0.885667965061717,
  0.201058605910985,
  0.11286946987876526,
  0.0773335728425901,
  0.05786939448380208,
  0.04626618692098738,
  0.03812688340402239,
  0.0325302512251905,
  0.02858645333345611,
  0.025266089921837653],
 'model_state_dict': OrderedDict([('entity_representations.0.combined.edge_index',
               tensor([[     0,      0,      0,  ..., 110407, 168001,  29520],
                       [ 11199,  14657,  14866,  ...,  43663,  67917,  38630]],
                      device='cuda:0')),
              ('entity_representations.0.combined.edge_type',
               tensor([  0,   0,   0,  ...,  47,  20, 103], device='cuda:0')),
              ('entity_representations.0.combined.entity_representations._embeddings.weight',
               tensor([[-1.9004, -1.0643,  0.2488,  ..., -0.6187,  0.2677,  0.4297],
                       [-1.2002,  0.1716, -0.4716,  ..., -0.5059, -0.1127,  0.0181],
                       [-2.1634, -0.6996, -0.4629,  ..., -1.9147

In [5]:
import numpy as np
for model in ['TransE_l2', 'TransR', 'RESCAL', 'DistMult', 'ComplEx', 'RotatE']:
    print(model)
    ent_emb = np.load(f"/home/worker/users/ZC/KnowledgeGraph/TarKG/KG_Reason/embedding/{model}_TarKG_0/TarKG_{model}_entity.npy")
    rel_emb = np.load(f"/home/worker/users/ZC/KnowledgeGraph/TarKG/KG_Reason/embedding/{model}_TarKG_0/TarKG_{model}_relation.npy")
    print(ent_emb.shape, rel_emb.shape)

emb_dir = "/home/worker/users/ZC/KnowledgeGraph/TarKG_reason/pykeen_model/tmp/RESCAL_20250109-165502"
pykeen_ent_emb = np.load(f'{emb_dir}/entity_embedding.npy')
pykeen_rel_emb = np.load(f'{emb_dir}/relation_embedding.npy')
print("Pykeen RESCAL")
print(pykeen_ent_emb.shape, pykeen_rel_emb.shape)

TransE_l2
(1143313, 600) (171, 600)
TransR
(1143313, 200) (171, 200)
RESCAL
(1143313, 200) (171, 40000)
DistMult
(1143313, 600) (171, 600)
ComplEx
(1143313, 600) (171, 600)
RotatE
(1143313, 1200) (171, 600)
Pykeen RESCAL
(197437, 100) (126, 100, 100)


In [None]:
from pykeen.models import TransE
from pykeen.optimizers import Adam
from pykeen.evaluation import RankBasedEvaluator
from pykeen.stoppers import EarlyStopper
from pykeen.training import SLCWATrainingLoop
from pykeen.pipeline import pipeline
# model
model = TransE(
    triples_factory=emb_trainning,
    random_seed=42,
)

# optimizer
optimizer = Adam(params=model.get_grad_params())
# Pick an evaluator
evaluator = RankBasedEvaluator(
    filtered=True,  # Note: this is True by default; we're just being explicit
)
# stopper
stopper = EarlyStopper(
    model=model,
    evaluator=evaluator,
    training_triples_factory=emb_trainning,
    evaluation_triples_factory=emb_validation,
    patience=2,
    relative_delta=0.1,
    metric='mean_rank',
)
# training approach (sLCWA or LCWA)
training_loop = SLCWATrainingLoop(
    model=model,
    triples_factory=emb_trainning,
    optimizer=optimizer,
)

# Train 
_ = training_loop.train(
    triples_factory=emb_trainning,
    num_epochs=100,
    batch_size=256,
    stopper=stopper,
)

# Get triples to test
mapped_triples = emb_testing.mapped_triples
# Evaluate
results = evaluator.evaluate(
    model=model,
    mapped_triples=mapped_triples,
    batch_size=1024,
    additional_filter_triples=[
        emb_trainning.mapped_triples,
        emb_validation.mapped_triples,
    ],
)

# MLP test for merged embedding

In [1]:
from utils import load_IDMapping, load_train_data, load_entity_feature, load_gene_disease_ids
import os
import numpy as np
import pandas as pd
model_res = "./train_results/CompGCN_20250109-215102"
data_dir = "../data/dataset"

entity2id, relation2id = load_IDMapping(data_dir)
train_data, test_data, train_data_id, test_data_id = load_train_data(data_dir, entity2id, relation2id, 0)

print("Start training KGE model embedding...")
kge_ent_emb = np.load(f'{model_res}/entity_embedding.npy')
kge_rel_emb = np.load(f'{model_res}/relation_embedding.npy')
print(kge_ent_emb.shape, kge_rel_emb.shape)
print("Loading gene and disease feature embeddings...")
gene_feat_emb, disease_feat_emb = load_entity_feature(data_dir)
gene_ids, disease_ids = load_gene_disease_ids(data_dir, entity2id)
print(len(gene_ids), len(disease_ids))
print(len(gene_feat_emb), len(disease_feat_emb))

  from tqdm.autonotebook import tqdm


 num_entity: 945552
 num_relation: 126
Load data from ../data/dataset and 1 fold:
 num_train_triples: 1879648
 num_test_triples: 469912
Start training KGE model embedding...
(945552, 20) (252, 20)
Loading gene and disease feature embeddings...
151231 26996
151231 26996


In [2]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA

def get_full_embeddings(entity_ids, emb_dict, dim):
    """
    将嵌入字典补齐到完整实体集合，缺失的实体补零向量。
    """
    full_emb = {}
    zero_vector = np.zeros(dim)
    for entity_id in entity_ids:
        full_emb[entity_id] = emb_dict.get(entity_id, zero_vector)
    return full_emb

## 2.convert to matrix
def dict_to_matrix(emb_dict, entity_ids):
    """
    将嵌入字典转换为矩阵，行对应实体，列为嵌入向量。
    """
    return np.array([emb_dict[eid] for eid in entity_ids])

## 3.principal component dimension reduction
def reduce_dim(embedding, entity_ids, target_dim):
    """
    使用PCA对嵌入矩阵降维。
    """
    emb_dim = next(iter(embedding.values())).shape[0]
    full_emb = get_full_embeddings(entity_ids, embedding, emb_dim)
    emb_matrix = dict_to_matrix(full_emb, entity_ids)

    mms = MinMaxScaler(feature_range=(0,1))
    emb_matrix_scaled = mms.fit_transform(emb_matrix)
    if emb_dim > target_dim:
        pca = PCA(n_components=target_dim, random_state=42)
        reduced_matrix_scaled = pca.fit_transform(emb_matrix_scaled)
        reduced_matrix_scaled1 = mms.fit_transform(reduced_matrix_scaled)
        return reduced_matrix_scaled1
    else:
        return emb_matrix_scaled
   
def get_merged_embeddings(kge_ent_emb, kge_rel_emb, entity2id, relation2id,
                           gene_feat_emb, disease_feat_emb, gene_ids, disease_ids, target_dim=300):
    mms = MinMaxScaler(feature_range=(0,1))
    
    entity_ids = disease_ids + gene_ids # the entity ids order is [disease,gene]
    kge_ent_emb = {id: kge_ent_emb[id] for name, id in entity2id.items()}
    kge_ent_reduced = reduce_dim(kge_ent_emb, entity_ids, target_dim)

    disease_feat_reduced = reduce_dim(disease_feat_emb, disease_ids, target_dim)
    gene_feat_reduced = reduce_dim(gene_feat_emb, gene_ids, target_dim)
    feat_reduced = np.concatenate([disease_feat_reduced, gene_feat_reduced])

    combined_ent_emb = np.concatenate([kge_ent_reduced, feat_reduced], axis=1)
    combined_ent_emb_scaled = mms.fit_transform(combined_ent_emb)
    combined_ent_emb_scaled_dict = {eid: combined_ent_emb_scaled[idx] for idx, eid in enumerate(entity_ids)}

    relation_ids = sorted(list(relation2id.values()))
    kge_rel_emb = {id: kge_rel_emb[id] for name, id in relation2id.items()}
    kge_rel_reduced = reduce_dim(kge_rel_emb, relation_ids, target_dim)
    rel_emb_scaled_dict = {eid: kge_rel_reduced[idx] for idx, eid in enumerate(relation_ids)}

    return combined_ent_emb_scaled_dict, rel_emb_scaled_dict, disease_feat_reduced, gene_feat_reduced, feat_reduced


In [3]:
ent_emb1, rel_emb1, dis1, gene1, feat1 = get_merged_embeddings(kge_ent_emb, kge_rel_emb, entity2id, relation2id,
                           gene_feat_emb, disease_feat_emb, gene_ids, disease_ids, target_dim=200)
ent_emb2, rel_emb2, dis2, gene2, feat2 = get_merged_embeddings(kge_ent_emb, kge_rel_emb, entity2id, relation2id,
                           gene_feat_emb, disease_feat_emb, gene_ids, disease_ids, target_dim=200)

are_equal_01 = np.array_equal(ent_emb1[0], ent_emb2[0])
print(are_equal_01)
print(np.array_equal(gene1, gene2))
print(np.array_equal(dis1, dis2))

True
True
True
