In [7]:
# Sanity check: list the top-level entries of the ../../data directory.
# This is an IPython shell escape, so it depends on an `ls` binary being available.
!ls ../../data

[34manalysis[m[m        [34mexploitation[m[m    [34mlanding[m[m         [34mtrusted[m[m
[34manalysis_output[m[m [34mformatted[m[m       [34mrelations[m[m


In [1]:
# --- Imports and Setup ---

import pandas as pd
import torch
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns

# PyKEEN imports
from pykeen.pipeline import pipeline
from pykeen.triples import TriplesFactory
import pykeen.version

# RDFLib for initial graph processing
import rdflib
from rdflib import Graph

# Report the library versions in use so the run can be reproduced later.
print(
    f"PyKEEN version: {pykeen.version.get_version()}",
    f"PyTorch version: {torch.__version__}",
    sep="\n",
)

# --- Configuration ---
# Input: Turtle file holding the knowledge graph (kept as a plain string).
KG_PATH = "../../data/exploitation/knowledge_graph.ttl"

# Output: directory for everything this notebook writes; created on demand.
OUTPUT_DIR = Path("../../data/analysis/embeddings")
OUTPUT_DIR.mkdir(exist_ok=True, parents=True)

# TSV export of the filtered triples that is handed to PyKEEN.
TRIPLES_PATH = OUTPUT_DIR.joinpath("kg_triples_for_pykeen.tsv")

  from .autonotebook import tqdm as notebook_tqdm


PyKEEN version: 1.11.1
PyTorch version: 2.7.1


In [2]:
# --- Load and Filter Knowledge Graph for KGE Models ---

print(f"Loading full Knowledge Graph from: {KG_PATH}")
g = Graph()
try:
    g.parse(KG_PATH, format="turtle")
except FileNotFoundError:
    print(f"ERROR: Knowledge Graph file not found at {KG_PATH}. Please run the exploitation zone script first.")
    raise
else:
    print(f"Graph loaded successfully with {len(g)} triples.")

# --- Data Preparation for PyKEEN ---
# KGE models like TransE and DistMult work with entity-relation-entity triples.
# We filter the graph to keep ONLY these structural triples; triples whose
# object is a literal are ignored for this training phase.

print("\nFiltering graph to exclude triples with literal objects...")
# Keep a triple only when both subject and object are URIs (entities).
# The predicate is always a URI. Non-URI subjects/objects (e.g. literals) are dropped.
entity_triples = [
    (str(s), str(p), str(o))
    for s, p, o in g
    if isinstance(s, rdflib.URIRef) and isinstance(o, rdflib.URIRef)
]

print(f"Filtered down to {len(entity_triples)} entity-relation-entity triples.")

# Save the filtered triples to a TSV file for PyKEEN.
# - UTF-8 is set explicitly so IRIs with non-ASCII characters are written the
#   same way on every platform instead of depending on the locale encoding.
# - No header row is written.
#   NOTE(review): the PyKEEN loader presumably treats every line as a triple,
#   so a header would become a bogus triple — confirm before adding one.
print(f"Saving filtered triples to: {TRIPLES_PATH}")
with open(TRIPLES_PATH, "w", encoding="utf-8") as f:
    f.writelines(f"{s}\t{p}\t{o}\n" for s, p, o in entity_triples)

print("Data preparation for PyKEEN complete.")

Loading full Knowledge Graph from: ../../data/exploitation/knowledge_graph.ttl
Graph loaded successfully with 180513 triples.

Filtering graph to exclude triples with literal objects...
Filtered down to 78126 entity-relation-entity triples.
Saving filtered triples to: ../../data/analysis/embeddings/kg_triples_for_pykeen.tsv
Data preparation for PyKEEN complete.


In [3]:
# --- Create and Split PyKEEN Dataset ---

# Build the triples factory from the exported TSV. Inverse triples are
# requested because they can help some models.
tf = TriplesFactory.from_path(
    TRIPLES_PATH,
    create_inverse_triples=True,
)

# Partition into training / validation / testing with a fixed random state.
training_set, validation_set, testing_set = tf.split(
    [0.8, 0.1, 0.1],
    random_state=42,
)

# One print call, one line per summary row.
print(
    "Dataset split successfully:",
    f"  Training triples:   {training_set.num_triples}",
    f"  Validation triples: {validation_set.num_triples}",
    f"  Testing triples:    {testing_set.num_triples}",
    f"  Total entities:     {tf.num_entities}",
    f"  Total relations:    {tf.num_relations}",
    sep="\n",
)

Dataset split successfully:
  Training triples:   62500
  Validation triples: 7813
  Testing triples:    7813
  Total entities:     29332
  Total relations:    16


In [4]:
# --- Helper Function for Training and Evaluation ---

def run_kge_pipeline(
    training,
    validation,
    testing,
    model_name,
    embedding_dim=100,
    epochs=100,
    batch_size=256,
    device='cpu',
    random_seed=42,
):
    """
    Run the PyKEEN pipeline for one model and return its result object.

    Args:
        training: Training triples, forwarded as ``pipeline(training=...)``.
        validation: Validation triples, forwarded as ``pipeline(validation=...)``.
        testing: Testing triples, forwarded as ``pipeline(testing=...)``.
        model_name: Model identifier forwarded as ``pipeline(model=...)``,
            e.g. ``'TransE'``.
        embedding_dim: Embedding dimension passed via ``model_kwargs``.
        epochs: Maximum number of training epochs (``num_epochs``).
        batch_size: Batch size used for both training and evaluation.
            Defaults to 256, the previously hard-coded value.
        device: Device string forwarded to the pipeline. Defaults to ``'cpu'``,
            the previously hard-coded value.
        random_seed: Seed forwarded to the pipeline. Defaults to 42, the
            previously hard-coded value.

    Returns:
        Whatever ``pipeline(...)`` returns for this configuration.
    """
    print(f"\n{'='*20} Starting Pipeline for {model_name} {'='*20}")

    result = pipeline(
        training=training,
        validation=validation,
        testing=testing,
        model=model_name,
        model_kwargs=dict(embedding_dim=embedding_dim),
        training_kwargs=dict(
            num_epochs=epochs,
            batch_size=batch_size,
            use_tqdm_batch=False,  # per-batch progress bars are turned off
        ),
        negative_sampler='basic',
        evaluation_kwargs=dict(batch_size=batch_size),
        # Early stopping on the validation triples, configured below.
        stopper='early',
        stopper_kwargs=dict(frequency=5, patience=3, metric='hits@10'),
        random_seed=random_seed,
        device=device,
    )

    print(f"--- Pipeline for {model_name} complete. ---")
    return result

In [None]:
# --- Train and Evaluate Selected Models ---

results = {}

# Arguments common to every run: the same data splits, a small embedding
# size to start with, and the same epoch budget.
shared_kwargs = dict(
    training=training_set,
    validation=validation_set,
    testing=testing_set,
    embedding_dim=50,
    epochs=100,
)

# --- Train TransE ---
transe_result = run_kge_pipeline(model_name='TransE', **shared_kwargs)
results['TransE'] = transe_result
transe_result.save_to_directory(OUTPUT_DIR / 'transe_model')

# --- Train DistMult ---
distmult_result = run_kge_pipeline(model_name='DistMult', **shared_kwargs)
results['DistMult'] = distmult_result
distmult_result.save_to_directory(OUTPUT_DIR / 'distmult_model')




INFO:pykeen.triples.triples_factory:Creating inverse triples.
Training epochs on cpu:   0%|          | 0/100 [00:00<?, ?epoch/s]INFO:pykeen.triples.triples_factory:Creating inverse triples.


In [None]:
# --- Compare Model Performance ---

# Row filters that reduce each model's metric table to one value per metric.
# Each entry is (candidate column names, value to keep).
# NOTE(review): the columns returned by `metric_results.to_df()` appear to
# differ between PyKEEN releases (the rank-type column name may vary, and a
# 'Dataset' column may not exist) — confirm against the installed version.
# A filter whose column is absent is skipped instead of raising KeyError.
row_filters = [
    (('Side',), 'both'),
    (('Type', 'Rank_type'), 'realistic'),
    (('Dataset',), 'testing'),
]

all_metrics = []
for model_name, result in results.items():
    metrics = result.metric_results.to_df()

    # Build ONE boolean mask aligned with `metrics`. Chaining df[m1][m2][m3]
    # would apply full-length masks to already-filtered frames, which makes
    # pandas reindex the mask and emit a warning.
    keep = pd.Series(True, index=metrics.index)
    for candidates, wanted in row_filters:
        column = next((name for name in candidates if name in metrics.columns), None)
        if column is not None:
            keep &= metrics[column] == wanted

    all_metrics.append(metrics.loc[keep].assign(Model=model_name))

if all_metrics:
    results_df = pd.concat(all_metrics, ignore_index=True)
    display_cols = ['Model', 'Metric', 'Value']
    results_df_display = results_df[display_cols]
    # One row per model, one column per metric.
    pivot_df = results_df_display.pivot(index='Model', columns='Metric', values='Value')

    # Metrics to report, in display order. Names missing from the table are
    # dropped below, so listing both spellings of MRR is harmless.
    # NOTE(review): PyKEEN seems to report MRR under 'inverse_harmonic_mean_rank' — confirm.
    final_metrics = [
        'mean_reciprocal_rank',
        'inverse_harmonic_mean_rank',
        'hits_at_1',
        'hits_at_3',
        'hits_at_5',
        'hits_at_10',
    ]
    final_metrics_existing = [m for m in final_metrics if m in pivot_df.columns]

    print("\n\n" + "="*20 + " Final Model Comparison " + "="*20)
    print("Metrics evaluated on the testing set:")
    print(pivot_df[final_metrics_existing].to_string(float_format="%.4f"))

    if final_metrics_existing:
        # --- Plotting the results ---
        # Reshape to long form: one (Model, Metric, Score) row per bar.
        plot_df = pivot_df[final_metrics_existing].reset_index()
        melted_df = plot_df.melt(id_vars='Model', var_name='Metric', value_name='Score')

        fig, ax = plt.subplots(figsize=(14, 8))
        sns.barplot(x='Metric', y='Score', hue='Model', data=melted_df, ax=ax)
        ax.set_title('KGE Model Performance Comparison (Link Prediction)', fontsize=16)
        ax.set_ylabel('Score')
        ax.set_xlabel('Evaluation Metric')
        ax.tick_params(axis='x', rotation=15)
        ax.grid(axis='y', linestyle='--', alpha=0.7)
        fig.tight_layout()
        plot_path = OUTPUT_DIR / 'model_comparison.png'
        fig.savefig(plot_path)
        print(f"\nComparison plot saved to: {plot_path}")
        plt.show()
    else:
        # None of the requested metric names were found, so there is nothing to plot.
        print("\nNone of the requested metrics were found; skipping the comparison plot.")

else:
    print("No model results to display.")