# PyKEEN

In [6]:
# Use the %pip magic so packages are installed into the environment of the running kernel.
%pip install pykeen rdflib torch scikit-learn




In [7]:
import zipfile

import numpy as np
import pandas as pd
import torch
from pykeen.pipeline import pipeline
from pykeen.triples import TriplesFactory
from rdflib import Graph

In [8]:
# Names of the zipped ESCO download and (presumably) of the RDF file inside it.
# Neither is read in this cell: the graph is parsed from the N-Triples file below.
kgPath = "ESCOdataset.rdf.zip"
rdf_inner_name = "esco-v1.2.0.rdf"

g = Graph()
g.parse("esco.nt", format="nt")

# SPARQL: every esco:Skill together with its English preferred label.
skill_query = """
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX esco: <http://data.europa.eu/esco/model#>

SELECT ?skill ?label
WHERE {
  ?skill a esco:Skill ;
         skos:prefLabel ?label .
  FILTER (lang(?label) = "en")
}
"""

# Materialise the result rows once so they can be counted and iterated.
skills = list(g.query(skill_query))
print(f"Number of triples: {len(g)}")
print("Number of skills:", len(skills))

# Parallel lists: skill_labels[i] is the English label of skill_uris[i].
skill_uris = [str(skill_row.skill) for skill_row in skills]
skill_labels = [str(skill_row.label) for skill_row in skills]

Number of triples: 8756008
Number of skills: 14257


We only extract the triples directly related to the skills (i.e. their direct neighbours), because training the model on the full graph would require about 377 GiB of memory to be allocated :).

In [9]:
# Collect every triple that has a skill as its subject or its object, i.e. the
# direct neighbourhood of the skills, as plain (head, relation, tail) strings.
skill_uri_set = set(skill_uris)  # set for fast membership tests inside the loop

# NOTE(review): triples whose object is a literal (e.g. the skos:prefLabel
# values queried earlier) are kept as well, so literal text ends up as tail
# "entities" downstream -- confirm this is intended before training on it.
triples = []
for subj, pred, obj in g:
    subj_str = str(subj)
    obj_str = str(obj)

    # Keep triples where at least one side is a skill; the predicate is only
    # stringified for triples that are actually kept.
    if subj_str in skill_uri_set or obj_str in skill_uri_set:
        triples.append((subj_str, str(pred), obj_str))

print("Triples involving at least one skill:", len(triples))

df_triples = pd.DataFrame(triples, columns=["head", "relation", "tail"])

Triples involving at least one skill: 2440814


In [10]:
# PyKEEN expects the labeled triples as a (n_triples, 3) numpy array of strings.
labeled_triples = df_triples[["head", "relation", "tail"]].to_numpy().astype(str)

# The factory assigns integer ids to every distinct entity and relation label.
tf = TriplesFactory.from_labeled_triples(labeled_triples)
print("#entities:", tf.num_entities)
print("#relations:", tf.num_relations)

#entities: 1738720
#relations: 30


In [None]:
# Seed the split too: the pipeline's random_seed only applies once training
# starts, so an unseeded split would give different train/test/validation
# sets (and therefore different metrics) on every run.
train_tf, test_tf, valid_tf = tf.split([0.8, 0.1, 0.1], random_state=42)

result = pipeline(
    training=train_tf,
    testing=test_tf,
    validation=valid_tf,
    model="ComplEx",             # or "TransE", "DistMult", ...
    model_kwargs=dict(embedding_dim=100),
    training_kwargs=dict(
        num_epochs=50,           # start small to test
        batch_size=1024,
    ),
    stopper="early",             # optional early stopping
    random_seed=42,
    use_tqdm=True,
)

# Trained model, kept under a short name for later cells.
model = result.model

INFO:pykeen.triples.splitting:done splitting triples to groups of sizes [193588, 241537, 241538]
INFO:pykeen.pipeline.api:Using device: None
INFO:pykeen.nn.representation:Inferred unique=False for Embedding(
  (regularizer): LpRegularizer()
)
INFO:pykeen.nn.representation:Inferred unique=False for Embedding(
  (regularizer): LpRegularizer()
)
INFO:pykeen.stoppers.early_stopping:Inferred checkpoint path for best model weights: C:\Users\Kai\.data\pykeen\checkpoints\best-model-weights-c5431354-6a82-4c4f-90cc-8d99e9ab5cab.pt


Training epochs on cpu:   0%|          | 0/50 [00:00<?, ?epoch/s]

Training batches on cpu:   0%|          | 0.00/1.89k [00:00<?, ?batch/s]

In [None]:
# Label -> integer id mapping built by the TriplesFactory.
entity_to_id = tf.entity_to_id

# Keep only the skills whose URI is an entity known to the TriplesFactory,
# preserving the original skill order so the three lists stay aligned.
skills_in_tf = [
    (entity_to_id[uri], label, uri)
    for uri, label in zip(skill_uris, skill_labels)
    if uri in entity_to_id
]

skill_entity_ids = [entity_id for entity_id, _, _ in skills_in_tf]
skill_entity_labels = [label for _, label, _ in skills_in_tf]
skill_entity_uris_in_tf = [uri for _, _, uri in skills_in_tf]

print("Skills that appear as entities in the TriplesFactory:", len(skill_entity_ids))

# Entity ids as a long tensor (requires `import torch` in the imports cell).
skill_entity_ids_tensor = torch.as_tensor(skill_entity_ids, dtype=torch.long)