In [2]:
import pandas as pd
import numpy as np

from pyrdf2vec import RDF2VecTransformer
from pyrdf2vec.embedders import Word2Vec
from pyrdf2vec.graphs import KG
from pyrdf2vec.walkers import RandomWalker

from rdflib import Graph
import rdflib

### Define the entities

In [3]:
# Load the sample data, e.g. concepts related to cancer
# (the CSV was exported from the SemOpenAlex SPARQL endpoint).
data = pd.read_csv("Samples/onlyConcepts.csv", sep=",")

# Every value of the "concept" column becomes one entity to embed.
entities = list(data["concept"])

### Define the Knowledge Graph via the SPARQL endpoint
The embeddings are calculated for the Concepts, so the literals requested are the Concept Title, the Count of Works and the Cited-By Count

In [4]:
# Knowledge graph located at the SemOpenAlex SPARQL endpoint.
# - skip_predicates: the rdf:type predicate is listed as a predicate to skip.
# - literals: one predicate chain per literal to fetch for every entity, in
#   this order: preferred label, works count, cited-by count.
knowledge = KG(
    "https://semopenalex.org/sparql",
    skip_predicates={"http://www.w3.org/1999/02/22-rdf-syntax-ns#type"},
    literals=[
        ["http://www.w3.org/2004/02/skos/core#prefLabel"],
        ["https://semopenalex.org/property/worksCount"],
        ["https://semopenalex.org/property/citedByCount"],
    ],
)

In [5]:
# Show the resolved KG configuration (location, skipped predicates, literal chains, ...).
print(knowledge)

KG(location='https://semopenalex.org/sparql', skip_predicates={'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'}, literals=[['http://www.w3.org/2004/02/skos/core#prefLabel'], ['https://semopenalex.org/property/worksCount'], ['https://semopenalex.org/property/citedByCount']], fmt=None, mul_req=False, skip_verify=False, cache=TTLCache([], maxsize=1024, currsize=0), _is_remote=True)


### Define the RDF2VecTransformer
 Embedder: Word2Vec (Standard)
 Walker: RandomWalker (Standard)
 Starts at a node (entity) and then repeatedly moves to a randomly chosen neighbor node (another entity). These movements are recorded by the algorithm as walks. The walker has these parameters:
     - max_walks: Specifies how many walks (paths) are generated per node (entity)
     - max_depth: Determines the maximum depth a random walker can reach while traversing the graph
     - with_reverse: If True, edges can also be followed in the reverse direction; if False, they cannot
     - n_jobs: Specifies how many jobs run in parallel (i.e. how many paths are generated simultaneously). The higher the value, the more processing power is required

In [6]:
# Seed for the random walks, so that re-running the notebook extracts
# comparable walks instead of a fresh random sample every time.
RANDOM_STATE = 1

# RDF2Vec pipeline: random walks over the graph, embedded with Word2Vec.
transformer = RDF2VecTransformer(
    # workers=1: multi-threaded Word2Vec training is not deterministic.
    # NOTE(review): gensim additionally requires a fixed PYTHONHASHSEED for
    # fully reproducible vectors - set it in the environment if that matters.
    Word2Vec(epochs=10, workers=1),
    walkers=[
        RandomWalker(
            max_walks=10,      # walks generated per entity
            max_depth=4,       # maximum depth of a single walk
            with_reverse=False,
            n_jobs=2,          # parallel jobs used for walk extraction
            # NOTE(review): with n_jobs > 1 the seed is best-effort only -
            # confirm determinism against the pyRDF2Vec version in use.
            random_state=RANDOM_STATE,
        )
    ],
    verbose=1,
)

In [7]:
# Run the pipeline on the remote graph: extract the walks, fit the embedder
# and fetch the configured literals for the given entities.
# This is the slow step of the notebook (several minutes against the endpoint).
fit_result = transformer.fit_transform(knowledge, entities)
embeddings, literals = fit_result

100%|██████████| 1122/1122 [14:16<00:00,  1.31it/s]


Extracted 11220 walks for 1122 entities (858.8786s)
Fitted 11220 walks (0.5486s)


100%|██████████| 1122/1122 [00:00<00:00, 553517.89it/s]


Extracted 1122 literals for 1122 entities (416.6516s)


In [None]:
# Preview only the first few embedding vectors instead of dumping all of them.
embeddings[:5]

In [None]:
# Preview only the first few literal lists instead of dumping all of them.
literals[:5]

### Transform the embeddings from multiple dimensions to 2 dimensions

In [11]:
from sklearn.manifold import TSNE

# Stack the embedding vectors into a single matrix and project it with t-SNE.
# The fixed random_state keeps the projection repeatable.
embedding_matrix = np.vstack(embeddings)
tsne = TSNE(random_state=1)
X_tsne = tsne.fit_transform(embedding_matrix)

### Data preparation for Visualization

The Concept Titles and the Count of Works are saved together in one list per concept, but to create a DataFrame together with the embeddings, the values have to be separated

In [20]:
# Split the literal list of every entity into separate columns:
# position 0 holds the concept title, position 1 the works count.
conceptTitle, worksCount = [], []
for entity_literals in literals:
    conceptTitle.append(entity_literals[0])
    worksCount.append(entity_literals[1])

Next, the concept titles, count of works and the 2-dimensional embeddings are stored together in a DataFrame

In [26]:
# One row per concept: its title, its two t-SNE coordinates and its works count.
tsne_x, tsne_y = X_tsne[:, 0], X_tsne[:, 1]
data = pd.DataFrame(
    {
        "Concept Title": conceptTitle,
        "X": tsne_x,
        "Y": tsne_y,
        "countWork": worksCount,
    }
)

In [None]:
# Preview the first rows instead of rendering the whole DataFrame.
data.head()

For the visualization it is useful to be able to sort the embeddings; therefore, the distance of each point to the center (the origin) is calculated here.
The distance is computed with NumPy's square-root function using the Pythagorean theorem: $\sqrt{X^2 + Y^2}$

In [28]:
# Euclidean distance of every point from the origin of the 2-D plane,
# stored as an extra column.
squared_radius = data['X'] ** 2 + data['Y'] ** 2
data['distance'] = np.sqrt(squared_radius)

#### Create RDF-File

To upload the data into Metaphactory, the DataFrame has to be converted into an RDF file; for this we used the library rdflib
- For the declaration of the namespace we used an example URI ("http://example.com/")
- Each concept (e.g. concept1) is defined by assigning it:
    1. the type Concept
    2. the concept title
    3. the x-coordinate
    4. the y-coordinate
    5. the distance, used for sorting
    6. the count of works

- !!! Attention, for the distinction in the metaphactory each predicate (e.g. namespace.title) must be unique

In [29]:
from rdflib import XSD

# Convert every DataFrame row into one RDF resource and write the graph as Turtle.
g = rdflib.Graph()
namespace = rdflib.Namespace("http://example.com/")
for i, row in data.iterrows():
    # The subject URI is derived from the row index: concept0, concept1, ...
    s = rdflib.URIRef(f"http://example.com/concept{i}")
    g.add((s, rdflib.RDF.type, namespace.Concept))
    g.add((s, namespace.title, rdflib.Literal(row['Concept Title'])))
    g.add((s, namespace.x, rdflib.Literal(row['X'], datatype=XSD.float)))
    g.add((s, namespace.y, rdflib.Literal(row['Y'], datatype=XSD.float)))
    g.add((s, namespace.distance, rdflib.Literal(row['distance'], datatype=XSD.float)))

    # The works count arrives as a float (e.g. 12345.0, or NaN when missing).
    # Passing a float with datatype=XSD.integer would give the literal the
    # lexical form "12345.0" / "nan", which is not a valid xsd:integer.
    # Cast to int first and leave the triple out when there is no usable number.
    works_count = row['countWork']
    is_number = isinstance(works_count, (int, float, np.integer, np.floating))
    if is_number and np.isfinite(works_count):
        g.add((s, namespace.worksCount, rdflib.Literal(int(works_count), datatype=XSD.integer)))

g.serialize(destination='Outcomes/embeddings_conceptsOnly.ttl', format='turtle')

<Graph identifier=N3c505c056822423fa4cfb3e6b3580ef5 (<class 'rdflib.graph.Graph'>)>