In [1]:
import pandas as pd
import numpy as np
from pyrdf2vec import RDF2VecTransformer
from pyrdf2vec.embedders import Word2Vec
from pyrdf2vec.graphs import KG
from pyrdf2vec.walkers import RandomWalker

from rdflib import Graph
import rdflib

### Define the entities

In [2]:
# Load the works exported from the SemOpenAlex SPARQL endpoint (this run reads the 2018 sample file)
data_work = pd.read_csv("Samples/work_2018.csv", sep=",")

# The "work" column supplies the entities for which embeddings are computed
# (per the original note: all works and cited works on Cancer Therapy)
entities = list(data_work["work"])

# Remove duplicates while keeping the first-seen order (dict keys are unique and ordered)
entities_distinct = list(dict.fromkeys(entities))

In [None]:
print(len(entities_distinct))

### Define the Knowledge Graph via the SPARQL endpoint
The embeddings are calculated for the works, but the goal is to represent the concepts. Therefore, the concept titles of each work are retrieved as literals.

In [3]:
# SPARQL endpoint the knowledge graph is read from
sparql_endpoint = "https://semopenalex.org/sparql"

# Predicate chain whose end values are collected as literals for every entity:
# work --hasConcept--> concept --skos:prefLabel--> concept title
concept_title_chain = [
    "https://semopenalex.org/property/hasConcept",
    "http://www.w3.org/2004/02/skos/core#prefLabel",
]

knowledge_graph = KG(
    sparql_endpoint,
    # rdf:type is passed as a predicate to skip
    skip_predicates={"http://www.w3.org/1999/02/22-rdf-syntax-ns#type"},
    literals=[concept_title_chain],
)

In [None]:
print(knowledge_graph)

### Define the RDF2VecTransformer
 Embedder: Word2Vec (Standard)
 Walker: RandomWalker (Standard)
 Starts randomly at one node(entity) and then randomly moves to the neighbor node(other entities). These movements are recorded by the algorithm. For the algorithm there are these parameters:
     - max_walks: Specifies how many paths are to be generated per node (entity)
     - max_depth: Determines the maximum depth a random walker can reach while traversing the graph
     - with_reverse: With True also reversed edges can be used, with False not
     - n_jobs: Specifies how many jobs (i.e. how many paths are generated simultaneously) run at the same time. The higher the value, the more processing power is required


In [4]:
# Walk strategy: 10 walks per entity, at most 4 hops deep, forward edges only, 2 parallel jobs
random_walker = RandomWalker(max_walks=10, max_depth=4, with_reverse=False, n_jobs=2)

# Word2Vec embedder (10 epochs) combined with the random walker above
transformer = RDF2VecTransformer(
    Word2Vec(epochs=10),
    walkers=[random_walker],
    verbose=1,
)

In [None]:
# Compute the embeddings for all distinct works and, alongside them, collect the literals
# (concept titles) configured on the knowledge graph.
# NOTE(review): both results are presumably ordered like entities_distinct - confirm against pyrdf2vec.
embeddings, literals = transformer.fit_transform(knowledge_graph, entities_distinct)

In [None]:
embeddings

In [None]:
literals

### Transform the embeddings from multiple dimensions to 2 dimensions

In [9]:
from sklearn.manifold import TSNE

# Stack the per-entity embedding vectors into one matrix (one row per entity) ...
embedding_matrix = np.vstack(embeddings)
# ... and project it with t-SNE; the seed is fixed so the layout is reproducible
X_tsne = TSNE(random_state=1).fit_transform(embedding_matrix)

### Data-Cleaning

The literals are returned as a nested list (one inner list per work). This structure is impractical for the DataFrame, so the inner lists are flattened by one level

In [10]:
import itertools

# Flatten exactly one nesting level: the inner lists are concatenated in order
new_list = list(itertools.chain(*literals))

Now we have a list with one entry for each entity (work). Each entry is a tuple containing the concept titles of that work (a work with only a single concept may yield a plain string instead of a tuple)

In [None]:
new_list

Next, the concept titles and the 2-dimensional embeddings are stored together in a DataFrame

In [12]:
# One row per work: its concept titles plus the two t-SNE coordinates
data = pd.DataFrame(
    {
        "Concept Title": new_list,
        "X": X_tsne[:, 0],
        "Y": X_tsne[:, 1],
    }
)

In [None]:
data

#### Tuple split

The problem now is that an entity (Work) has multiple concepts. For example Medicine, Internal medicine, ...
The next step is to split the tuples with the concept titles so that each element of the tuple has a separate row in the DataFrame

In [14]:
def _concept_titles(value):
    """Return the concept titles stored in one 'Concept Title' cell as a tuple.

    A cell holds a tuple/list when a work has several concepts, but it can also be a
    single scalar (one concept) or a missing value (no concept). Iterating such a cell
    directly would split a lone title string into its characters or fail on NaN.
    """
    if isinstance(value, (tuple, list)):
        return tuple(value)
    if value is None or pd.isna(value):
        # Work without any concept: contributes no rows
        return ()
    # Single concept: keep the title as one unit
    return (value,)


# One row per (concept title, X, Y); the coordinates of a work are repeated for each of its concepts.
# to_numpy() keeps the coordinate scalars in their original dtype while iterating.
data_rows = [
    {'Concept Title': title, 'X': x_coord, 'Y': y_coord}
    for cell, x_coord, y_coord in zip(
        data['Concept Title'], data['X'].to_numpy(), data['Y'].to_numpy()
    )
    for title in _concept_titles(cell)
]

# Create the new DataFrame from the collected rows
data_new = pd.DataFrame(data_rows)

In [None]:
data_new

#### Calculate the average

data_new now no longer contains any tuples, yet the concept titles are still contained multiple times in the DataFrame. Therefore, they are grouped in the next step and the average of each group is calculated.
The average could be weighted if it is clear how often an entity (work) has been "visited".

In [16]:
# Average the coordinates of all rows sharing a concept title; the title becomes a regular column again
data_mean = data_new.groupby('Concept Title').mean().reset_index()

In [None]:
data_mean

#### Set Cancer Therapy in Center

The DataFrame looks good so far, but looking through the data, it is noticeable that Cancer Therapy is not in the center. The reason for this is that the algorithm does not know that we want to calculate the embeddings for Cancer Therapy. Therefore, the concept Cancer Therapy is now placed in the coordinate origin

In [18]:
# Locate the row of the anchor concept that will be moved to the coordinate origin
row_cancerTherapy = data_mean.loc[data_mean["Concept Title"] == 'Cancer therapy']
if row_cancerTherapy.empty:
    # Fail with an explanatory message instead of the bare "index 0 is out of bounds"
    raise IndexError("Concept 'Cancer therapy' was not found in data_mean['Concept Title']")
row_index = row_cancerTherapy.index[0]

In [19]:
# Coordinates of the anchor concept; they are the offset subtracted when centring
x, y = (data_mean.at[row_index, axis] for axis in ('X', 'Y'))

In [20]:
# Shift all X values so that the anchor concept ends up at X = 0.
# Subtracting x covers every case of the former branches: for x < 0 it equals adding abs(x),
# and for x == 0 it leaves the values unchanged.
# Note: the shift is relative to the x read earlier, so re-running this cell without
# re-reading x shifts the data again.
data_mean['X'] = data_mean['X'] - x

In [21]:
# Shift all Y values so that the anchor concept ends up at Y = 0.
# Subtracting y covers every case of the former branches: for y < 0 it equals adding abs(y),
# and for y == 0 it leaves the values unchanged.
# Note: the shift is relative to the y read earlier, so re-running this cell without
# re-reading y shifts the data again.
data_mean['Y'] = data_mean['Y'] - y

In [None]:
data_mean

#### Removal of concepts that are not meaningful

Now the embeddings are positioned correctly. However, the DataFrame still contains empty fields and concept titles that consist only of a single character such as "a" or "b".
Because SemOpenAlex can only search for terms with 3 or more characters, the minimum title length was also set to 3

In [24]:
# Keep only rows whose concept title is at least 3 characters long
data_selected = data_mean.loc[data_mean['Concept Title'].str.len().ge(3)]

In [None]:
data_selected

#### Removing concepts that are too large and irrelevant

The DataFrame contains many major subject areas, such as medicine, biology, ... These concepts are of little use for the display, because they are very broad umbrella concepts

In [None]:
# Broad subject areas that are excluded from the result
big_concepts = [
    'Medicine',
    'Biology',
    'Internal medicine',
    'Genetics',
    'Cancer',
    'Chemistry',
    'Mathematics',
    'Materials science',
    'Algorithm',
    'Computer science',
]
is_big_concept = data_selected['Concept Title'].isin(big_concepts)
# Drop the broad concepts and renumber the remaining rows from 0
data_selected = data_selected.loc[~is_big_concept].reset_index(drop=True)
data_selected

In [None]:
# Work on an independent copy so that the following steps do not modify data_selected
data_2022 = data_selected.copy()
data_2022

#### Show the Top Embeddings

For visualization, it will be difficult to show over 1000 concepts in one chart, so the next cell determines the top embeddings (the ones with the smallest distance to Cancer Therapy, measured as |X| + |Y|) and outputs them

In [None]:
# Number of concepts closest to the origin (Cancer Therapy) to keep
top_embeddings = 100

# Distance to the origin as |X| + |Y|, stored in 'Abs_Gesamt'; keep the rows with the smallest distance
data_2022 = (
    data_2022
    .assign(Abs_Gesamt=lambda frame: frame['X'].abs() + frame['Y'].abs())
    .nsmallest(top_embeddings, 'Abs_Gesamt')
)
data_2022

#### Add the number of publications

In [None]:
# Publication counts per concept, for the works and for the related works
data_countWork_2022 = pd.read_csv("Samples/2022/count_work_2022.csv")
data_countRelatedWork_2022 = pd.read_csv("Samples/2022/count_relatedWork_2022.csv")

# Stack both tables row-wise. DataFrame.append was removed in pandas 2.0;
# pd.concat is the replacement and produces the same stacked frame.
data_count_2022 = pd.concat([data_countWork_2022, data_countRelatedWork_2022])

# Sum the counts of concepts that occur in both tables
data_count_2022 = data_count_2022.groupby('conceptTitle').sum()

# Attach the counts to the selected concepts (inner join on the concept title)
data_2022_merged = pd.merge(data_2022, data_count_2022, left_on='Concept Title', right_on='conceptTitle')

In [None]:
data_2022_merged

#### Create RDF-File

For the upload into the Metaphactory, the DataFrame has to be converted into an RDF file, for this we used the library rdflib
- For the declaration of the namespace we used an example URI ("http://example.com/")
- Each concept (e.g. concept1) is defined by assigning it:
    1. the type Concept
    2. the concept title
    3. the x-coordinate
    4. the y-coordinate
    5. the distance, used for sorting
    6. the number of works
    7. the year

- !!! Attention, for the distinction in the metaphactory each predicate (e.g. namespace.title_2018) must be unique, therefore the year is appended behind it


In [33]:
from rdflib import XSD

# Year of the data set. It is part of every predicate name (predicates must be unique per
# year in the target system), is stored as the year literal and names the output file.
year = 2018
suffix = f"{year}_new"

g = rdflib.Graph()
namespace = rdflib.Namespace("http://example.com/")
for i, row in data_2022_merged.iterrows():
    # Subject URI: http://example.com/concept<row index>
    s = namespace[f"concept{i}"]
    g.add((s, rdflib.RDF.type, namespace.Concept))
    g.add((s, namespace[f"title_{suffix}"], rdflib.Literal(row['Concept Title'])))
    g.add((s, namespace[f"x_{suffix}"], rdflib.Literal(row['X'], datatype=XSD.float)))
    g.add((s, namespace[f"y_{suffix}"], rdflib.Literal(row['Y'], datatype=XSD.float)))
    g.add((s, namespace[f"distance_{suffix}"], rdflib.Literal(row['Abs_Gesamt'], datatype=XSD.float)))
    g.add((s, namespace[f"worksCount_{suffix}"], rdflib.Literal(row['countWork'], datatype=XSD.integer)))
    g.add((s, namespace[f"year_{suffix}"], rdflib.Literal(year)))

# Write the graph as Turtle; the call is the last expression, so its result is displayed
g.serialize(destination=f'Outcomes/embeddings_{year}.ttl', format='turtle')

<Graph identifier=N2836aafbc24046ed87f482c51a45016a (<class 'rdflib.graph.Graph'>)>