In [89]:
import pandas as pd
import numpy as np
from pyrdf2vec import RDF2VecTransformer
from pyrdf2vec.embedders import Word2Vec
from pyrdf2vec.graphs import KG
from pyrdf2vec.walkers import RandomWalker

from rdflib import Graph
import rdflib

### Define the entities

In [90]:
#Load Data e.g. Data from 2022(The CSV is from Semopenalex SPARQL)
data_work = pd.read_csv("Samples/Citedwork2018.csv", sep=",")

#Define the entities from the CSV (here: All works & cited work on Cancer Therapy from 2022)
entities = [entity for entity in data_work["citedwork"]]

#Conect all entities
entities_distinct = list(dict.fromkeys(entities))

In [91]:
print(len(entities_distinct))

1699


### Define the Knowledge Graph about the SPARQL-Endpoint
The embeddings are calculated for the Works, but the goal is to represent the concepts. So the Literals are the Concept Titles

In [92]:
knowledge_graph=KG(
    "https://semopenalex.org/sparql",
skip_predicates={"http://www.w3.org/1999/02/22-rdf-syntax-ns#type"}, 
    literals=[
        ["https://semopenalex.org/property/hasConcept",
         "http://www.w3.org/2004/02/skos/core#prefLabel"]
        
    ]
)

In [93]:
print(knowledge_graph)

KG(location='https://semopenalex.org/sparql', skip_predicates={'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'}, literals=[['https://semopenalex.org/property/hasConcept', 'http://www.w3.org/2004/02/skos/core#prefLabel']], fmt=None, mul_req=False, skip_verify=False, cache=TTLCache({}, maxsize=1024, currsize=0), _is_remote=True)


### Define the RDF2VecTransformer
 Embedder: Word2Vec (Standard)
 Walker: RandomWalker (Standard)
 Starts randomly at one node(entity) and then randomly moves to the neighbor node(other entities). These movements are recorded by the algorithm. For the algorithm there are these parameters:
     - max_walks: Specifies how many paths are to be generated per node (entity)
     - max_depth: Determines the maximum depth a random walker can reach while traversing the graph
     - with_reverse: With True also reversed edges can be used, with False not
     - n_jobs: Specifies many jobs (i.e. the number of paths that are generated simultaneously) to be controlled at the same time. The higher the value, the more processing power is required


In [94]:
transformer = RDF2VecTransformer(
    Word2Vec(epochs=10),
    walkers=[RandomWalker(max_walks=10, max_depth=4, with_reverse=False, n_jobs=2)],
    verbose=1
)

In [95]:
#Calculate the Embeddings & create for each embedding the defined Literals
embeddings, literals = transformer.fit_transform(knowledge_graph, entities_distinct)

100%|██████████| 1699/1699 [41:04<00:00,  1.45s/it] 


Extracted 16990 walks for 1699 entities (2467.7279s)
Fitted 16990 walks (1.0068s)


100%|██████████| 1699/1699 [00:00<00:00, 1687455.01it/s]


Extracted 1699 literals for 1699 entities (253.8068s)


In [96]:
embeddings

[array([-4.5710284e-02,  5.5575173e-02,  2.4084382e-02,  4.9490076e-02,
         2.3058737e-02, -8.6917832e-02,  7.4896358e-02,  1.5293969e-01,
        -8.1257261e-02, -5.7414409e-02,  1.1800622e-02, -1.3568592e-01,
        -3.7634898e-02,  2.2526974e-02,  2.8417271e-02, -3.8297180e-02,
         8.8883840e-02, -7.0242728e-05, -6.0522798e-02, -1.7064418e-01,
         7.8726463e-02,  4.7030002e-02,  9.4142288e-02, -5.0941814e-02,
         3.3704858e-02,  5.6416709e-02, -3.9138034e-02,  6.8817794e-02,
        -6.5452404e-02,  1.5310669e-02,  7.6611452e-02, -7.2657146e-02,
         1.7168673e-02, -1.1056885e-01, -5.5261187e-02,  5.6791630e-02,
         6.1276708e-02,  1.9868482e-02, -6.0178556e-02, -5.4659564e-02,
         2.9623067e-02, -4.5222342e-02, -4.7664866e-02,  5.7690643e-02,
         6.8580791e-02, -1.1177688e-02, -5.1728293e-02, -4.4686697e-02,
         4.6193007e-02,  2.9933199e-02,  4.8985183e-02, -5.4647956e-02,
         7.8134396e-04, -1.3255294e-02, -4.9847297e-02, -2.84141

In [97]:
literals

[[('Medicine',
   'Internal medicine',
   'Surgery',
   'Cancer',
   'Radiation therapy',
   'Oncology',
   'Fluorouracil',
   'Gastroenterology',
   'Adverse effect',
   'Anal cancer',
   'Chills',
   'Tolerability',
   'Mitomycin C')],
 [('Medicine',
   'Biology',
   'Internal medicine',
   'Immunology',
   'Immune system',
   'Cancer immunotherapy',
   'Cancer',
   'Oncology',
   'Immunotherapy',
   'Cancer research',
   'Lung cancer',
   'Effector',
   'Innate immune system')],
 [('Medicine',
   'Internal medicine',
   'Pathology',
   'Virus',
   'Virology',
   'Immunohistochemistry',
   'Adverse effect',
   'Regimen',
   'Oncolytic virus',
   'Virotherapy',
   'Viral shedding',
   'Common Terminology Criteria for Adverse Events')],
 [('Biology',
   'Gene',
   'Genetics',
   'Antibody',
   'Molecular biology',
   'Virus',
   'Virology',
   'Genome',
   'Immunogenicity',
   'Recombinant DNA',
   'Fusion protein',
   'In silico',
   'Capsid',
   'Parvovirus',
   'Chimera (genetics)',

### Transform the embeddings from multiple dimensions to 2 dimensions

In [98]:
from sklearn.manifold import TSNE
X_tsne = TSNE(random_state=1).fit_transform(np.vstack(embeddings))



### Data-Cleaning

The literals are given in a nested list. This is useless for the DataFrame, so the inner list is resolved

In [99]:
import itertools
new_list = list(itertools.chain.from_iterable(literals))

Now we have a list with a tuple for each entity(work). The tuple contains for each entity(work) the concepts or the concept title

In [100]:
new_list

[('Medicine',
  'Internal medicine',
  'Surgery',
  'Cancer',
  'Radiation therapy',
  'Oncology',
  'Fluorouracil',
  'Gastroenterology',
  'Adverse effect',
  'Anal cancer',
  'Chills',
  'Tolerability',
  'Mitomycin C'),
 ('Medicine',
  'Biology',
  'Internal medicine',
  'Immunology',
  'Immune system',
  'Cancer immunotherapy',
  'Cancer',
  'Oncology',
  'Immunotherapy',
  'Cancer research',
  'Lung cancer',
  'Effector',
  'Innate immune system'),
 ('Medicine',
  'Internal medicine',
  'Pathology',
  'Virus',
  'Virology',
  'Immunohistochemistry',
  'Adverse effect',
  'Regimen',
  'Oncolytic virus',
  'Virotherapy',
  'Viral shedding',
  'Common Terminology Criteria for Adverse Events'),
 ('Biology',
  'Gene',
  'Genetics',
  'Antibody',
  'Molecular biology',
  'Virus',
  'Virology',
  'Genome',
  'Immunogenicity',
  'Recombinant DNA',
  'Fusion protein',
  'In silico',
  'Capsid',
  'Parvovirus',
  'Chimera (genetics)',
  'Parvoviridae',
  'Minute virus of mice'),
 ('Medicin

Next, the concept titles and the 2-dimensional embeddings are stored together in a DataFrame

In [101]:
data = pd.DataFrame({"Concept Title": new_list, "X":X_tsne[:,0], "Y":X_tsne[:,1]})

In [102]:
data

Unnamed: 0,Concept Title,X,Y
0,"(Medicine, Internal medicine, Surgery, Cancer,...",-12.653873,23.802774
1,"(Medicine, Biology, Internal medicine, Immunol...",19.152061,-11.799558
2,"(Medicine, Internal medicine, Pathology, Virus...",-2.317030,49.115406
3,"(Biology, Gene, Genetics, Antibody, Molecular ...",-2.078132,-7.308976
4,"(Medicine, Biology, Artificial intelligence, C...",-6.567599,-13.177185
...,...,...,...
1694,"(Biology, Gene, Genetics, DNA, Chromosome, Can...",-13.721778,-28.691380
1695,"(Medicine, Biology, Gene, Internal medicine, B...",-24.812325,27.423531
1696,"(Medicine, Biology, Gene, Internal medicine, A...",18.508585,-8.975373
1697,"(Medicine, Biology, Internal medicine, Chemist...",24.393097,37.693562


#### Tupel split

The problem now is that an entity (Work) has multiple concepts. For example Medicine, Internal medicine, ...
The next step is to split the tuples with the concept titles so that each element of the tuple has a separate row in the DataFrame

In [103]:
data_rows = []
# Iteration over 'Concept Title'
for i, tupel in enumerate(data['Concept Title']):
    # Iteration over the elements of the tuple
    for j in tupel:
        # Add the elements, X and Y values to rows
        data_rows.append({'Concept Title': j, 'X': data.at[i, 'X'], 'Y': data.at[i, 'Y']})

#Create new DataFrame
data_new = pd.DataFrame(data_rows)

In [104]:
data_new

Unnamed: 0,Concept Title,X,Y
0,Medicine,-12.653873,23.802774
1,Internal medicine,-12.653873,23.802774
2,Surgery,-12.653873,23.802774
3,Cancer,-12.653873,23.802774
4,Radiation therapy,-12.653873,23.802774
...,...,...,...
26828,Dihydropyrimidine dehydrogenase,5.995165,23.318941
26829,Capecitabine,5.995165,23.318941
26830,Prospective cohort study,5.995165,23.318941
26831,DPYD,5.995165,23.318941


#### Calculate the average

data_new now no longer contains any tuples, yet the concept titles are still contained multiple times in the DataFrame. Therefore, they are grouped in the next step and the average of each group is calculated.
The average could be weighted if it is clear how often an entity (work) has been "visited".

In [105]:
data_mean = data_new.groupby('Concept Title').mean()
data_mean.reset_index(inplace=True)

In [106]:
data_mean

Unnamed: 0,Concept Title,X,Y
0,,-30.742834,-15.838471
1,3D printing,-11.194816,-4.769473
2,3d printed,9.729849,-0.764546
3,A431 cells,26.321373,-11.129070
4,A549 cell,14.848048,6.029537
...,...,...,...
2868,s,-30.742834,-15.838471
2869,stat,14.191254,-28.270386
2870,t,-30.742834,-15.838471
2871,u,-30.742834,-15.838471


#### Set Cancer Therapy in Center

The DataFrame looks good so far, but looking through the data, it is noticeable that Cancer Therapy is not in the center. The reason for this is that the algorithm does not know that we want to calculate the embeddings for Cancer Therapy. Therefore, the concept Cancer Therapy is now placed in the coordinate origin

In [107]:
row_cancerTherapy = data_mean.loc[data_mean["Concept Title"].isin(['Cancer therapy'])]
row_index = row_cancerTherapy.index[0]

In [108]:
# Read X and Y value
x = data_mean.loc[row_index, 'X']
y = data_mean.loc[row_index, 'Y']

In [109]:
# Adjust X values of embeddings
if x > 0:
    for x_index, x_row in data_mean.iterrows():
        data_mean.at[x_index, 'X'] = x_row['X'] - x
elif x < 0:
    for x_index, x_row in data_mean.iterrows():
        data_mean.at[x_index, 'X'] = x_row['X'] + abs(x)

In [110]:
# Adjust Y values of embeddings
if y > 0:
    for y_index, y_row in data_mean.iterrows():
        data_mean.at[y_index, 'Y'] = y_row['Y'] - y
elif y < 0:
    for y_index, y_row in data_mean.iterrows():
        data_mean.at[y_index, 'Y'] = y_row['Y'] + abs(y)

In [111]:
data_mean

Unnamed: 0,Concept Title,X,Y
0,,-36.624420,-15.674702
1,3D printing,-17.076403,-4.605703
2,3d printed,3.848261,-0.600776
3,A431 cells,20.439785,-10.965301
4,A549 cell,8.966461,6.193306
...,...,...,...
2868,s,-36.624420,-15.674702
2869,stat,8.309666,-28.106615
2870,t,-36.624420,-15.674702
2871,u,-36.624420,-15.674702


#### Removal of concepts that are not meaningful

Now the embeddings are positioned correctly. However, there are empty fields, concept titles that consist only of "a" or "b".
Because Semopenalex can only search for 3 or more characters, the filter was also set to 3

In [112]:
data_selected = data_mean[data_mean['Concept Title'].str.len() >=3]

In [113]:
data_selected

Unnamed: 0,Concept Title,X,Y
1,3D printing,-17.076403,-4.605703
2,3d printed,3.848261,-0.600776
3,A431 cells,20.439785,-10.965301
4,A549 cell,8.966461,6.193306
5,ABCC1,-20.975786,-41.704536
...,...,...,...
2861,mitochondrial fusion,-36.603714,13.349359
2863,non-small cell lung cancer (NSCLC),14.677151,-15.762016
2866,p38 mitogen-activated protein kinases,-27.720409,-23.679230
2869,stat,8.309666,-28.106615


#### Removing concepts that are too large and irrelevant

In the DataFrame, many major subject areas are included, such as medicine, biology, ... These concepts are not much use for the display, because they are very large and only over concepts

In [114]:
big_concepts = ['Medicine', 'Biology', 'Internal medicine', 'Genetics', 'Cancer', 'Chemistry', 'Mathematics', 'Materials science', 'Algorithm', 'Computer science']
data_selected = data_selected[~data_selected['Concept Title'].isin(big_concepts)].reset_index(drop=True)
data_selected

Unnamed: 0,Concept Title,X,Y
0,3D printing,-17.076403,-4.605703
1,3d printed,3.848261,-0.600776
2,A431 cells,20.439785,-10.965301
3,A549 cell,8.966461,6.193306
4,ABCC1,-20.975786,-41.704536
...,...,...,...
2843,mitochondrial fusion,-36.603714,13.349359
2844,non-small cell lung cancer (NSCLC),14.677151,-15.762016
2845,p38 mitogen-activated protein kinases,-27.720409,-23.679230
2846,stat,8.309666,-28.106615


In [115]:
data_all = data_selected.copy()
data_all

Unnamed: 0,Concept Title,X,Y
0,3D printing,-17.076403,-4.605703
1,3d printed,3.848261,-0.600776
2,A431 cells,20.439785,-10.965301
3,A549 cell,8.966461,6.193306
4,ABCC1,-20.975786,-41.704536
...,...,...,...
2843,mitochondrial fusion,-36.603714,13.349359
2844,non-small cell lung cancer (NSCLC),14.677151,-15.762016
2845,p38 mitogen-activated protein kinases,-27.720409,-23.679230
2846,stat,8.309666,-28.106615


#### Data preparation for Visualization

For the visualization it is good to be able to sort the embeddings, therefore the distance to the center is calculated here.
For the calculation we used the root function of Numpy to find out the distance to the center using Pythagoras

In [116]:
data_all['distance'] = np.sqrt(data_all['X']**2 + data_all['Y']**2)

#### Add the number of publications

In [120]:
data_countWork = pd.read_csv("Samples/works-count.csv")
#data_countWork.groupby('conceptTitle').sum()
data_merged = pd.merge(data_all, data_countWork, left_on='Concept Title', right_on='conceptTitel')

In [121]:
data_merged

Unnamed: 0,Concept Title,X,Y,distance,conceptTitel,countWork
0,Abandonment (legal),-5.559546,4.988018,7.469195,Abandonment (legal),1
1,Ablation,-0.988450,0.805723,1.275234,Ablation,1
2,Absorbance,-15.025333,9.315926,17.679003,Absorbance,2
3,Absorption (acoustics),-5.842711,-3.612958,6.869552,Absorption (acoustics),1
4,Acetylation,2.201953,-28.772741,28.856874,Acetylation,1
...,...,...,...,...,...,...
1029,Xanthone,13.114241,25.683058,28.837524,Xanthone,1
1030,Zeta potential,6.690432,6.133778,9.076624,Zeta potential,1
1031,Zinc,-9.248442,16.375013,18.806242,Zinc,1
1032,microRNA,-2.000383,-1.112043,2.288705,microRNA,17


#### Create RDF-File

For the upload into the Metaphactory, the DataFrame has to be converted into an RDF file, for this we used the library rdflib
- For the declaration of the namespace we used an example URI ("http://example.com/")
- The definition of each concept is done in such a way that each concept (e.g. concept1) has been assigned:
    1. the type Concept
    2. the concept Title
    3. x-coordinate
    4. y-coordinate
    5. distance, to sort
    6. the year

- !!! Attention, for the distinction in the metaphactory each predicate (e.g. namespace.title_cited) must be unique, therefore the year is appended behind it


In [123]:
from rdflib import XSD

g = rdflib.Graph()
namespace = rdflib.Namespace("http://example.com/")
for i, row in data_merged.iterrows():
    s = rdflib.URIRef(f"http://example.com/concept{i}")
    g.add((s, rdflib.RDF.type, namespace.Concept))
    g.add((s, namespace.title_cited_new, rdflib.Literal(row['Concept Title'])))
    g.add((s, namespace.x_cited_new, rdflib.Literal(row['X'], datatype=XSD.float)))
    g.add((s, namespace.y_cited_new, rdflib.Literal(row['Y'], datatype=XSD.float)))
    g.add((s, namespace.distance_cited_new, rdflib.Literal(row['distance'], datatype=XSD.float)))
    g.add((s, namespace.worksCount_cited_new, rdflib.Literal(row['countWork'], datatype=XSD.integer)))
    g.add((s, namespace.year_cited_new, rdflib.Literal(2018)))

g.serialize(destination='Outcomes/embeddingscited.ttl', format='turtle')

<Graph identifier=Ncf229eb2a5a84aeaa6915084761270e9 (<class 'rdflib.graph.Graph'>)>