In [1]:
import pandas as pd

from pyrdf2vec import RDF2VecTransformer
from pyrdf2vec.embedders import Word2Vec
from pyrdf2vec.graphs import KG
from pyrdf2vec.walkers import RandomWalker
from pyrdf2vec.graphs import Vertex
from pyrdf2vec.samplers import PageRankSampler

from rdflib import Graph
import rdflib

In [2]:
knowledge_graph = KG("Query_2020_sortiert_ohneAutor__mitTitel_editiert.ttl", skip_predicates= {"http://prismstandard.org/namespaces/basic/2.0/publicationDate", "http://purl.org/spar/fabio/hasPublicationYear"}, literals=[
    ["http://www.w3.org/2004/02/skos/core#prefLabel"],
])

In [3]:
for subj, pred, obj in rdflib.Graph().parse(
    "Query_2020_sortiert_ohneAutor__mitTitel_editiert.ttl", format="ttl"
):
    subj = Vertex(str(subj))
    obj = Vertex(str(obj))
    pred = Vertex(str(pred), predicate=True, vprev=subj, vnext=obj)
    #knowledge_graph.add_walk(subj, pred, obj)
    knowledge_graph.add_walk(obj, pred, subj)

    # print(subj.name)

#knowledge_graph

In [5]:
data = pd.read_csv("CSV_2020_sortiert.csv", sep=",")
entities = [entity for entity in data["relatedConceptsWork"]]
entities_distinct = list(dict.fromkeys(entities))
entities_distinct

['https://semopenalex.org/concept/C71924100',
 'https://semopenalex.org/concept/C126322002',
 'https://semopenalex.org/concept/C121608353',
 'https://semopenalex.org/concept/C98274493',
 'https://semopenalex.org/concept/C2983331546',
 'https://semopenalex.org/concept/C2778250585',
 'https://semopenalex.org/concept/C2780234812',
 'https://semopenalex.org/concept/C126838900',
 'https://semopenalex.org/concept/C192562407',
 'https://semopenalex.org/concept/C171250308',
 'https://semopenalex.org/concept/C136229726',
 'https://semopenalex.org/concept/C81288441',
 'https://semopenalex.org/concept/C143753070',
 'https://semopenalex.org/concept/C19527891',
 'https://semopenalex.org/concept/C2779820397',
 'https://semopenalex.org/concept/C2779949491',
 'https://semopenalex.org/concept/C3019816032',
 'https://semopenalex.org/concept/C513720949',
 'https://semopenalex.org/concept/C182606246',
 'https://semopenalex.org/concept/C86803240',
 'https://semopenalex.org/concept/C18903297',
 'https://sem

In [6]:
transformer = RDF2VecTransformer(
    Word2Vec(epochs=10),
    walkers=[RandomWalker(max_depth=4, max_walks=10)],
    verbose=1
)

In [7]:
embeddings, literals = transformer.fit_transform(knowledge_graph, entities_distinct)

100%|██████████| 2003/2003 [09:11<00:00,  3.63it/s]


Extracted 19805 walks for 2003 entities (553.4013s)


  0%|          | 0/2003 [00:00<?, ?it/s]

Fitted 19805 walks (0.7226s)


100%|██████████| 2003/2003 [00:00<00:00, 7495.01it/s]

Extracted 2003 literals for 2003 entities (0.2687s)





In [8]:
embeddings

[array([-0.12189107,  0.05123978,  0.05657202,  0.06954403, -0.1713304 ,
        -0.09899313,  0.10504299,  0.24684024, -0.14056848, -0.04463802,
         0.05871847,  0.03025903, -0.01439286,  0.08620734, -0.17048855,
        -0.06317417,  0.10021387, -0.05424181, -0.19395937, -0.23248583,
         0.03126024,  0.08511013,  0.19070923, -0.07102232,  0.03989666,
        -0.08466835,  0.08570877,  0.0385919 , -0.09077927,  0.09167073,
         0.08142269, -0.1520138 ,  0.17726795, -0.12545314, -0.08779445,
         0.04486979,  0.1199197 , -0.05659667, -0.07337215, -0.10355248,
        -0.06478576,  0.04236432, -0.17587006, -0.02032928,  0.0037214 ,
        -0.04573379, -0.06911426,  0.02410953,  0.07234255,  0.0791172 ,
        -0.06501773, -0.01549797,  0.02090495,  0.02359304, -0.07579219,
        -0.00770836,  0.08139125, -0.20558955,  0.00153547,  0.03257143,
         0.05241986,  0.00192779,  0.10272928, -0.04347827, -0.06192439,
         0.12555353, -0.02873925,  0.13905416, -0.1

In [9]:
literals

[[('https://semopenalex.org/concept/C71924100', 'Medicine')],
 [('https://semopenalex.org/concept/C126322002', 'Internal medicine')],
 [('Cancer', 'https://semopenalex.org/concept/C121608353')],
 [('https://semopenalex.org/concept/C98274493', 'Pharmacology')],
 [('Cancer therapy', 'https://semopenalex.org/concept/C2983331546')],
 [('https://semopenalex.org/concept/C2778250585', 'Curcumin')],
 [('https://semopenalex.org/concept/C2780234812', 'Cancer prevention')],
 [('Radiology', 'https://semopenalex.org/concept/C126838900')],
 [('https://semopenalex.org/concept/C192562407', 'Materials science')],
 [('Nanotechnology', 'https://semopenalex.org/concept/C171250308')],
 [('Biomedical engineering', 'https://semopenalex.org/concept/C136229726')],
 [('https://semopenalex.org/concept/C81288441', 'Ultrasonic sensor')],
 [('Ultrasound', 'https://semopenalex.org/concept/C143753070')],
 [('Medical physics', 'https://semopenalex.org/concept/C19527891')],
 [('https://semopenalex.org/concept/C27798203

In [10]:
pd.DataFrame(embeddings).to_csv("D:/robin/Desktop/embeddings_concepts_2020_2.csv", header=None, index=None)

In [9]:
pd.DataFrame(literals).to_csv("D:/robin/Desktop/literals_concepts_2020_2.csv", header=None, index=None)