From 58d9836583e8a5a529820175430f6c1614f58db1 Mon Sep 17 00:00:00 2001 From: "M. Sonntag" Date: Mon, 4 Nov 2019 15:50:49 +0100 Subject: [PATCH 01/26] [tools/rdfconverter] Add load_rdf_subclasses func --- odml/tools/rdf_converter.py | 45 ++++++++++++++++++++++++------------- 1 file changed, 29 insertions(+), 16 deletions(-) diff --git a/odml/tools/rdf_converter.py b/odml/tools/rdf_converter.py index 5ffb4c6d..7041b1ce 100644 --- a/odml/tools/rdf_converter.py +++ b/odml/tools/rdf_converter.py @@ -3,12 +3,13 @@ import yaml from io import StringIO -from os.path import dirname, abspath from rdflib import Graph, Literal, URIRef from rdflib.graph import Seq from rdflib.namespace import XSD, RDF import odml + +from ..doc import BaseDocument from ..format import Format, Document, Section, Property from .dict_parser import DictReader from .parser_utils import ParserException @@ -23,6 +24,31 @@ odmlns = Format.namespace() +def load_rdf_subclasses(): + """ + load_rdf_subclasses loads odml section types to RDF Section subclass types + mappings from a file and returns the mapping as a dictionary. + Will return an empty dictionary, if the Subclasses file cannot be loaded. + + :return: Dictionary of the form {'Section type': 'RDF class type'} + """ + section_subclasses = {} + + subclass_file = os.path.join(odml.__path__[0], 'resources', 'section_subclasses.yaml') + + if not os.path.isfile(subclass_file): + print("[Warning] Could not find subclass file '%s'" % subclass_file) + return section_subclasses + + with open(subclass_file, "r") as yaml_file: + try: + section_subclasses = yaml.load(yaml_file) + except yaml.parser.ParserError as err: + print("[Error] Loading RDF subclass file: %s" % err) + + return section_subclasses + + class RDFWriter(object): """ A writer to parse odML files into RDF documents. 
@@ -36,25 +62,12 @@ def __init__(self, odml_documents): """ :param odml_documents: list of odml documents """ - self.docs = odml_documents if not isinstance(odml_documents, odml.doc.BaseDocument) else [odml_documents] + self.docs = odml_documents if not isinstance(odml_documents, BaseDocument) else [odml_documents] self.hub_root = None self.g = Graph() self.g.bind("odml", odmlns) - self.section_subclasses = {} - - subclass_path = os.path.join(odml.__path__[0], 'resources', - 'section_subclasses.yaml') - - if os.path.isfile(subclass_path): - with open(subclass_path, "r") as f: - try: - self.section_subclasses = yaml.load(f) - except yaml.parser.ParserError as err: - print(err) - return - else: - print("[Warning] Could not find subclass file '%s'" % subclass_path) + self.section_subclasses = load_rdf_subclasses() def convert_to_rdf(self): self.hub_root = URIRef(odmlns.Hub) From 2b843a4388033f134eced864b2efd136781fc35d Mon Sep 17 00:00:00 2001 From: "M. Sonntag" Date: Mon, 4 Nov 2019 16:10:49 +0100 Subject: [PATCH 02/26] [tools/rdfconverter] Fix len as condition occurences --- odml/tools/rdf_converter.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/odml/tools/rdf_converter.py b/odml/tools/rdf_converter.py index 7041b1ce..34422270 100644 --- a/odml/tools/rdf_converter.py +++ b/odml/tools/rdf_converter.py @@ -126,21 +126,21 @@ def save_element(self, e, node=None): # generating nodes for entities: sections, properties and bags of values elif (isinstance(fmt, Document.__class__) or isinstance(fmt, Section.__class__)) and \ - k == 'sections' and len(getattr(e, k)) > 0: + k == 'sections' and getattr(e, k): sections = getattr(e, k) for s in sections: node = URIRef(odmlns + unicode(s.id)) self.g.add((curr_node, fmt.rdf_map(k), node)) self.save_element(s, node) elif isinstance(fmt, Section.__class__) and \ - k == 'properties' and len(getattr(e, k)) > 0: + k == 'properties' and getattr(e, k): properties = getattr(e, k) for p in properties: 
node = URIRef(odmlns + unicode(p.id)) self.g.add((curr_node, fmt.rdf_map(k), node)) self.save_element(p, node) elif isinstance(fmt, Property.__class__) and \ - k == 'value' and len(getattr(e, fmt.map(k))) > 0: + k == 'value' and getattr(e, fmt.map(k)): # "value" needs to be mapped to its appropriate # Property library attribute. values = getattr(e, fmt.map(k)) @@ -271,7 +271,7 @@ def parse_document(self, doc_uri): elif attr[0] == "id": doc_attrs[attr[0]] = doc_uri.split("#", 1)[1] else: - if len(elems) > 0: + if elems: doc_attrs[attr[0]] = unicode(elems[0].toPython()) return {'Document': doc_attrs, 'odml-version': FORMAT_VERSION} @@ -293,7 +293,7 @@ def parse_section(self, sec_uri): elif attr[0] == "id": sec_attrs[attr[0]] = sec_uri.split("#", 1)[1] else: - if len(elems) > 0: + if elems: sec_attrs[attr[0]] = unicode(elems[0].toPython()) self._check_mandatory_attrs(sec_attrs) return sec_attrs @@ -303,14 +303,14 @@ def parse_property(self, prop_uri): prop_attrs = {} for attr in rdf_prop.rdf_map_items: elems = list(self.g.objects(subject=prop_uri, predicate=attr[1])) - if attr[0] == "value" and len(elems) > 0: + if attr[0] == "value" and elems: prop_attrs[attr[0]] = [] # rdflib does not respect order with RDF.li items yet, see comment above # support both RDF.li and rdf:_nnn for now. # Remove rdf:_nnn once rdflib respects RDF.li order in an RDF.Seq obj. values = list(self.g.objects(subject=elems[0], predicate=RDF.li)) - if len(values) > 0: + if values: for v in values: prop_attrs[attr[0]].append(v.toPython()) else: @@ -322,7 +322,7 @@ def parse_property(self, prop_uri): elif attr[0] == "id": prop_attrs[attr[0]] = prop_uri.split("#", 1)[1] else: - if len(elems) > 0: + if elems: prop_attrs[attr[0]] = unicode(elems[0].toPython()) self._check_mandatory_attrs(prop_attrs) return prop_attrs From e83dafea2df57a52b72c49f27d752f2022dbce5b Mon Sep 17 00:00:00 2001 From: "M. 
Sonntag" Date: Mon, 4 Nov 2019 16:12:30 +0100 Subject: [PATCH 03/26] [tools/rdfconverter] Simplify if statements --- odml/tools/rdf_converter.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/odml/tools/rdf_converter.py b/odml/tools/rdf_converter.py index 34422270..9814bbb3 100644 --- a/odml/tools/rdf_converter.py +++ b/odml/tools/rdf_converter.py @@ -270,9 +270,8 @@ def parse_document(self, doc_uri): doc_attrs[attr[0]].append(self.parse_section(s)) elif attr[0] == "id": doc_attrs[attr[0]] = doc_uri.split("#", 1)[1] - else: - if elems: - doc_attrs[attr[0]] = unicode(elems[0].toPython()) + elif elems: + doc_attrs[attr[0]] = unicode(elems[0].toPython()) return {'Document': doc_attrs, 'odml-version': FORMAT_VERSION} @@ -292,9 +291,9 @@ def parse_section(self, sec_uri): sec_attrs[attr[0]].append(self.parse_property(p)) elif attr[0] == "id": sec_attrs[attr[0]] = sec_uri.split("#", 1)[1] - else: - if elems: - sec_attrs[attr[0]] = unicode(elems[0].toPython()) + elif elems: + sec_attrs[attr[0]] = unicode(elems[0].toPython()) + self._check_mandatory_attrs(sec_attrs) return sec_attrs @@ -321,9 +320,9 @@ def parse_property(self, prop_uri): elif attr[0] == "id": prop_attrs[attr[0]] = prop_uri.split("#", 1)[1] - else: - if elems: - prop_attrs[attr[0]] = unicode(elems[0].toPython()) + elif elems: + prop_attrs[attr[0]] = unicode(elems[0].toPython()) + self._check_mandatory_attrs(prop_attrs) return prop_attrs From 90882a8e987d094fe95fe6294d044ea93fa2f158 Mon Sep 17 00:00:00 2001 From: "M. 
Sonntag" Date: Mon, 4 Nov 2019 16:33:37 +0100 Subject: [PATCH 04/26] [tools/rdfconverter] Fix invalid-name occurences --- odml/tools/rdf_converter.py | 155 ++++++++++++++++++------------------ test/test_rdf_writer.py | 84 +++++++++---------- 2 files changed, 120 insertions(+), 119 deletions(-) diff --git a/odml/tools/rdf_converter.py b/odml/tools/rdf_converter.py index 9814bbb3..cbec2544 100644 --- a/odml/tools/rdf_converter.py +++ b/odml/tools/rdf_converter.py @@ -21,7 +21,7 @@ except NameError: unicode = str -odmlns = Format.namespace() +ODML_NS = Format.namespace() def load_rdf_subclasses(): @@ -64,128 +64,128 @@ def __init__(self, odml_documents): """ self.docs = odml_documents if not isinstance(odml_documents, BaseDocument) else [odml_documents] self.hub_root = None - self.g = Graph() - self.g.bind("odml", odmlns) + self.graph = Graph() + self.graph.bind("odml", ODML_NS) self.section_subclasses = load_rdf_subclasses() def convert_to_rdf(self): - self.hub_root = URIRef(odmlns.Hub) + self.hub_root = URIRef(ODML_NS.Hub) if self.docs: for doc in self.docs: self.save_element(doc) - return self.g + return self.graph - def save_element(self, e, node=None): + def save_element(self, odml_elem, node=None): """ Save the current element to the RDF graph - :param e: current element + :param odml_elem: current element :param node: A node to pass the earlier created node to inner elements :return: the RDF graph """ - fmt = e.format() + fmt = odml_elem.format() if not node: - curr_node = URIRef(odmlns + unicode(e.id)) + curr_node = URIRef(ODML_NS + unicode(odml_elem.id)) else: curr_node = node if fmt.name == "section": - s = self._get_section_subclass(e) - u = s if s else fmt.rdf_type - self.g.add((curr_node, RDF.type, URIRef(u))) + sub_sec = self._get_section_subclass(odml_elem) + sec_type = sub_sec if sub_sec else fmt.rdf_type + self.graph.add((curr_node, RDF.type, URIRef(sec_type))) else: - self.g.add((curr_node, RDF.type, URIRef(fmt.rdf_type))) + 
self.graph.add((curr_node, RDF.type, URIRef(fmt.rdf_type))) # adding doc to the hub if isinstance(fmt, Document.__class__): - self.g.add((self.hub_root, odmlns.hasDocument, curr_node)) + self.graph.add((self.hub_root, ODML_NS.hasDocument, curr_node)) # If available add the documents filename to the document node # so we can identify where the data came from. - if hasattr(e, "_origin_file_name"): - self.g.add((curr_node, odmlns.hasFileName, Literal(e._origin_file_name))) + if hasattr(odml_elem, "_origin_file_name"): + self.graph.add((curr_node, ODML_NS.hasFileName, Literal(odml_elem._origin_file_name))) for k in fmt.rdf_map_keys: if k == 'id': continue elif (isinstance(fmt, Document.__class__) or isinstance(fmt, Section.__class__)) and k == "repository": - terminology_url = getattr(e, k) + terminology_url = getattr(odml_elem, k) if terminology_url is None or not terminology_url: continue terminology_node = self._get_terminology_by_value(terminology_url) if terminology_node: - self.g.add((curr_node, fmt.rdf_map(k), terminology_node)) + self.graph.add((curr_node, fmt.rdf_map(k), terminology_node)) else: # adding terminology to the hub and to link with the doc - node = URIRef(odmlns + unicode(uuid.uuid4())) - self.g.add((node, RDF.type, URIRef(terminology_url))) - self.g.add((self.hub_root, odmlns.hasTerminology, node)) - self.g.add((curr_node, fmt.rdf_map(k), node)) + node = URIRef(ODML_NS + unicode(uuid.uuid4())) + self.graph.add((node, RDF.type, URIRef(terminology_url))) + self.graph.add((self.hub_root, ODML_NS.hasTerminology, node)) + self.graph.add((curr_node, fmt.rdf_map(k), node)) # generating nodes for entities: sections, properties and bags of values elif (isinstance(fmt, Document.__class__) or isinstance(fmt, Section.__class__)) and \ - k == 'sections' and getattr(e, k): - sections = getattr(e, k) - for s in sections: - node = URIRef(odmlns + unicode(s.id)) - self.g.add((curr_node, fmt.rdf_map(k), node)) - self.save_element(s, node) + k == 'sections' and 
getattr(odml_elem, k): + sections = getattr(odml_elem, k) + for curr_sec in sections: + node = URIRef(ODML_NS + unicode(curr_sec.id)) + self.graph.add((curr_node, fmt.rdf_map(k), node)) + self.save_element(curr_sec, node) elif isinstance(fmt, Section.__class__) and \ - k == 'properties' and getattr(e, k): - properties = getattr(e, k) - for p in properties: - node = URIRef(odmlns + unicode(p.id)) - self.g.add((curr_node, fmt.rdf_map(k), node)) - self.save_element(p, node) + k == 'properties' and getattr(odml_elem, k): + properties = getattr(odml_elem, k) + for curr_prop in properties: + node = URIRef(ODML_NS + unicode(curr_prop.id)) + self.graph.add((curr_node, fmt.rdf_map(k), node)) + self.save_element(curr_prop, node) elif isinstance(fmt, Property.__class__) and \ - k == 'value' and getattr(e, fmt.map(k)): + k == 'value' and getattr(odml_elem, fmt.map(k)): # "value" needs to be mapped to its appropriate # Property library attribute. - values = getattr(e, fmt.map(k)) - seq = URIRef(odmlns + unicode(uuid.uuid4())) - self.g.add((seq, RDF.type, RDF.Seq)) - self.g.add((curr_node, fmt.rdf_map(k), seq)) + values = getattr(odml_elem, fmt.map(k)) + seq = URIRef(ODML_NS + unicode(uuid.uuid4())) + self.graph.add((seq, RDF.type, RDF.Seq)) + self.graph.add((curr_node, fmt.rdf_map(k), seq)) + # rdflib so far does not respect RDF:li item order # in RDF:Seq on loading so we have to use custom # numbered Node elements for now. Once rdflib upgrades # this should be reversed to RDF:li again! 
# see https://github.com/RDFLib/rdflib/issues/280 # -- keep until supported - # bag = URIRef(odmlns + unicode(uuid.uuid4())) - # self.g.add((bag, RDF.type, RDF.Bag)) - # self.g.add((curr_node, fmt.rdf_map(k), bag)) - # for v in values: - # self.g.add((bag, RDF.li, Literal(v))) - + # bag = URIRef(ODML_NS + unicode(uuid.uuid4())) + # self.graph.add((bag, RDF.type, RDF.Bag)) + # self.graph.add((curr_node, fmt.rdf_map(k), bag)) + # for curr_val in values: + # self.graph.add((bag, RDF.li, Literal(curr_val))) counter = 1 - for v in values: + for curr_val in values: pred = "%s_%s" % (unicode(RDF), counter) - self.g.add((seq, URIRef(pred), Literal(v))) + self.graph.add((seq, URIRef(pred), Literal(curr_val))) counter = counter + 1 # adding entities' properties else: - val = getattr(e, k) + val = getattr(odml_elem, k) if val is None or not val: continue elif k == 'date': - self.g.add((curr_node, fmt.rdf_map(k), Literal(val, datatype=XSD.date))) + self.graph.add((curr_node, fmt.rdf_map(k), Literal(val, datatype=XSD.date))) else: - self.g.add((curr_node, fmt.rdf_map(k), Literal(val))) - return self.g + self.graph.add((curr_node, fmt.rdf_map(k), Literal(val))) + return self.graph def _get_terminology_by_value(self, url): - return self.g.value(predicate=RDF.type, object=URIRef(url)) + return self.graph.value(predicate=RDF.type, object=URIRef(url)) - def _get_section_subclass(self, e): + def _get_section_subclass(self, elem): """ :return: RDF identifier of section subclass type if present in section_subclasses dict """ - sec_type = getattr(e, "type") + sec_type = getattr(elem, "type") if sec_type and sec_type in self.section_subclasses: - return odmlns[self.section_subclasses[sec_type]] + return ODML_NS[self.section_subclasses[sec_type]] else: return None @@ -214,8 +214,9 @@ def write_file(self, filename, rdf_format="turtle"): filename_ext = filename if filename.find(RDFConversionFormats.get(rdf_format)) < 0: filename_ext += RDFConversionFormats.get(rdf_format) - with 
open(filename_ext, "w") as wFile: - wFile.write(data) + + with open(filename_ext, "w") as out_file: + out_file.write(data) class RDFReader(object): @@ -231,14 +232,14 @@ class RDFReader(object): def __init__(self, filename=None, doc_format=None): self.docs = [] # list of parsed odml docs if filename and doc_format: - self.g = Graph().parse(source=filename, format=doc_format) + self.graph = Graph().parse(source=filename, format=doc_format) def to_odml(self): """ :return: list of converter odml documents """ - docs_uris = list(self.g.objects(subject=URIRef(odmlns.Hub), - predicate=odmlns.hasDocument)) + docs_uris = list(self.graph.objects(subject=URIRef(ODML_NS.Hub), + predicate=ODML_NS.hasDocument)) for doc in docs_uris: par = self.parse_document(doc) par_doc = DictReader().to_odml(par) @@ -247,15 +248,15 @@ def to_odml(self): return self.docs def from_file(self, filename, doc_format): - self.g = Graph().parse(source=filename, format=doc_format) + self.graph = Graph().parse(source=filename, format=doc_format) docs = self.to_odml() - for d in docs: + for curr_doc in docs: # Provide original file name via the document - d._origin_file_name = os.path.basename(filename) + curr_doc._origin_file_name = os.path.basename(filename) return docs def from_string(self, file, doc_format): - self.g = Graph().parse(source=StringIO(file), format=doc_format) + self.graph = Graph().parse(source=StringIO(file), format=doc_format) return self.to_odml() # TODO check mandatory attrs @@ -263,11 +264,11 @@ def parse_document(self, doc_uri): rdf_doc = Document doc_attrs = {} for attr in rdf_doc.rdf_map_items: - elems = list(self.g.objects(subject=doc_uri, predicate=attr[1])) + elems = list(self.graph.objects(subject=doc_uri, predicate=attr[1])) if attr[0] == "sections": doc_attrs[attr[0]] = [] - for s in elems: - doc_attrs[attr[0]].append(self.parse_section(s)) + for sec in elems: + doc_attrs[attr[0]].append(self.parse_section(sec)) elif attr[0] == "id": doc_attrs[attr[0]] = 
doc_uri.split("#", 1)[1] elif elems: @@ -280,15 +281,15 @@ def parse_section(self, sec_uri): rdf_sec = Section sec_attrs = {} for attr in rdf_sec.rdf_map_items: - elems = list(self.g.objects(subject=sec_uri, predicate=attr[1])) + elems = list(self.graph.objects(subject=sec_uri, predicate=attr[1])) if attr[0] == "sections": sec_attrs[attr[0]] = [] - for s in elems: - sec_attrs[attr[0]].append(self.parse_section(s)) + for sec in elems: + sec_attrs[attr[0]].append(self.parse_section(sec)) elif attr[0] == "properties": sec_attrs[attr[0]] = [] - for p in elems: - sec_attrs[attr[0]].append(self.parse_property(p)) + for prop in elems: + sec_attrs[attr[0]].append(self.parse_property(prop)) elif attr[0] == "id": sec_attrs[attr[0]] = sec_uri.split("#", 1)[1] elif elems: @@ -301,20 +302,20 @@ def parse_property(self, prop_uri): rdf_prop = Property prop_attrs = {} for attr in rdf_prop.rdf_map_items: - elems = list(self.g.objects(subject=prop_uri, predicate=attr[1])) + elems = list(self.graph.objects(subject=prop_uri, predicate=attr[1])) if attr[0] == "value" and elems: prop_attrs[attr[0]] = [] # rdflib does not respect order with RDF.li items yet, see comment above # support both RDF.li and rdf:_nnn for now. # Remove rdf:_nnn once rdflib respects RDF.li order in an RDF.Seq obj. 
- values = list(self.g.objects(subject=elems[0], predicate=RDF.li)) + values = list(self.graph.objects(subject=elems[0], predicate=RDF.li)) if values: - for v in values: - prop_attrs[attr[0]].append(v.toPython()) + for curr_val in values: + prop_attrs[attr[0]].append(curr_val.toPython()) else: # rdf:__nnn part - valseq = Seq(graph=self.g, subject=elems[0]) + valseq = Seq(graph=self.graph, subject=elems[0]) for seqitem in valseq: prop_attrs[attr[0]].append(seqitem.toPython()) diff --git a/test/test_rdf_writer.py b/test/test_rdf_writer.py index 7edcc88e..224ac205 100644 --- a/test/test_rdf_writer.py +++ b/test/test_rdf_writer.py @@ -27,42 +27,42 @@ def setUp(self): def test_convert_to_rdf(self): w = RDFWriter([self.doc, self.doc1]) w.convert_to_rdf() - doc_subjects = w.g.subjects(predicate=RDF.type, object=URIRef(odmlns.Document)) + doc_subjects = w.graph.subjects(predicate=RDF.type, object=URIRef(odmlns.Document)) self.assertEqual(len(list(doc_subjects)), 2) def test_adding_doc_to_the_hub(self): w = RDFWriter([self.doc]) w.convert_to_rdf() - hub_hasDocument = w.g.objects(subject=w.hub_root, predicate=odmlns.hasDocument) + hub_hasDocument = w.graph.objects(subject=w.hub_root, predicate=odmlns.hasDocument) self.assertEqual(len(list(hub_hasDocument)), 1) def test_adding_repository(self): w = RDFWriter([self.doc]) w.convert_to_rdf() - self.assertEqual(len(list(w.g.objects(subject=w.hub_root, predicate=odmlns.hasTerminology))), 0) - self.assertEqual(len(list(w.g.objects(subject=URIRef(odmlns + w.docs[0].id), predicate=odmlns.hasTerminology))), 0) + self.assertEqual(len(list(w.graph.objects(subject=w.hub_root, predicate=odmlns.hasTerminology))), 0) + self.assertEqual(len(list(w.graph.objects(subject=URIRef(odmlns + w.docs[0].id), predicate=odmlns.hasTerminology))), 0) url = "terminology_url" self.doc.repository = url w = RDFWriter([self.doc]) w.convert_to_rdf() - self.assertEqual(len(list(w.g.subjects(predicate=RDF.type, object=URIRef(url)))), 1) - 
self.assertEqual(len(list(w.g.objects(subject=w.hub_root, predicate=odmlns.hasTerminology))), 1) - self.assertEqual(len(list(w.g.objects(subject=URIRef(odmlns + w.docs[0].id), predicate=odmlns.hasTerminology))), 1) + self.assertEqual(len(list(w.graph.subjects(predicate=RDF.type, object=URIRef(url)))), 1) + self.assertEqual(len(list(w.graph.objects(subject=w.hub_root, predicate=odmlns.hasTerminology))), 1) + self.assertEqual(len(list(w.graph.objects(subject=URIRef(odmlns + w.docs[0].id), predicate=odmlns.hasTerminology))), 1) def test_adding_sections(self): doc = odml.Document() w = RDFWriter([doc]) w.convert_to_rdf() - self.assertEqual(len(list(w.g.subject_objects(predicate=odmlns.hasSection))), 0) + self.assertEqual(len(list(w.graph.subject_objects(predicate=odmlns.hasSection))), 0) w = RDFWriter([self.doc]) w.convert_to_rdf() - self.assertEqual(len(list(w.g.subject_objects(predicate=odmlns.hasSection))), 9) + self.assertEqual(len(list(w.graph.subject_objects(predicate=odmlns.hasSection))), 9) w = RDFWriter([self.doc, self.doc1]) w.convert_to_rdf() - self.assertEqual(len(list(w.g.subject_objects(predicate=odmlns.hasSection))), 18) + self.assertEqual(len(list(w.graph.subject_objects(predicate=odmlns.hasSection))), 18) def test_adding_properties(self): doc = parse(""" @@ -72,15 +72,15 @@ def test_adding_properties(self): """) w = RDFWriter([doc]) w.convert_to_rdf() - self.assertEqual(len(list(w.g.subject_objects(predicate=odmlns.hasProperty))), 0) + self.assertEqual(len(list(w.graph.subject_objects(predicate=odmlns.hasProperty))), 0) w = RDFWriter([self.doc]) w.convert_to_rdf() - self.assertEqual(len(list(w.g.subject_objects(predicate=odmlns.hasProperty))), 12) + self.assertEqual(len(list(w.graph.subject_objects(predicate=odmlns.hasProperty))), 12) w = RDFWriter([self.doc, self.doc1]) w.convert_to_rdf() - self.assertEqual(len(list(w.g.subject_objects(predicate=odmlns.hasProperty))), 24) + 
self.assertEqual(len(list(w.graph.subject_objects(predicate=odmlns.hasProperty))), 24) def test_adding_values(self): doc = parse(""" @@ -89,9 +89,9 @@ def test_adding_values(self): w = RDFWriter([doc]) w.convert_to_rdf() - self.assertEqual(len(list(w.g.subject_objects(predicate=RDF.li))), 0) + self.assertEqual(len(list(w.graph.subject_objects(predicate=RDF.li))), 0) self.assertEqual(len(list( - w.g.subject_objects(predicate=URIRef("%s_1" % str(RDF))))), 0) + w.graph.subject_objects(predicate=URIRef("%s_1" % str(RDF))))), 0) doc = parse(""" s1[t1] @@ -100,21 +100,21 @@ def test_adding_values(self): w = RDFWriter([doc]) w.convert_to_rdf() - self.assertEqual(len(list(w.g.subjects(predicate=RDF.li, + self.assertEqual(len(list(w.graph.subjects(predicate=RDF.li, object=Literal("val")))), 0) - self.assertEqual(len(list(w.g.subjects(predicate=URIRef("%s_1" % str(RDF)), + self.assertEqual(len(list(w.graph.subjects(predicate=URIRef("%s_1" % str(RDF)), object=Literal("val")))), 1) doc.sections[0].properties[0].append("val2") w = RDFWriter([doc]) w.convert_to_rdf() - self.assertEqual(len(list(w.g.subject_objects(predicate=RDF.li))), 0) - self.assertEqual(len(list(w.g.subjects(predicate=RDF.li, object=Literal("val")))), 0) - self.assertEqual(len(list(w.g.subjects(predicate=RDF.li, object=Literal("val2")))), 0) + self.assertEqual(len(list(w.graph.subject_objects(predicate=RDF.li))), 0) + self.assertEqual(len(list(w.graph.subjects(predicate=RDF.li, object=Literal("val")))), 0) + self.assertEqual(len(list(w.graph.subjects(predicate=RDF.li, object=Literal("val2")))), 0) - self.assertEqual(len(list(w.g.subjects(predicate=URIRef("%s_1" % str(RDF)), + self.assertEqual(len(list(w.graph.subjects(predicate=URIRef("%s_1" % str(RDF)), object=Literal("val")))), 1) - self.assertEqual(len(list(w.g.subjects(predicate=URIRef("%s_2" % str(RDF)), + self.assertEqual(len(list(w.graph.subjects(predicate=URIRef("%s_2" % str(RDF)), object=Literal("val2")))), 1) doc = parse(""" @@ -127,8 +127,8 @@ def 
test_adding_values(self): w = RDFWriter([doc]) w.convert_to_rdf() - self.assertEqual(len(list(w.g.subjects(predicate=RDF.li, object=Literal("val")))), 0) - self.assertEqual(len(list(w.g.subjects(predicate=URIRef("%s_1" % str(RDF)), + self.assertEqual(len(list(w.graph.subjects(predicate=RDF.li, object=Literal("val")))), 0) + self.assertEqual(len(list(w.graph.subjects(predicate=URIRef("%s_1" % str(RDF)), object=Literal("val")))), 3) def test_section_subclass(self): @@ -142,8 +142,8 @@ def test_section_subclass(self): doc.append(s) w = RDFWriter(doc) w.convert_to_rdf() - self.assertEqual(len(list(w.g.subjects(predicate=RDF.type, object=URIRef(odmlns[subclass[subclass_key]])))), 1) - self.assertEqual(len(list(w.g.subjects(predicate=RDF.type, object=URIRef(odmlns.Section)))), 0) + self.assertEqual(len(list(w.graph.subjects(predicate=RDF.type, object=URIRef(odmlns[subclass[subclass_key]])))), 1) + self.assertEqual(len(list(w.graph.subjects(predicate=RDF.type, object=URIRef(odmlns.Section)))), 0) def test_adding_other_entities_properties(self): doc = parse(""" @@ -179,22 +179,22 @@ def test_adding_other_entities_properties(self): w = RDFWriter([doc]) w.convert_to_rdf() - self.assertEqual(len(list(w.g.subjects(predicate=odmlns.hasDocVersion, object=Literal(version)))), 1) - self.assertEqual(len(list(w.g.subjects(predicate=odmlns.hasDate, object=Literal(date, datatype=XSD.date)))), 1) - self.assertEqual(len(list(w.g.subjects(predicate=odmlns.hasAuthor, object=Literal(author)))), 1) - - self.assertEqual(len(list(w.g.subjects(predicate=odmlns.hasName, object=Literal("s1")))), 1) - self.assertEqual(len(list(w.g.subjects(predicate=odmlns.hasType, object=Literal("t1")))), 1) - self.assertEqual(len(list(w.g.subjects(predicate=odmlns.hasDefinition, object=Literal(s_def)))), 1) - self.assertEqual(len(list(w.g.subjects(predicate=odmlns.hasReference, object=Literal(s_ref)))), 1) - - self.assertEqual(len(list(w.g.subjects(predicate=odmlns.hasName, object=Literal(p_name)))), 1) - 
self.assertEqual(len(list(w.g.subjects(predicate=odmlns.hasUnit, object=Literal(p_unit)))), 1) - self.assertEqual(len(list(w.g.subjects(predicate=odmlns.hasDefinition, object=Literal(p_def)))), 1) - self.assertEqual(len(list(w.g.subjects(predicate=odmlns.hasUncertainty, object=Literal(p_uncertainty)))), 1) - self.assertEqual(len(list(w.g.subjects(predicate=odmlns.hasDtype, object=Literal(p_dtype)))), 1) - self.assertEqual(len(list(w.g.subjects(predicate=odmlns.hasValueOrigin, object=Literal(p_value_origin)))), 1) - self.assertEqual(len(list(w.g.subjects(predicate=odmlns.hasReference, object=Literal(p_ref)))), 1) + self.assertEqual(len(list(w.graph.subjects(predicate=odmlns.hasDocVersion, object=Literal(version)))), 1) + self.assertEqual(len(list(w.graph.subjects(predicate=odmlns.hasDate, object=Literal(date, datatype=XSD.date)))), 1) + self.assertEqual(len(list(w.graph.subjects(predicate=odmlns.hasAuthor, object=Literal(author)))), 1) + + self.assertEqual(len(list(w.graph.subjects(predicate=odmlns.hasName, object=Literal("s1")))), 1) + self.assertEqual(len(list(w.graph.subjects(predicate=odmlns.hasType, object=Literal("t1")))), 1) + self.assertEqual(len(list(w.graph.subjects(predicate=odmlns.hasDefinition, object=Literal(s_def)))), 1) + self.assertEqual(len(list(w.graph.subjects(predicate=odmlns.hasReference, object=Literal(s_ref)))), 1) + + self.assertEqual(len(list(w.graph.subjects(predicate=odmlns.hasName, object=Literal(p_name)))), 1) + self.assertEqual(len(list(w.graph.subjects(predicate=odmlns.hasUnit, object=Literal(p_unit)))), 1) + self.assertEqual(len(list(w.graph.subjects(predicate=odmlns.hasDefinition, object=Literal(p_def)))), 1) + self.assertEqual(len(list(w.graph.subjects(predicate=odmlns.hasUncertainty, object=Literal(p_uncertainty)))), 1) + self.assertEqual(len(list(w.graph.subjects(predicate=odmlns.hasDtype, object=Literal(p_dtype)))), 1) + self.assertEqual(len(list(w.graph.subjects(predicate=odmlns.hasValueOrigin, 
object=Literal(p_value_origin)))), 1) + self.assertEqual(len(list(w.graph.subjects(predicate=odmlns.hasReference, object=Literal(p_ref)))), 1) def test_get_rdf_string(self): w = RDFWriter([self.doc1]) From 5b15212bc7d3b6cc5bd3c7494414ecd93f082e54 Mon Sep 17 00:00:00 2001 From: "M. Sonntag" Date: Mon, 4 Nov 2019 16:37:39 +0100 Subject: [PATCH 05/26] [tools/rdfconverter] Cleanup imports order --- odml/tools/rdf_converter.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/odml/tools/rdf_converter.py b/odml/tools/rdf_converter.py index cbec2544..45f55972 100644 --- a/odml/tools/rdf_converter.py +++ b/odml/tools/rdf_converter.py @@ -1,19 +1,19 @@ import os import uuid -import yaml from io import StringIO from rdflib import Graph, Literal, URIRef from rdflib.graph import Seq from rdflib.namespace import XSD, RDF +import yaml import odml from ..doc import BaseDocument from ..format import Format, Document, Section, Property +from ..info import FORMAT_VERSION from .dict_parser import DictReader from .parser_utils import ParserException -from ..info import FORMAT_VERSION from .utils import RDFConversionFormats try: From c9a5fd0a2621090c441977ca79bbe1845621cc9f Mon Sep 17 00:00:00 2001 From: "M. 
Sonntag" Date: Mon, 4 Nov 2019 16:43:39 +0100 Subject: [PATCH 06/26] [tools/rdfwriter] Sanitize document check --- odml/tools/rdf_converter.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/odml/tools/rdf_converter.py b/odml/tools/rdf_converter.py index 45f55972..66895470 100644 --- a/odml/tools/rdf_converter.py +++ b/odml/tools/rdf_converter.py @@ -62,7 +62,10 @@ def __init__(self, odml_documents): """ :param odml_documents: list of odml documents """ - self.docs = odml_documents if not isinstance(odml_documents, BaseDocument) else [odml_documents] + if not isinstance(odml_documents, list): + odml_documents = [odml_documents] + + self.docs = odml_documents self.hub_root = None self.graph = Graph() self.graph.bind("odml", ODML_NS) @@ -73,7 +76,9 @@ def convert_to_rdf(self): self.hub_root = URIRef(ODML_NS.Hub) if self.docs: for doc in self.docs: - self.save_element(doc) + if isinstance(doc, BaseDocument): + self.save_element(doc) + return self.graph def save_element(self, odml_elem, node=None): From b8d24c872e3dc3e92354936eb83f605bd26b3744 Mon Sep 17 00:00:00 2001 From: "M. 
Sonntag" Date: Mon, 4 Nov 2019 16:58:14 +0100 Subject: [PATCH 07/26] [tools/rdfwriter] Simplify conditionals --- odml/tools/rdf_converter.py | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/odml/tools/rdf_converter.py b/odml/tools/rdf_converter.py index 66895470..c5737382 100644 --- a/odml/tools/rdf_converter.py +++ b/odml/tools/rdf_converter.py @@ -90,6 +90,10 @@ def save_element(self, odml_elem, node=None): """ fmt = odml_elem.format() + is_doc = isinstance(fmt, Document.__class__) + is_sec = isinstance(fmt, Section.__class__) + is_prop = isinstance(fmt, Property.__class__) + if not node: curr_node = URIRef(ODML_NS + unicode(odml_elem.id)) else: @@ -103,7 +107,7 @@ def save_element(self, odml_elem, node=None): self.graph.add((curr_node, RDF.type, URIRef(fmt.rdf_type))) # adding doc to the hub - if isinstance(fmt, Document.__class__): + if is_doc: self.graph.add((self.hub_root, ODML_NS.hasDocument, curr_node)) # If available add the documents filename to the document node @@ -114,8 +118,7 @@ def save_element(self, odml_elem, node=None): for k in fmt.rdf_map_keys: if k == 'id': continue - elif (isinstance(fmt, Document.__class__) or - isinstance(fmt, Section.__class__)) and k == "repository": + elif (is_doc or is_sec) and k == "repository": terminology_url = getattr(odml_elem, k) if terminology_url is None or not terminology_url: continue @@ -129,23 +132,19 @@ def save_element(self, odml_elem, node=None): self.graph.add((self.hub_root, ODML_NS.hasTerminology, node)) self.graph.add((curr_node, fmt.rdf_map(k), node)) # generating nodes for entities: sections, properties and bags of values - elif (isinstance(fmt, Document.__class__) or - isinstance(fmt, Section.__class__)) and \ - k == 'sections' and getattr(odml_elem, k): + elif (is_doc or is_sec) and k == 'sections' and getattr(odml_elem, k): sections = getattr(odml_elem, k) for curr_sec in sections: node = URIRef(ODML_NS + unicode(curr_sec.id)) self.graph.add((curr_node, 
fmt.rdf_map(k), node)) self.save_element(curr_sec, node) - elif isinstance(fmt, Section.__class__) and \ - k == 'properties' and getattr(odml_elem, k): + elif is_sec and k == 'properties' and getattr(odml_elem, k): properties = getattr(odml_elem, k) for curr_prop in properties: node = URIRef(ODML_NS + unicode(curr_prop.id)) self.graph.add((curr_node, fmt.rdf_map(k), node)) self.save_element(curr_prop, node) - elif isinstance(fmt, Property.__class__) and \ - k == 'value' and getattr(odml_elem, fmt.map(k)): + elif is_prop and k == 'value' and getattr(odml_elem, fmt.map(k)): # "value" needs to be mapped to its appropriate # Property library attribute. values = getattr(odml_elem, fmt.map(k)) @@ -191,8 +190,8 @@ def _get_section_subclass(self, elem): sec_type = getattr(elem, "type") if sec_type and sec_type in self.section_subclasses: return ODML_NS[self.section_subclasses[sec_type]] - else: - return None + + return None def __str__(self): return self.convert_to_rdf().serialize(format='turtle').decode("utf-8") From 02f16789ad2b1b6b86d9000fa010384d9dbec180 Mon Sep 17 00:00:00 2001 From: "M. Sonntag" Date: Mon, 4 Nov 2019 17:14:59 +0100 Subject: [PATCH 08/26] [tools/rdfconverter] Cleanup line lengths --- odml/tools/rdf_converter.py | 41 +++++++++++++++++++++---------------- 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/odml/tools/rdf_converter.py b/odml/tools/rdf_converter.py index c5737382..2a631ef4 100644 --- a/odml/tools/rdf_converter.py +++ b/odml/tools/rdf_converter.py @@ -113,7 +113,8 @@ def save_element(self, odml_elem, node=None): # If available add the documents filename to the document node # so we can identify where the data came from. 
if hasattr(odml_elem, "_origin_file_name"): - self.graph.add((curr_node, ODML_NS.hasFileName, Literal(odml_elem._origin_file_name))) + curr_lit = Literal(odml_elem._origin_file_name) + self.graph.add((curr_node, ODML_NS.hasFileName, curr_lit)) for k in fmt.rdf_map_keys: if k == 'id': @@ -152,10 +153,9 @@ def save_element(self, odml_elem, node=None): self.graph.add((seq, RDF.type, RDF.Seq)) self.graph.add((curr_node, fmt.rdf_map(k), seq)) - # rdflib so far does not respect RDF:li item order - # in RDF:Seq on loading so we have to use custom - # numbered Node elements for now. Once rdflib upgrades - # this should be reversed to RDF:li again! + # rdflib so far does not respect RDF:li item order in RDF:Seq on + # loading so we have to use custom numbered Node elements for now. + # Once rdflib upgrades this should be reversed to RDF:li again! # see https://github.com/RDFLib/rdflib/issues/280 # -- keep until supported # bag = URIRef(ODML_NS + unicode(uuid.uuid4())) @@ -175,7 +175,8 @@ def save_element(self, odml_elem, node=None): if val is None or not val: continue elif k == 'date': - self.graph.add((curr_node, fmt.rdf_map(k), Literal(val, datatype=XSD.date))) + curr_lit = Literal(val, datatype=XSD.date) + self.graph.add((curr_node, fmt.rdf_map(k), curr_lit)) else: self.graph.add((curr_node, fmt.rdf_map(k), Literal(val))) return self.graph @@ -185,7 +186,8 @@ def _get_terminology_by_value(self, url): def _get_section_subclass(self, elem): """ - :return: RDF identifier of section subclass type if present in section_subclasses dict + :return: RDF identifier of section subclass type if present + in section_subclasses dict. 
""" sec_type = getattr(elem, "type") if sec_type and sec_type in self.section_subclasses: @@ -202,15 +204,17 @@ def __unicode__(self): def get_rdf_str(self, rdf_format="turtle"): """ Get converted into one of the supported formats data - :param rdf_format: possible formats: 'xml', 'n3', 'turtle', - 'nt', 'pretty-xml', 'trix', - 'trig', 'nquads', 'json-ld'. + + :param rdf_format: possible formats: 'xml', 'n3', 'turtle', 'nt', 'pretty-xml', + 'trix', 'trig', 'nquads', 'json-ld'. Full lists see in utils.RDFConversionFormats :return: string object """ if rdf_format not in RDFConversionFormats: - raise ValueError("odml.RDFWriter.get_rdf_str: Format for output files is incorrect. " - "Please choose from the list: {}".format(list(RDFConversionFormats))) + msg = "odml.RDFWriter.get_rdf_str: Format for output files is incorrect." + msg = "%s Please choose from the list: %s" % (msg, list(RDFConversionFormats)) + raise ValueError(msg) + return self.convert_to_rdf().serialize(format=rdf_format).decode("utf-8") def write_file(self, filename, rdf_format="turtle"): @@ -319,9 +323,9 @@ def parse_property(self, prop_uri): prop_attrs[attr[0]].append(curr_val.toPython()) else: # rdf:__nnn part - valseq = Seq(graph=self.graph, subject=elems[0]) - for seqitem in valseq: - prop_attrs[attr[0]].append(seqitem.toPython()) + val_seq = Seq(graph=self.graph, subject=elems[0]) + for seq_item in val_seq: + prop_attrs[attr[0]].append(seq_item.toPython()) elif attr[0] == "id": prop_attrs[attr[0]] = prop_uri.split("#", 1)[1] @@ -333,7 +337,8 @@ def parse_property(self, prop_uri): def _check_mandatory_attrs(self, attrs): if "name" not in attrs: + msg = "Entity missing required 'name' attribute" if "id" in attrs: - raise ParserException("Entity with id: %s does not have required \"name\" attribute" % attrs["id"]) - else: - raise ParserException("Some entities does not have required \"name\" attribute") + msg = "%s id:'%s'" % (msg, attrs["id"]) + + raise ParserException(msg) From 
c5fd4508076b3f57b76827ab226f504901ce46c8 Mon Sep 17 00:00:00 2001 From: "M. Sonntag" Date: Mon, 4 Nov 2019 17:16:37 +0100 Subject: [PATCH 09/26] [tools/rdfconverter] Make method static --- odml/tools/rdf_converter.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/odml/tools/rdf_converter.py b/odml/tools/rdf_converter.py index 2a631ef4..0192ef23 100644 --- a/odml/tools/rdf_converter.py +++ b/odml/tools/rdf_converter.py @@ -335,7 +335,8 @@ def parse_property(self, prop_uri): self._check_mandatory_attrs(prop_attrs) return prop_attrs - def _check_mandatory_attrs(self, attrs): + @staticmethod + def _check_mandatory_attrs(attrs): if "name" not in attrs: msg = "Entity missing required 'name' attribute" if "id" in attrs: From 5470f1336851db038fee2d189d6275da407b227b Mon Sep 17 00:00:00 2001 From: "M. Sonntag" Date: Mon, 4 Nov 2019 17:48:39 +0100 Subject: [PATCH 10/26] [tools/rdfreader] Simplify for loops --- odml/tools/rdf_converter.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/odml/tools/rdf_converter.py b/odml/tools/rdf_converter.py index 0192ef23..aeef8f5f 100644 --- a/odml/tools/rdf_converter.py +++ b/odml/tools/rdf_converter.py @@ -269,9 +269,8 @@ def from_string(self, file, doc_format): # TODO check mandatory attrs def parse_document(self, doc_uri): - rdf_doc = Document doc_attrs = {} - for attr in rdf_doc.rdf_map_items: + for attr in Document.rdf_map_items: elems = list(self.graph.objects(subject=doc_uri, predicate=attr[1])) if attr[0] == "sections": doc_attrs[attr[0]] = [] @@ -286,9 +285,8 @@ def parse_document(self, doc_uri): # TODO section subclass conversion def parse_section(self, sec_uri): - rdf_sec = Section sec_attrs = {} - for attr in rdf_sec.rdf_map_items: + for attr in Section.rdf_map_items: elems = list(self.graph.objects(subject=sec_uri, predicate=attr[1])) if attr[0] == "sections": sec_attrs[attr[0]] = [] @@ -307,9 +305,8 @@ def parse_section(self, sec_uri): return sec_attrs def parse_property(self, 
prop_uri): - rdf_prop = Property prop_attrs = {} - for attr in rdf_prop.rdf_map_items: + for attr in Property.rdf_map_items: elems = list(self.graph.objects(subject=prop_uri, predicate=attr[1])) if attr[0] == "value" and elems: prop_attrs[attr[0]] = [] From 971243449baf0f77962f793a19f005a6dc5586a4 Mon Sep 17 00:00:00 2001 From: "M. Sonntag" Date: Mon, 4 Nov 2019 18:31:50 +0100 Subject: [PATCH 11/26] [tools/rdfwriter] Add save_repo_node method --- odml/tools/rdf_converter.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/odml/tools/rdf_converter.py b/odml/tools/rdf_converter.py index aeef8f5f..58d72e0b 100644 --- a/odml/tools/rdf_converter.py +++ b/odml/tools/rdf_converter.py @@ -81,6 +81,28 @@ def convert_to_rdf(self): return self.graph + def save_repository_node(self, curr_node, rdf_predicate, leaf_value): + """ + Save repository adds a node with a given repository url to the + current graphs terminology node. If the current graph does not + yet contain a terminology node, it creates one and attaches + the current node to it. + + :param curr_node: current parent node in the RDF graph. + :param rdf_predicate: RDF predicate that us used to add the terminology. + to the current node. + :param leaf_value: Value that will be added to the RDF graph. + """ + terminology_node = self.graph.value(predicate=RDF.type, object=URIRef(leaf_value)) + if not terminology_node: + # adding terminology url value to the graph and linking it + # to the current RDF node. + terminology_node = URIRef(ODML_NS + unicode(uuid.uuid4())) + self.graph.add((terminology_node, RDF.type, URIRef(leaf_value))) + self.graph.add((self.hub_root, ODML_NS.hasTerminology, terminology_node)) + + self.graph.add((curr_node, rdf_predicate, terminology_node)) + def save_element(self, odml_elem, node=None): """ Save the current element to the RDF graph From c5cdd7445feb3cd3aaef831ec867cf45d5a89d28 Mon Sep 17 00:00:00 2001 From: "M. 
Sonntag" Date: Mon, 4 Nov 2019 18:32:47 +0100 Subject: [PATCH 12/26] [tools/rdfwriter] Use save_repo_node / cleanup --- odml/tools/rdf_converter.py | 20 ++++---------------- 1 file changed, 4 insertions(+), 16 deletions(-) diff --git a/odml/tools/rdf_converter.py b/odml/tools/rdf_converter.py index 58d72e0b..5988183b 100644 --- a/odml/tools/rdf_converter.py +++ b/odml/tools/rdf_converter.py @@ -141,19 +141,10 @@ def save_element(self, odml_elem, node=None): for k in fmt.rdf_map_keys: if k == 'id': continue - elif (is_doc or is_sec) and k == "repository": - terminology_url = getattr(odml_elem, k) - if terminology_url is None or not terminology_url: - continue - terminology_node = self._get_terminology_by_value(terminology_url) - if terminology_node: - self.graph.add((curr_node, fmt.rdf_map(k), terminology_node)) - else: - # adding terminology to the hub and to link with the doc - node = URIRef(ODML_NS + unicode(uuid.uuid4())) - self.graph.add((node, RDF.type, URIRef(terminology_url))) - self.graph.add((self.hub_root, ODML_NS.hasTerminology, node)) - self.graph.add((curr_node, fmt.rdf_map(k), node)) + elif (is_doc or is_sec) and k == "repository" and getattr(odml_elem, k): + self.save_repository_node(curr_node, fmt.rdf_map(k), + getattr(odml_elem, k)) + # generating nodes for entities: sections, properties and bags of values elif (is_doc or is_sec) and k == 'sections' and getattr(odml_elem, k): sections = getattr(odml_elem, k) @@ -203,9 +194,6 @@ def save_element(self, odml_elem, node=None): self.graph.add((curr_node, fmt.rdf_map(k), Literal(val))) return self.graph - def _get_terminology_by_value(self, url): - return self.graph.value(predicate=RDF.type, object=URIRef(url)) - def _get_section_subclass(self, elem): """ :return: RDF identifier of section subclass type if present From 1ac592dff2a3807a582bd3a5100e8517e6b47cb1 Mon Sep 17 00:00:00 2001 From: "M. 
Sonntag" Date: Mon, 4 Nov 2019 18:44:24 +0100 Subject: [PATCH 13/26] [tools/rdfwriter] Add save_odml_list method --- odml/tools/rdf_converter.py | 45 ++++++++++++++++++++++--------------- 1 file changed, 27 insertions(+), 18 deletions(-) diff --git a/odml/tools/rdf_converter.py b/odml/tools/rdf_converter.py index 5988183b..902c40d5 100644 --- a/odml/tools/rdf_converter.py +++ b/odml/tools/rdf_converter.py @@ -81,16 +81,31 @@ def convert_to_rdf(self): return self.graph - def save_repository_node(self, curr_node, rdf_predicate, leaf_value): + def save_odml_list(self, odml_list, parent_node, rdf_predicate): """ - Save repository adds a node with a given repository url to the - current graphs terminology node. If the current graph does not - yet contain a terminology node, it creates one and attaches + save_odml_list adds all odml elements in a list to the current + parent node and handles all child items via save_element. + + :param odml_list: list of odml entities. + :param parent_node: current parent node in the RDF graph. + :param rdf_predicate: RDF predicate used to add all odml entities + to the parent node. + """ + for curr_item in odml_list: + node = URIRef(ODML_NS + unicode(curr_item.id)) + self.graph.add((parent_node, rdf_predicate, node)) + self.save_element(curr_item, node) + + def save_repository_node(self, parent_node, rdf_predicate, leaf_value): + """ + save_repository_node adds a node with a given repository url to + the current graphs terminology node. If the current graph does + not yet contain a terminology node, it creates one and attaches the current node to it. - :param curr_node: current parent node in the RDF graph. - :param rdf_predicate: RDF predicate that us used to add the terminology. - to the current node. + :param parent_node: current parent node in the RDF graph. + :param rdf_predicate: RDF predicate used to add the terminology + to the parent node. :param leaf_value: Value that will be added to the RDF graph. 
""" terminology_node = self.graph.value(predicate=RDF.type, object=URIRef(leaf_value)) @@ -101,7 +116,7 @@ def save_repository_node(self, curr_node, rdf_predicate, leaf_value): self.graph.add((terminology_node, RDF.type, URIRef(leaf_value))) self.graph.add((self.hub_root, ODML_NS.hasTerminology, terminology_node)) - self.graph.add((curr_node, rdf_predicate, terminology_node)) + self.graph.add((parent_node, rdf_predicate, terminology_node)) def save_element(self, odml_elem, node=None): """ @@ -147,17 +162,11 @@ def save_element(self, odml_elem, node=None): # generating nodes for entities: sections, properties and bags of values elif (is_doc or is_sec) and k == 'sections' and getattr(odml_elem, k): - sections = getattr(odml_elem, k) - for curr_sec in sections: - node = URIRef(ODML_NS + unicode(curr_sec.id)) - self.graph.add((curr_node, fmt.rdf_map(k), node)) - self.save_element(curr_sec, node) + self.save_odml_list(getattr(odml_elem, k), curr_node, fmt.rdf_map(k)) + elif is_sec and k == 'properties' and getattr(odml_elem, k): - properties = getattr(odml_elem, k) - for curr_prop in properties: - node = URIRef(ODML_NS + unicode(curr_prop.id)) - self.graph.add((curr_node, fmt.rdf_map(k), node)) - self.save_element(curr_prop, node) + self.save_odml_list(getattr(odml_elem, k), curr_node, fmt.rdf_map(k)) + elif is_prop and k == 'value' and getattr(odml_elem, fmt.map(k)): # "value" needs to be mapped to its appropriate # Property library attribute. From 29b1bc75232d43d2e3d27b5757717e7150faa7e7 Mon Sep 17 00:00:00 2001 From: "M. 
Sonntag" Date: Mon, 4 Nov 2019 18:54:34 +0100 Subject: [PATCH 14/26] [tools/rdfwriter] Add save_odml_values method --- odml/tools/rdf_converter.py | 55 ++++++++++++++++++++++--------------- 1 file changed, 33 insertions(+), 22 deletions(-) diff --git a/odml/tools/rdf_converter.py b/odml/tools/rdf_converter.py index 902c40d5..40f77763 100644 --- a/odml/tools/rdf_converter.py +++ b/odml/tools/rdf_converter.py @@ -81,6 +81,36 @@ def convert_to_rdf(self): return self.graph + def save_odml_values(self, parent_node, rdf_predicate, values): + """ + save_odml_values adds an RDF seq node to the parent RDF node + and creates a value leaf node for every odml value. + + :param parent_node: current parent node in the RDF graph. + :param rdf_predicate: RDF predicate used to add the Seq node + to the current parent node. + :param values: list of odml values. + """ + seq = URIRef(ODML_NS + unicode(uuid.uuid4())) + self.graph.add((seq, RDF.type, RDF.Seq)) + self.graph.add((parent_node, rdf_predicate, seq)) + + # rdflib so far does not respect RDF:li item order in RDF:Seq on + # loading so we have to use custom numbered Node elements for now. + # Once rdflib upgrades this should be reversed to RDF:li again! 
+ # see https://github.com/RDFLib/rdflib/issues/280 + # -- keep until supported + # bag = URIRef(ODML_NS + unicode(uuid.uuid4())) + # self.graph.add((bag, RDF.type, RDF.Bag)) + # self.graph.add((curr_node, fmt.rdf_map(k), bag)) + # for curr_val in values: + # self.graph.add((bag, RDF.li, Literal(curr_val))) + counter = 1 + for curr_val in values: + custom_predicate = "%s_%s" % (unicode(RDF), counter) + self.graph.add((seq, URIRef(custom_predicate), Literal(curr_val))) + counter = counter + 1 + def save_odml_list(self, odml_list, parent_node, rdf_predicate): """ save_odml_list adds all odml elements in a list to the current @@ -168,28 +198,9 @@ def save_element(self, odml_elem, node=None): self.save_odml_list(getattr(odml_elem, k), curr_node, fmt.rdf_map(k)) elif is_prop and k == 'value' and getattr(odml_elem, fmt.map(k)): - # "value" needs to be mapped to its appropriate - # Property library attribute. - values = getattr(odml_elem, fmt.map(k)) - seq = URIRef(ODML_NS + unicode(uuid.uuid4())) - self.graph.add((seq, RDF.type, RDF.Seq)) - self.graph.add((curr_node, fmt.rdf_map(k), seq)) - - # rdflib so far does not respect RDF:li item order in RDF:Seq on - # loading so we have to use custom numbered Node elements for now. - # Once rdflib upgrades this should be reversed to RDF:li again! - # see https://github.com/RDFLib/rdflib/issues/280 - # -- keep until supported - # bag = URIRef(ODML_NS + unicode(uuid.uuid4())) - # self.graph.add((bag, RDF.type, RDF.Bag)) - # self.graph.add((curr_node, fmt.rdf_map(k), bag)) - # for curr_val in values: - # self.graph.add((bag, RDF.li, Literal(curr_val))) - counter = 1 - for curr_val in values: - pred = "%s_%s" % (unicode(RDF), counter) - self.graph.add((seq, URIRef(pred), Literal(curr_val))) - counter = counter + 1 + # 'value' needs to be mapped to its appropriate odml Property attribute. 
+ self.save_odml_values(curr_node, fmt.rdf_map(k), + getattr(odml_elem, fmt.map(k))) # adding entities' properties else: From 9a615c049c2b568cd7afe79a55c5cb1e08dbd1a4 Mon Sep 17 00:00:00 2001 From: "M. Sonntag" Date: Mon, 4 Nov 2019 19:28:26 +0100 Subject: [PATCH 15/26] [tools/rdfwriter] Refactor save element method --- odml/tools/rdf_converter.py | 72 ++++++++++++++++++------------------- 1 file changed, 36 insertions(+), 36 deletions(-) diff --git a/odml/tools/rdf_converter.py b/odml/tools/rdf_converter.py index 40f77763..0737d8f6 100644 --- a/odml/tools/rdf_converter.py +++ b/odml/tools/rdf_converter.py @@ -150,69 +150,69 @@ def save_repository_node(self, parent_node, rdf_predicate, leaf_value): def save_element(self, odml_elem, node=None): """ - Save the current element to the RDF graph - :param odml_elem: current element - :param node: A node to pass the earlier created node to inner elements - :return: the RDF graph + Save the current odml element to the RDF graph and handle all child + elements of the current odml element recursively. + + :param odml_elem: An odml element that should be added to the RDF graph. + :param node: An RDF node that is used to append the current odml element + to the RDF graph. If None, a new node will be created and + added to the 'Hub' node of the RDF graph. 
""" fmt = odml_elem.format() - is_doc = isinstance(fmt, Document.__class__) - is_sec = isinstance(fmt, Section.__class__) - is_prop = isinstance(fmt, Property.__class__) + is_doc = fmt.name == Document.name + is_sec = fmt.name == Section.name + is_prop = fmt.name == Property.name - if not node: + curr_node = node + if not curr_node: curr_node = URIRef(ODML_NS + unicode(odml_elem.id)) - else: - curr_node = node - if fmt.name == "section": + # Add type of current node to the RDF graph + curr_type = fmt.rdf_type + # Handle section subclass types + if is_sec: sub_sec = self._get_section_subclass(odml_elem) - sec_type = sub_sec if sub_sec else fmt.rdf_type - self.graph.add((curr_node, RDF.type, URIRef(sec_type))) - else: - self.graph.add((curr_node, RDF.type, URIRef(fmt.rdf_type))) + if sub_sec: + curr_type = sub_sec + self.graph.add((curr_node, RDF.type, URIRef(curr_type))) - # adding doc to the hub + # Add a new document to the RDF Hub node if is_doc: self.graph.add((self.hub_root, ODML_NS.hasDocument, curr_node)) - # If available add the documents filename to the document node + # If available, add the documents' filename to the document node # so we can identify where the data came from. 
if hasattr(odml_elem, "_origin_file_name"): curr_lit = Literal(odml_elem._origin_file_name) self.graph.add((curr_node, ODML_NS.hasFileName, curr_lit)) for k in fmt.rdf_map_keys: - if k == 'id': + if k == "id" or \ + (k == "value" and not getattr(odml_elem, fmt.map(k))) or \ + (not getattr(odml_elem, k)): continue - elif (is_doc or is_sec) and k == "repository" and getattr(odml_elem, k): + + if (is_doc or is_sec) and k == "repository": self.save_repository_node(curr_node, fmt.rdf_map(k), getattr(odml_elem, k)) - # generating nodes for entities: sections, properties and bags of values - elif (is_doc or is_sec) and k == 'sections' and getattr(odml_elem, k): - self.save_odml_list(getattr(odml_elem, k), curr_node, fmt.rdf_map(k)) - - elif is_sec and k == 'properties' and getattr(odml_elem, k): + # generating nodes for sections and properties + elif ((is_doc or is_sec) and k == "sections") or \ + (is_sec and k == "properties"): self.save_odml_list(getattr(odml_elem, k), curr_node, fmt.rdf_map(k)) - elif is_prop and k == 'value' and getattr(odml_elem, fmt.map(k)): + # generating nodes for Property values + elif is_prop and k == "value": # 'value' needs to be mapped to its appropriate odml Property attribute. self.save_odml_values(curr_node, fmt.rdf_map(k), getattr(odml_elem, fmt.map(k))) - - # adding entities' properties + elif k == "date": + curr_lit = Literal(getattr(odml_elem, k), datatype=XSD.date) + self.graph.add((curr_node, fmt.rdf_map(k), curr_lit)) else: - val = getattr(odml_elem, k) - if val is None or not val: - continue - elif k == 'date': - curr_lit = Literal(val, datatype=XSD.date) - self.graph.add((curr_node, fmt.rdf_map(k), curr_lit)) - else: - self.graph.add((curr_node, fmt.rdf_map(k), Literal(val))) - return self.graph + curr_lit = Literal(getattr(odml_elem, k)) + self.graph.add((curr_node, fmt.rdf_map(k), curr_lit)) def _get_section_subclass(self, elem): """ From 1408dcf1761c8a9c2155f5c9dc2654bb6a4e701e Mon Sep 17 00:00:00 2001 From: "M. 
Sonntag" Date: Mon, 4 Nov 2019 19:50:59 +0100 Subject: [PATCH 16/26] [tools/rdfwriter] Fix deprecated property use --- odml/tools/rdf_converter.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/odml/tools/rdf_converter.py b/odml/tools/rdf_converter.py index 0737d8f6..457aa779 100644 --- a/odml/tools/rdf_converter.py +++ b/odml/tools/rdf_converter.py @@ -7,11 +7,11 @@ from rdflib.namespace import XSD, RDF import yaml -import odml from ..doc import BaseDocument from ..format import Format, Document, Section, Property from ..info import FORMAT_VERSION +from ..resources import RDF_SUBCLASS_FILE from .dict_parser import DictReader from .parser_utils import ParserException from .utils import RDFConversionFormats @@ -34,13 +34,11 @@ def load_rdf_subclasses(): """ section_subclasses = {} - subclass_file = os.path.join(odml.__path__[0], 'resources', 'section_subclasses.yaml') - - if not os.path.isfile(subclass_file): - print("[Warning] Could not find subclass file '%s'" % subclass_file) + if not os.path.isfile(RDF_SUBCLASS_FILE): + print("[Warning] Could not find subclass file '%s'" % RDF_SUBCLASS_FILE) return section_subclasses - with open(subclass_file, "r") as yaml_file: + with open(RDF_SUBCLASS_FILE, "r") as yaml_file: try: section_subclasses = yaml.load(yaml_file) except yaml.parser.ParserError as err: @@ -188,9 +186,11 @@ def save_element(self, odml_elem, node=None): self.graph.add((curr_node, ODML_NS.hasFileName, curr_lit)) for k in fmt.rdf_map_keys: - if k == "id" or \ - (k == "value" and not getattr(odml_elem, fmt.map(k))) or \ - (not getattr(odml_elem, k)): + # Ignore "id" and empty values, but make sure the content of "value" + # is only accessed via its non deprecated property "values". 
+ if k == "id" or k == "value" and not getattr(odml_elem, fmt.map(k)): + continue + elif k != "value" and not getattr(odml_elem, k): continue if (is_doc or is_sec) and k == "repository": @@ -207,9 +207,11 @@ def save_element(self, odml_elem, node=None): # 'value' needs to be mapped to its appropriate odml Property attribute. self.save_odml_values(curr_node, fmt.rdf_map(k), getattr(odml_elem, fmt.map(k))) + elif k == "date": curr_lit = Literal(getattr(odml_elem, k), datatype=XSD.date) self.graph.add((curr_node, fmt.rdf_map(k), curr_lit)) + else: curr_lit = Literal(getattr(odml_elem, k)) self.graph.add((curr_node, fmt.rdf_map(k), curr_lit)) From d4621551aa9a61340eef7ff8405843fc0faafccb Mon Sep 17 00:00:00 2001 From: "M. Sonntag" Date: Mon, 4 Nov 2019 20:44:31 +0100 Subject: [PATCH 17/26] [info] Install path as constant --- odml/info.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/odml/info.py b/odml/info.py index 210d06b1..73bfc061 100644 --- a/odml/info.py +++ b/odml/info.py @@ -1,9 +1,9 @@ import os import json -here = os.path.dirname(__file__) +INSTALL_PATH = os.path.dirname(__file__) -with open(os.path.join(here, "info.json")) as infofile: +with open(os.path.join(INSTALL_PATH, "info.json")) as infofile: infodict = json.load(infofile) VERSION = infodict["VERSION"] From bb773266c41c7c372ee92b6bbcddab647baa48c5 Mon Sep 17 00:00:00 2001 From: "M. 
Sonntag" Date: Mon, 4 Nov 2019 20:44:45 +0100 Subject: [PATCH 18/26] [tools/rdfconverter] Use install path --- odml/tools/rdf_converter.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/odml/tools/rdf_converter.py b/odml/tools/rdf_converter.py index 457aa779..ed3a1dd9 100644 --- a/odml/tools/rdf_converter.py +++ b/odml/tools/rdf_converter.py @@ -10,8 +10,7 @@ from ..doc import BaseDocument from ..format import Format, Document, Section, Property -from ..info import FORMAT_VERSION -from ..resources import RDF_SUBCLASS_FILE +from ..info import FORMAT_VERSION, INSTALL_PATH from .dict_parser import DictReader from .parser_utils import ParserException from .utils import RDFConversionFormats @@ -34,11 +33,13 @@ def load_rdf_subclasses(): """ section_subclasses = {} - if not os.path.isfile(RDF_SUBCLASS_FILE): - print("[Warning] Could not find subclass file '%s'" % RDF_SUBCLASS_FILE) + subclass_file = os.path.join(INSTALL_PATH, "resources", "section_subclasses.yaml") + + if not os.path.isfile(subclass_file): + print("[Warning] Could not find subclass file '%s'" % subclass_file) return section_subclasses - with open(RDF_SUBCLASS_FILE, "r") as yaml_file: + with open(subclass_file, "r") as yaml_file: try: section_subclasses = yaml.load(yaml_file) except yaml.parser.ParserError as err: From 4df36af844071871983df6e3d00d0c6af0930efb Mon Sep 17 00:00:00 2001 From: "M. Sonntag" Date: Mon, 4 Nov 2019 21:08:13 +0100 Subject: [PATCH 19/26] [tools/rdfconverter] Add/update docstrings --- odml/tools/rdf_converter.py | 96 ++++++++++++++++++++++++++++++++----- 1 file changed, 85 insertions(+), 11 deletions(-) diff --git a/odml/tools/rdf_converter.py b/odml/tools/rdf_converter.py index ed3a1dd9..e6b30826 100644 --- a/odml/tools/rdf_converter.py +++ b/odml/tools/rdf_converter.py @@ -1,3 +1,8 @@ +""" +The RDF converter module provides conversion of odML documents to RDF and +the conversion of odML flavored RDF to odML documents. 
+""" + import os import uuid @@ -59,7 +64,7 @@ class RDFWriter(object): def __init__(self, odml_documents): """ - :param odml_documents: list of odml documents + :param odml_documents: list of odML documents """ if not isinstance(odml_documents, list): odml_documents = [odml_documents] @@ -72,6 +77,13 @@ def __init__(self, odml_documents): self.section_subclasses = load_rdf_subclasses() def convert_to_rdf(self): + """ + convert_to_rdf converts all odML documents to RDF, + connects them via a common "Hub" RDF node and + returns the created RDF graph. + + :return: An RDF graph. + """ self.hub_root = URIRef(ODML_NS.Hub) if self.docs: for doc in self.docs: @@ -219,8 +231,12 @@ def save_element(self, odml_elem, node=None): def _get_section_subclass(self, elem): """ + _get_section_subclass checks whether the current odML element + is of a type that can be converted into an RDF subclass of + class Section. + :return: RDF identifier of section subclass type if present - in section_subclasses dict. + in the section_subclasses dict. """ sec_type = getattr(elem, "type") if sec_type and sec_type in self.section_subclasses: @@ -236,11 +252,13 @@ def __unicode__(self): def get_rdf_str(self, rdf_format="turtle"): """ - Get converted into one of the supported formats data + Convert the current odML content of the parser to a common RDF graph + and return the graph as a string object in the specified RDF format. :param rdf_format: possible formats: 'xml', 'n3', 'turtle', 'nt', 'pretty-xml', 'trix', 'trig', 'nquads', 'json-ld'. 
Full lists see in utils.RDFConversionFormats + :return: string object """ if rdf_format not in RDFConversionFormats: @@ -251,6 +269,16 @@ def get_rdf_str(self, rdf_format="turtle"): return self.convert_to_rdf().serialize(format=rdf_format).decode("utf-8") def write_file(self, filename, rdf_format="turtle"): + """ + Convert the current odML content of the parser to a common RDF graph + and write the resulting graph to an output file using the provided + RDF output format. + + :param filename: + :param rdf_format: Possible RDF output format. See utils.RDFConversionFormats + for a full list of supported formats. + Default format is 'turtle'. + """ data = self.get_rdf_str(rdf_format) filename_ext = filename if filename.find(RDFConversionFormats.get(rdf_format)) < 0: @@ -262,7 +290,7 @@ def write_file(self, filename, rdf_format="turtle"): class RDFReader(object): """ - A reader to parse odML RDF files or strings into odml documents. + A reader to parse odML RDF files or strings into odML documents. Usage: file = RDFReader().from_file("/path_to_input_rdf", "rdf_format") @@ -271,13 +299,20 @@ class RDFReader(object): """ def __init__(self, filename=None, doc_format=None): + """ + :param filename: Path of the input odML RDF file. + :param doc_format: RDF format of the input odML RDF file. + """ self.docs = [] # list of parsed odml docs if filename and doc_format: self.graph = Graph().parse(source=filename, format=doc_format) def to_odml(self): """ - :return: list of converter odml documents + to_odml converts all odML documents from a common RDF graph + into individual odML documents. + + :return: list of converted odML documents """ docs_uris = list(self.graph.objects(subject=URIRef(ODML_NS.Hub), predicate=ODML_NS.hasDocument)) @@ -289,19 +324,41 @@ def to_odml(self): return self.docs def from_file(self, filename, doc_format): + """ + from_file loads an odML RDF file and converts all odML documents + from this RDF graph into individual odML documents. 
+ + :param filename: Path of the input odML RDF file. + :param doc_format: RDF format of the input odML RDF file. + :return: list of converted odML documents + """ self.graph = Graph().parse(source=filename, format=doc_format) docs = self.to_odml() for curr_doc in docs: # Provide original file name via the document curr_doc._origin_file_name = os.path.basename(filename) + return docs def from_string(self, file, doc_format): + """ + from_string loads an odML RDF file or string object and converts all + odML documents from this RDF graph into individual odML documents. + + :param file: Path of the input odML RDF file or an RDF graph string object. + :param doc_format: RDF format of the input odML RDF graph. + :return: list of converted odML documents + """ self.graph = Graph().parse(source=StringIO(file), format=doc_format) return self.to_odml() - # TODO check mandatory attrs def parse_document(self, doc_uri): + """ + parse_document parses an odML RDF Document node into an odML Document. + + :param doc_uri: RDF URI of an odML Document node within an RDF graph. + :return: dict containing an odML Document + """ doc_attrs = {} for attr in Document.rdf_map_items: elems = list(self.graph.objects(subject=doc_uri, predicate=attr[1])) @@ -316,8 +373,13 @@ def parse_document(self, doc_uri): return {'Document': doc_attrs, 'odml-version': FORMAT_VERSION} - # TODO section subclass conversion def parse_section(self, sec_uri): + """ + parse_section parses an odML RDF Section node into an odML Section. + + :param sec_uri: RDF URI of an odML Section node within an RDF graph. + :return: dict containing an odML Section + """ sec_attrs = {} for attr in Section.rdf_map_items: elems = list(self.graph.objects(subject=sec_uri, predicate=attr[1])) @@ -338,6 +400,12 @@ def parse_section(self, sec_uri): return sec_attrs def parse_property(self, prop_uri): + """ + parse_property parses an odML RDF Property node into an odML Property. 
+ + :param prop_uri: RDF URI of an odML Property node within an RDF graph. + :return: dict containing an odML Property + """ prop_attrs = {} for attr in Property.rdf_map_items: elems = list(self.graph.objects(subject=prop_uri, predicate=attr[1])) @@ -366,10 +434,16 @@ def parse_property(self, prop_uri): return prop_attrs @staticmethod - def _check_mandatory_attrs(attrs): - if "name" not in attrs: + def _check_mandatory_attrs(odml_entity): + """ + _check_mandatory_attrs checks whether a passed odML entity contains + the required "name" attribute and raises a ParserException otherwise. + + :param odml_entity: dict containing an odML entity + """ + if "name" not in odml_entity: msg = "Entity missing required 'name' attribute" - if "id" in attrs: - msg = "%s id:'%s'" % (msg, attrs["id"]) + if "id" in odml_entity: + msg = "%s id:'%s'" % (msg, odml_entity["id"]) raise ParserException(msg) From 7ce94b3a2f78db553dabf3f94d1b16fb99e8bb18 Mon Sep 17 00:00:00 2001 From: "M. Sonntag" Date: Mon, 11 Nov 2019 17:31:42 +0100 Subject: [PATCH 20/26] [tools/rdfconverter] Add finalize call Call `finalize` for every document to make sure all `links` and `includes` are resolved before exporting an odml document to rdf. --- odml/tools/rdf_converter.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/odml/tools/rdf_converter.py b/odml/tools/rdf_converter.py index e6b30826..4cc5aeab 100644 --- a/odml/tools/rdf_converter.py +++ b/odml/tools/rdf_converter.py @@ -88,6 +88,8 @@ def convert_to_rdf(self): if self.docs: for doc in self.docs: if isinstance(doc, BaseDocument): + # make sure links and includes are resolved before conversion + doc.finalize() self.save_element(doc) return self.graph From 52eb531a58aa7e2e3dec42990fdd1474c2284ac6 Mon Sep 17 00:00:00 2001 From: "M.
Sonntag" Date: Tue, 12 Nov 2019 13:30:10 +0100 Subject: [PATCH 21/26] [resources] Move odml ont file and export Moving the root-ontology.ttl file from doc/odml_ontology to odml/resources/odml-ontology.ttl and include it in packaging via Manifest.in. --- MANIFEST.in | 1 + .../root-ontology.ttl => odml/resources/odml-ontology.ttl | 0 2 files changed, 1 insertion(+) rename doc/odml_ontology/root-ontology.ttl => odml/resources/odml-ontology.ttl (100%) diff --git a/MANIFEST.in b/MANIFEST.in index 056a4135..610ff1b7 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -2,3 +2,4 @@ include LICENSE include README.rst include odml/info.json include odml/resources/section_subclasses.yaml +include odml/resources/odml-ontology.ttl diff --git a/doc/odml_ontology/root-ontology.ttl b/odml/resources/odml-ontology.ttl similarity index 100% rename from doc/odml_ontology/root-ontology.ttl rename to odml/resources/odml-ontology.ttl From 123982e7a8c524a2381218a7fc7efbf18cd5b010 Mon Sep 17 00:00:00 2001 From: "M. Sonntag" Date: Wed, 13 Nov 2019 15:26:26 +0100 Subject: [PATCH 22/26] [tools/rdfwriter] Add save_doc method --- odml/tools/rdf_converter.py | 43 ++++++++++++++++++++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) diff --git a/odml/tools/rdf_converter.py b/odml/tools/rdf_converter.py index 4cc5aeab..9bcbdf86 100644 --- a/odml/tools/rdf_converter.py +++ b/odml/tools/rdf_converter.py @@ -161,11 +161,52 @@ def save_repository_node(self, parent_node, rdf_predicate, leaf_value): self.graph.add((parent_node, rdf_predicate, terminology_node)) + def save_document(self, doc, curr_node=None): + """ + Add the current odML Document to the RDF graph and handle all child + elements recursively. + + :param doc: An odml Document that should be added to the RDF graph. + :param curr_node: An RDF node that is used to append the current odml element + to the Hub node of the current RDF graph. 
+ """ + fmt = doc.format() + + if not curr_node: + curr_node = URIRef(ODML_NS + unicode(doc.id)) + + self.graph.add((curr_node, RDF.type, URIRef(fmt.rdf_type))) + self.graph.add((self.hub_root, ODML_NS.hasDocument, curr_node)) + + # If available, add the documents' filename to the document node + # so we can identify where the data came from. + if hasattr(doc, "_origin_file_name"): + curr_lit = Literal(doc._origin_file_name) + self.graph.add((curr_node, ODML_NS.hasFileName, curr_lit)) + + for k in fmt.rdf_map_keys: + curr_pred = fmt.rdf_map(k) + curr_val = getattr(doc, k) + + # Ignore an "id" entry, it has already been used to create the node itself. + if k == "id" or not curr_val: + continue + elif k == "repository": + self.save_repository_node(curr_node, curr_pred, curr_val) + elif k == "sections": + # generating nodes for child sections + self.save_odml_list(curr_val, curr_node, curr_pred) + elif k == "date": + curr_lit = Literal(curr_val, datatype=XSD.date) + self.graph.add((curr_node, curr_pred, curr_lit)) + else: + curr_lit = Literal(curr_val) + self.graph.add((curr_node, curr_pred, curr_lit)) + def save_element(self, odml_elem, node=None): """ Save the current odml element to the RDF graph and handle all child elements of the current odml element recursively. - :param odml_elem: An odml element that should be added to the RDF graph. :param node: An RDF node that is used to append the current odml element to the RDF graph. If None, a new node will be created and From 8172f9eab8c4b3e1595bf6d3add62e0c6ab97e53 Mon Sep 17 00:00:00 2001 From: "M. 
Sonntag" Date: Wed, 13 Nov 2019 15:27:13 +0100 Subject: [PATCH 23/26] [tools/rdfwriter] Add save_sec method --- odml/tools/rdf_converter.py | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/odml/tools/rdf_converter.py b/odml/tools/rdf_converter.py index 9bcbdf86..e12efddb 100644 --- a/odml/tools/rdf_converter.py +++ b/odml/tools/rdf_converter.py @@ -203,6 +203,42 @@ def save_document(self, doc, curr_node=None): curr_lit = Literal(curr_val) self.graph.add((curr_node, curr_pred, curr_lit)) + def save_section(self, sec, curr_node): + """ + Add the current odML Section to the RDF graph and handle all child + elements recursively. + + :param sec: An odml Section that should be added to the RDF graph. + :param curr_node: An RDF node that is used to append the current odml element + to the current RDF graph. + """ + fmt = sec.format() + + # Add type of current node to the RDF graph + curr_type = fmt.rdf_type + # Handle section subclass types + sub_sec = self._get_section_subclass(sec) + if sub_sec: + curr_type = sub_sec + self.graph.add((curr_node, RDF.type, URIRef(curr_type))) + + for k in fmt.rdf_map_keys: + curr_pred = fmt.rdf_map(k) + curr_val = getattr(sec, k) + + # Ignore an "id" entry, it has already been used to create the node itself. + if k == "id" or not curr_val: + continue + elif k == "repository": + self.save_repository_node(curr_node, curr_pred, curr_val) + + # generating nodes for sections and properties + elif k in ["sections", "properties"]: + self.save_odml_list(curr_val, curr_node, curr_pred) + else: + curr_lit = Literal(curr_val) + self.graph.add((curr_node, curr_pred, curr_lit)) + def save_element(self, odml_elem, node=None): """ Save the current odml element to the RDF graph and handle all child From 7fa5dc328466573359fde7426a97dd8a074c1db6 Mon Sep 17 00:00:00 2001 From: "M. 
Sonntag" Date: Wed, 13 Nov 2019 15:27:52 +0100 Subject: [PATCH 24/26] [tools/rdfwriter] Add save_prop method --- odml/tools/rdf_converter.py | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/odml/tools/rdf_converter.py b/odml/tools/rdf_converter.py index e12efddb..bb68cc4b 100644 --- a/odml/tools/rdf_converter.py +++ b/odml/tools/rdf_converter.py @@ -239,6 +239,39 @@ def save_section(self, sec, curr_node): curr_lit = Literal(curr_val) self.graph.add((curr_node, curr_pred, curr_lit)) + def save_property(self, prop, curr_node): + """ + Add the current odML Property to the RDF graph and handle all child + elements. + + :param prop: An odml Property that should be added to the RDF graph. + :param curr_node: An RDF node that is used to append the current odml element + to the current RDF graph. + """ + fmt = prop.format() + + self.graph.add((curr_node, RDF.type, URIRef(fmt.rdf_type))) + + for k in fmt.rdf_map_keys: + curr_pred = fmt.rdf_map(k) + # Make sure the content of "value" is only accessed via its + # non deprecated property "values". + if k == "value": + curr_val = getattr(prop, fmt.map(k)) + else: + curr_val = getattr(prop, k) + + # Ignore "id" and empty values, but make sure the content of "value" + # is only accessed via its non deprecated property "values". + if k == "id" or not curr_val: + continue + elif k == "value": + # generating nodes for Property values + self.save_odml_values(curr_node, curr_pred, curr_val) + else: + curr_lit = Literal(curr_val) + self.graph.add((curr_node, curr_pred, curr_lit)) + def save_element(self, odml_elem, node=None): """ Save the current odml element to the RDF graph and handle all child From 38429e4235ccdf41ced99ce583c6961e4d30bcad Mon Sep 17 00:00:00 2001 From: "M.
Sonntag" Date: Wed, 13 Nov 2019 15:29:09 +0100 Subject: [PATCH 25/26] [tools/rdfwriter] Refactor save_elem usage To reduce complexity, the RDFWriter.save_elem method is removed and its usage replaced by the new methods RDFWriter.save_document, .save_section and .save_property. --- odml/tools/rdf_converter.py | 78 ++++--------------------------------- 1 file changed, 7 insertions(+), 71 deletions(-) diff --git a/odml/tools/rdf_converter.py b/odml/tools/rdf_converter.py index bb68cc4b..8bb995e6 100644 --- a/odml/tools/rdf_converter.py +++ b/odml/tools/rdf_converter.py @@ -90,7 +90,7 @@ def convert_to_rdf(self): if isinstance(doc, BaseDocument): # make sure links and includes are resolved before conversion doc.finalize() - self.save_element(doc) + self.save_document(doc) return self.graph @@ -137,7 +137,12 @@ def save_odml_list(self, odml_list, parent_node, rdf_predicate): for curr_item in odml_list: node = URIRef(ODML_NS + unicode(curr_item.id)) self.graph.add((parent_node, rdf_predicate, node)) - self.save_element(curr_item, node) + + fmt = curr_item.format() + if fmt.name == Section.name: + self.save_section(curr_item, node) + elif fmt.name == Property.name: + self.save_property(curr_item, node) def save_repository_node(self, parent_node, rdf_predicate, leaf_value): """ @@ -272,75 +277,6 @@ def save_property(self, prop, curr_node): curr_lit = Literal(curr_val) self.graph.add((curr_node, curr_pred, curr_lit)) - def save_element(self, odml_elem, node=None): - """ - Save the current odml element to the RDF graph and handle all child - elements of the current odml element recursively. - :param odml_elem: An odml element that should be added to the RDF graph. - :param node: An RDF node that is used to append the current odml element - to the RDF graph. If None, a new node will be created and - added to the 'Hub' node of the RDF graph. 
- """ - fmt = odml_elem.format() - - is_doc = fmt.name == Document.name - is_sec = fmt.name == Section.name - is_prop = fmt.name == Property.name - - curr_node = node - if not curr_node: - curr_node = URIRef(ODML_NS + unicode(odml_elem.id)) - - # Add type of current node to the RDF graph - curr_type = fmt.rdf_type - # Handle section subclass types - if is_sec: - sub_sec = self._get_section_subclass(odml_elem) - if sub_sec: - curr_type = sub_sec - self.graph.add((curr_node, RDF.type, URIRef(curr_type))) - - # Add a new document to the RDF Hub node - if is_doc: - self.graph.add((self.hub_root, ODML_NS.hasDocument, curr_node)) - - # If available, add the documents' filename to the document node - # so we can identify where the data came from. - if hasattr(odml_elem, "_origin_file_name"): - curr_lit = Literal(odml_elem._origin_file_name) - self.graph.add((curr_node, ODML_NS.hasFileName, curr_lit)) - - for k in fmt.rdf_map_keys: - # Ignore "id" and empty values, but make sure the content of "value" - # is only accessed via its non deprecated property "values". - if k == "id" or k == "value" and not getattr(odml_elem, fmt.map(k)): - continue - elif k != "value" and not getattr(odml_elem, k): - continue - - if (is_doc or is_sec) and k == "repository": - self.save_repository_node(curr_node, fmt.rdf_map(k), - getattr(odml_elem, k)) - - # generating nodes for sections and properties - elif ((is_doc or is_sec) and k == "sections") or \ - (is_sec and k == "properties"): - self.save_odml_list(getattr(odml_elem, k), curr_node, fmt.rdf_map(k)) - - # generating nodes for Property values - elif is_prop and k == "value": - # 'value' needs to be mapped to its appropriate odml Property attribute. 
- self.save_odml_values(curr_node, fmt.rdf_map(k), - getattr(odml_elem, fmt.map(k))) - - elif k == "date": - curr_lit = Literal(getattr(odml_elem, k), datatype=XSD.date) - self.graph.add((curr_node, fmt.rdf_map(k), curr_lit)) - - else: - curr_lit = Literal(getattr(odml_elem, k)) - self.graph.add((curr_node, fmt.rdf_map(k), curr_lit)) - def _get_section_subclass(self, elem): """ _get_section_subclass checks whether the current odML element From 3ad35b417c542a8923f225a4654bab89209a235d Mon Sep 17 00:00:00 2001 From: "M. Sonntag" Date: Wed, 13 Nov 2019 15:32:31 +0100 Subject: [PATCH 26/26] [tools/rdfwriter] Change save_odml_list signature Changed the signature of RDFWriter.save_odml_list so it is identical to the signature of similar methods in the same class. --- odml/tools/rdf_converter.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/odml/tools/rdf_converter.py b/odml/tools/rdf_converter.py index 8bb995e6..cf377db7 100644 --- a/odml/tools/rdf_converter.py +++ b/odml/tools/rdf_converter.py @@ -124,15 +124,15 @@ def save_odml_values(self, parent_node, rdf_predicate, values): self.graph.add((seq, URIRef(custom_predicate), Literal(curr_val))) counter = counter + 1 - def save_odml_list(self, odml_list, parent_node, rdf_predicate): + def save_odml_list(self, parent_node, rdf_predicate, odml_list): """ save_odml_list adds all odml elements in a list to the current parent node and handles all child items via save_element. - :param odml_list: list of odml entities. :param parent_node: current parent node in the RDF graph. :param rdf_predicate: RDF predicate used to add all odml entities to the parent node. + :param odml_list: list of odml entities. 
""" for curr_item in odml_list: node = URIRef(ODML_NS + unicode(curr_item.id)) @@ -200,7 +200,7 @@ def save_document(self, doc, curr_node=None): self.save_repository_node(curr_node, curr_pred, curr_val) elif k == "sections": # generating nodes for child sections - self.save_odml_list(curr_val, curr_node, curr_pred) + self.save_odml_list(curr_node, curr_pred, curr_val) elif k == "date": curr_lit = Literal(curr_val, datatype=XSD.date) self.graph.add((curr_node, curr_pred, curr_lit)) @@ -239,7 +239,7 @@ def save_section(self, sec, curr_node): # generating nodes for sections and properties elif k in ["sections", "properties"]: - self.save_odml_list(curr_val, curr_node, curr_pred) + self.save_odml_list(curr_node, curr_pred, curr_val) else: curr_lit = Literal(curr_val) self.graph.add((curr_node, curr_pred, curr_lit))