In [None]:
# Detect whether the `dataiku` package is available; the result selects where
# the taxonomy files are read from later in the notebook.
# Only ImportError is treated as "not available": a bare `except:` would also
# swallow KeyboardInterrupt/SystemExit and hide unrelated failures.
try:
    import dataiku
except ImportError:
    dataiku_env = False
else:
    dataiku_env = True

In [None]:
import rdflib
from os.path import join

In [None]:
# Resolve the directory that holds the taxonomy Turtle files.
if not dataiku_env:
    # No Dataiku available: use a path relative to the working directory.
    file_path = join('..', 'data', 'rdf', 'xbrl', 'taxonomies')
else:
    # Dataiku available: use the "processed" subfolder of the "taxonomies" folder.
    handle = dataiku.Folder("taxonomies")
    file_path = handle.get_path() + "/processed/"

In [None]:
# SPARQL PREFIX declarations that are prepended to the query text below.
# NOTE(review): the concept-label query also uses the `rdf:` prefix, which is
# not declared here; presumably it is resolved from the namespaces bound on
# the graph being queried — confirm.
prefixes = """
    PREFIX xl: <http://www.xbrl.org/2003/xlink/>
    PREFIX xbrll: <https://w3id.org/vocab/xbrll/>
    PREFIX xlink: <http://www.w3.org/1999/xlink/>
"""

In [None]:
# Parse the schema and linkbase Turtle files into two separate graphs.
# The `with` blocks ensure each file handle is closed even when parsing fails.
rdf_schema = rdflib.Graph()
with open(join(file_path, 'schema.ttl'), encoding='utf-8') as schema_file:
    rdf_schema.parse(file=schema_file, format="turtle")

rdf_linkbase = rdflib.Graph()
with open(join(file_path, 'linkbase.ttl'), encoding='utf-8') as linkbase_file:
    rdf_linkbase.parse(file=linkbase_file, format="turtle")

# Process concept-labels

In [None]:
# Find every link that has xbrll:concept-label as the object of some predicate,
# together with its xl:from locator (and that locator's rdf:id) and its xl:to
# resource (which must carry an rdf:value).
concept_label_query = prefixes + """
    SELECT ?id ?locator ?resource ?link
    WHERE 
    {
        ?link ?pred xbrll:concept-label .
        ?link xl:from ?locator .
        ?link xl:to ?resource .
        ?locator rdf:id ?id .
        ?resource rdf:value ?label .
    }
"""
combinations = rdf_linkbase.query(concept_label_query)

In [None]:
# Materialise the query result so the matched rows are shown as the cell output.
[*combinations]

In [None]:
# Move concept-label information from the linkbase graph onto schema subjects.
# For each query row (id, locator, resource, link) and each schema subject
# whose rdf:id equals that id:
#   * locator triples are copied onto the subject, except the "type" predicate
#   * resource triples are copied onto the subject, except the "role" predicate
#   * every triple of the locator, resource and link is queued for removal
# All changes are collected first and applied after the loops have finished.
# NOTE(review): the "type" predicate is spelled ".../2003/XLink/type" here,
# while the `xl:` query prefix uses lowercase ".../2003/xlink/" — confirm
# which spelling the data actually uses.
rdf_id_pred = rdflib.term.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#id')
xl_type_pred = rdflib.term.URIRef('http://www.xbrl.org/2003/XLink/type')
xlink_role_pred = rdflib.term.URIRef('http://www.w3.org/1999/xlink/role')

to_add = []
to_remove = []
for concept_id, locator_node, resource_node, link_node in combinations:
    matching_subjects = rdf_schema.triples((None, rdf_id_pred, concept_id))
    for subj, _, _ in matching_subjects:

        # Locator: copy everything but its type, then drop it from the linkbase.
        for locator, pred, obj in rdf_linkbase.triples((locator_node, None, None)):
            if not (pred == xl_type_pred):
                to_add.append((subj, pred, obj))
            to_remove.append((locator, pred, obj))

        # Resource: copy everything but its role, then drop it from the linkbase.
        for resource, pred, obj in rdf_linkbase.triples((resource_node, None, None)):
            if not (pred == xlink_role_pred):
                to_add.append((subj, pred, obj))
            to_remove.append((resource, pred, obj))

        # Link: nothing is copied; all of its triples are only removed.
        to_remove.extend(
            (link, pred, obj)
            for link, pred, obj in rdf_linkbase.triples((link_node, None, None))
        )

for new_triple in to_add:
    rdf_schema.add(new_triple)
for stale_triple in to_remove:
    rdf_linkbase.remove(stale_triple)

In [None]:
# Write the linkbase graph to 'linkbase_optimized.ttl' as Turtle.
# Graph.serialize() returns bytes in older rdflib releases and str in newer
# ones, so decode only when bytes are returned; calling .decode() on a str
# would raise AttributeError.
linkbase_turtle = rdf_linkbase.serialize(format='turtle')
if isinstance(linkbase_turtle, bytes):
    linkbase_turtle = linkbase_turtle.decode('utf-8')
# `with` closes the output file even if the write fails.
with open(join(file_path, 'linkbase_optimized.ttl'), "w", encoding="utf-8") as fg:
    fg.write(linkbase_turtle)

In [None]:
# Write the schema graph to 'schema_optimized.ttl' as Turtle.
# Graph.serialize() returns bytes in older rdflib releases and str in newer
# ones, so decode only when bytes are returned; calling .decode() on a str
# would raise AttributeError.
schema_turtle = rdf_schema.serialize(format='turtle')
if isinstance(schema_turtle, bytes):
    schema_turtle = schema_turtle.decode('utf-8')
# `with` closes the output file even if the write fails.
with open(join(file_path, 'schema_optimized.ttl'), "w", encoding="utf-8") as fg:
    fg.write(schema_turtle)