In [1]:
# Detect whether the notebook runs inside Dataiku: the flag stays True only
# if the `dataiku` package can be imported.
dataiku_env = True
try:
    import dataiku
except ImportError:
    # Only a missing package means "not on Dataiku"; anything else
    # (KeyboardInterrupt, errors raised inside the package) should surface.
    dataiku_env = False

In [2]:
import os
from os import listdir
from os.path import join, isfile, exists as os_exists

from lxml import etree
import rdflib

from src.data import process_instance

In [3]:
# Resolve the folder the XBRL instances are read from (base_dir) and the
# folder the converted files are written to (output_dir).
if dataiku_env:
    # On Dataiku the data sits in the "real/" sub-folder of the folder
    # registered as "instances"; results go to a "processed" folder below it.
    base_folder = dataiku.Folder("instances").get_path()
    data_folder = 'real/'
    base_dir = join(base_folder, data_folder)
    output_dir = join(base_dir, "processed")
else:
    # Outside Dataiku both locations are relative paths.
    base_dir = join("..", "data", "external", "instances")
    output_dir = join("..", "data", "raw", "instances")

# makedirs with exist_ok=True creates missing parent directories as well and
# is a no-op when the directory already exists, so the cell is re-runnable.
os.makedirs(output_dir, exist_ok=True)

In [8]:
# Fixed prefix -> namespace URI table. It is merged over each document's own
# namespace map before an instance is processed.
# NOTE(review): several URIs carry no trailing '#' or '/' — confirm this is
# what the consumer of this table expects.
rdf_namespaces = {
    "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
    "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
    "xl": "http://www.xbrl.org/2003/XLink",
    "xsd": "http://www.w3.org/2001/XMLSchema",
    "xbrll": "https://w3id.org/vocab/xbrll",
    "dnb": "http://dnb.nl/",
    "gleif-l1": "https://www.gleif.org/ontology/L1",
}

In [9]:
# Convert every XBRL instance found in base_dir: parse the XML, run it through
# process_instance.processInstance, load the resulting Turtle into the shared
# graph `g`, and write it next to the other outputs as <name>.ttl.
g = rdflib.Graph()

# Match on the full ".xbrl" extension (case-insensitive) so that the suffix
# stripped below is exactly the suffix that was tested; sort the listing so
# the processing order does not depend on the filesystem.
xbrl_suffix = ".xbrl"
xbrl_files = sorted(
    f for f in listdir(base_dir)
    if isfile(join(base_dir, f)) and f.lower().endswith(xbrl_suffix)
)

for file in xbrl_files:
    print("processing: "+str(file))
    base = join(base_dir, file)
    with open(base, "rb") as fp:
        root = etree.fromstring(fp.read(), parser=etree.XMLParser(remove_comments=True))
    # Entries of rdf_namespaces win over same-named prefixes from the document.
    namespaces = {**root.nsmap, **rdf_namespaces}
    instance = process_instance.processInstance(root, file, namespaces)
    ttl_name = file[:-len(xbrl_suffix)] + '.ttl'
    try:
        # Read the buffer once and reuse it for both the graph and the file.
        turtle_text = instance['output'].getvalue()
        # Parse first: the .ttl file is only written when the Turtle is valid.
        g.parse(data=turtle_text, format="turtle")
        with open(join(output_dir, ttl_name), 'w', encoding='utf-8') as fp:
            fp.write(str(turtle_text))
        print("      written to: " + ttl_name)
    except Exception as e:
        # Best effort: report the failure and continue with the next file.
        print(' ')
        print(e)
        print(' ')

processing: Xbrl_Instance_000305_JE_CONCZ_ARG2_4_2603_10572282028095600444513.xbrl
      written to: Xbrl_Instance_000305_JE_CONCZ_ARG2_4_2603_10572282028095600444513.ttl
processing: Xbrl_Instance_603492_JE_NZV_ARS_2_4_1806_164516541110199214850284.xbrl
      written to: Xbrl_Instance_603492_JE_NZV_ARS_2_4_1806_164516541110199214850284.ttl
processing: Xbrl_Instance_DMI006_S2AG_2019Q4_ASR_0204_16519496432483166490628.xbrl
      written to: Xbrl_Instance_DMI006_S2AG_2019Q4_ASR_0204_16519496432483166490628.ttl
processing: Xbrl_Instance_DMI006_S2AS_219Q4_2523_3003_07452433859020140614697.xbrl
      written to: Xbrl_Instance_DMI006_S2AS_219Q4_2523_3003_07452433859020140614697.ttl
processing: Xbrl_Instance_DMI006_S2AS_219Q4_2528_2906_10295540553839551139114.xbrl
      written to: Xbrl_Instance_DMI006_S2AS_219Q4_2528_2906_10295540553839551139114.ttl
processing: Xbrl_Instance_DMI006_S2QS_219Q4_2523_0402_15364061702482115404508.xbrl
      written to: Xbrl_Instance_DMI006_S2QS_219Q4_2523_0402_15