In [None]:
# Record whether the `dataiku` package can be imported. The path-configuration
# cell branches on `dataiku_env` to pick the input and output locations.
# Note: the name `dataiku` is only bound when the import succeeds.
try:
    import dataiku
    dataiku_env = True
except Exception:
    # Any ordinary import failure means "not running with Dataiku available".
    # Unlike a bare `except:`, this lets KeyboardInterrupt / SystemExit
    # propagate instead of being silently swallowed.
    dataiku_env = False

In [None]:
import os
from os import listdir
from os.path import join, isfile, exists as os_exists

import rdflib
from lxml import etree

from src.data import process_instance

In [None]:
# Resolve the directory the XBRL instances are read from (`base_dir`) and the
# directory the generated Turtle files are written to (`output_dir`).
if dataiku_env:
    # With Dataiku, work inside the "instances" folder and keep the results
    # in a "processed" sub-directory of the selected data folder.
    base_folder = dataiku.Folder("instances").get_path()
    data_folder = 'fake/'
    base_dir = join(base_folder, data_folder)
    output_dir = join(base_dir, "processed")
else:
    # Otherwise use paths relative to the notebook's working directory.
    base_dir = join("..", "data", "external", "xbrl", "instances")
    output_dir = join("..", "data", "rdf", "xbrl", "instances")

# makedirs(exist_ok=True) creates any missing parent directories as well and
# does nothing when the directory already exists, so this cell can be re-run.
os.makedirs(output_dir, exist_ok=True)

In [None]:
# Prefix -> namespace URI table used for the RDF output. The conversion loop
# merges these entries over the namespaces read from each instance document,
# so a prefix listed here wins over the same prefix found in the file.
rdf_namespaces = dict([
    ("rdf", "http://www.w3.org/1999/02/22-rdf-syntax-ns#"),
    ("rdfs", "http://www.w3.org/2000/01/rdf-schema#"),
    ("xl", "http://www.xbrl.org/2003/XLink"),
    ("xsd", "http://www.w3.org/2001/XMLSchema"),
    ("xbrll", "https://w3id.org/vocab/xbrll"),
    ("dnb", "http://dnb.nl/"),
    ("gleif-l1", "https://www.gleif.org/ontology/L1"),
])

In [None]:
# Convert every XBRL instance found in `base_dir` to Turtle and store the
# result in `output_dir` under the same stem with a ".ttl" extension.
#
# Matching on the full ".xbrl" suffix (dot included) guarantees that the
# `file[:-5]` slicing below strips exactly the extension. The list is sorted
# because listdir() returns entries in arbitrary order.
xbrl_files = sorted(
    f for f in listdir(base_dir)
    if isfile(join(base_dir, f)) and f.lower().endswith('.xbrl')
)
for file in xbrl_files:
    print("processing: "+str(file))
    base = join(base_dir, file)
    with open(base, "rb") as fp:
        root = etree.fromstring(fp.read(), parser=etree.XMLParser(remove_comments=True))
    # Prefixes and URIs are lower-cased before being handed to processInstance.
    # lxml reports a default (unprefixed) namespace under the key None, which
    # has no .lower(); that key is passed through unchanged.
    # NOTE(review): confirm processInstance tolerates a None prefix.
    namespaces = dict(
        (prefix.lower() if prefix is not None else prefix, uri.lower())
        for prefix, uri in root.nsmap.items()
    )
    # Entries from rdf_namespaces override same-named prefixes from the file.
    namespaces.update(rdf_namespaces)
    instance = process_instance.processInstance(root, file, namespaces)
    try:
        turtle_doc = instance['output'].getvalue()
        # Parse first so that only Turtle accepted by rdflib is written out;
        # a parse error skips the write for this file.
        rdflib.Graph().parse(data=turtle_doc, format="turtle")
        target = join(output_dir, file[:-5] + '.ttl')
        with open(target, 'w', encoding='utf-8') as fp:
            fp.write(str(turtle_doc))
        print("      written to: " + str(target))
    except Exception as e:
        # Best effort: report the problem and continue with the next file.
        print(' ')
        print(e)
        print(' ')