In [None]:
from os import listdir
from os.path import join, isfile
from lxml import etree
from src.data import PackageManager
from src.data import XbrlConst
from collections import defaultdict
from src.data import xbrl2rdf, process_linkbase, process_instance, process_schema
import src.data
from io import StringIO, BytesIO
import rdflib

In [None]:
# Folder holding the downloaded taxonomy packages, and the folder the
# generated turtle files are written to.
XBRL_TAXONOMY_PATH = join('..', 'data', 'external', 'taxonomies')
output_dir = join("..", "data", "raw")

# Regular files in the taxonomy folder whose name ends in 'zip'.
# NOTE(review): listdir() order is not guaranteed and later cells pick
# packages by position — confirm the resulting order is stable enough.
taxonomies = [
    entry
    for entry in listdir(XBRL_TAXONOMY_PATH)
    if isfile(join(XBRL_TAXONOMY_PATH, entry)) and entry.endswith('zip')
]

# Register every archive with the package manager, then rebuild the
# remappings and save the manager's state.
manager = PackageManager.Taxonomies(XBRL_TAXONOMY_PATH)
for archive_name in taxonomies:
    archive_path = join(XBRL_TAXONOMY_PATH, archive_name)
    manager.addPackage(archive_path)
manager.rebuildRemappings()
manager.save()

# Show the URL of each registered package; the printed order is the order
# of manager.config['packages'], which later cells index into.
registered_packages = manager.config['packages']
for package in registered_packages:
    print(package['URL'])

In [None]:
# Open the package at position 2 of the manager's package list.
# NOTE(review): the remappings below are taken from position 1 while the URL
# and name come from position 2 — confirm this mix is intentional.
selected_package = manager.config['packages'][2]
remapping_source = manager.config['packages'][1]

f = src.data.openFileSource(selected_package['URL'])
f.mappedPaths = remapping_source["remappings"]

# Name and URI of the selected package, used later in the turtle header.
package_name = selected_package['name']
package_uri = selected_package['URL']

In [None]:
# Keep only the entries of the opened package whose last three characters
# are 'xsd' or 'xml', then report how many were found.
wanted_suffixes = ('xsd', 'xml')
xmlfiles = [member for member in f.dir if member[-3:] in wanted_suffixes]
print(len(xmlfiles))

In [None]:
# Parse every selected file into an lxml root element. roots[i] corresponds
# to xmlfiles[i]; later cells rely on that positional pairing.
roots = list()

# The parser configuration is identical for every file, so build it once.
xml_parser = etree.XMLParser(remove_comments=True)

for file in xmlfiles:
    # Close each handle as soon as it has been read instead of leaving
    # one open handle per file behind.
    # NOTE(review): assumes the object returned by f.fs.open() supports the
    # context-manager protocol (true for zipfile.ZipFile.open) — confirm.
    with f.fs.open(file, "r") as fp:
        content = fp.read()
    roots.append(etree.fromstring(content, parser=xml_parser))

In [None]:
# Prefix -> namespace URI table used for the turtle header and handed to the
# process functions. Starts from a fixed set of well-known prefixes.
namespaces = {'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
              'rdfs': 'http://www.w3.org/2000/01/rdf-schema#',
              'xl': 'http://www.xbrl.org/2003/XLink',
              'xsd': 'http://www.w3.org/2001/XMLSchema',
              'ext': 'http://eiopa.europa.eu/xbrl/ext',
              'xbrll': 'https://w3id.org/vocab/xbrll',
              's2c_dom': 'http://eiopa.europa.eu/xbrl/s2c/dict/dom'}

# Add the namespace declarations of every parsed root; a later document
# overrides an earlier one when both declare the same prefix.
# The lower-cased copy of nsmap that used to be built here was never used
# (the raw nsmap was merged), so it has been dropped; the resulting mapping
# is unchanged.
for root in roots:
    namespaces.update(root.nsmap)

# lxml files the default (unprefixed) namespace under the key None; it has
# no prefix to emit, so remove it if present.
namespaces.pop(None, None)

In [None]:
# Module-level graph that process_files() parses every generated turtle
# fragment into; it accumulates triples across all process_files() calls.
g = rdflib.Graph()

def process_files(process_function, package_name, extension, output_file, max_files=1000):
    """Convert the parsed files with a given extension to turtle and save them.

    Each root in ``roots`` whose matching ``xmlfiles`` entry ends in
    ``extension`` is passed to ``process_function``. The turtle text it
    produces is parsed into the module-level graph ``g``; fragments that
    parse successfully are appended to the output file, fragments that fail
    are reported on stdout and left out.

    Reads the module-level names ``namespaces``, ``package_uri``, ``roots``,
    ``xmlfiles``, ``g`` and ``output_dir``.

    Parameters
    ----------
    process_function : callable
        Called as ``process_function(root, params)``; must return the
        (possibly updated) ``params`` dict, whose ``'output'`` entry holds
        the generated text.
    package_name : str
        Package name written into the header comment of the output.
    extension : str
        Three-character suffix (e.g. ``'xsd'``) selecting the files to process.
    output_file : str
        File name, relative to ``output_dir``, the result is written to.
    max_files : int or None, optional
        Only the first ``max_files`` roots are considered (default 1000, the
        previously hard-coded limit). ``None`` processes all roots.

    Returns
    -------
    None
    """
    params = {'linkNumber' : 0, 'namespaces': namespaces}

    prefix = '# RDF triples (in turtle syntax) imported from XBRL resource\n'
    prefix += '# URI:  '+package_uri+'\n'
    prefix += '# name: '+package_name+'\n\n'
    prefix += '# the namespaces\n'
    for namespace, uri in namespaces.items():
        # URIs containing '#' are used as-is; all others get a trailing '/'.
        if '#' in uri:
            prefix += "@prefix "+namespace.lower()+": <"+uri.lower()+">.\n"
        else:
            prefix += "@prefix "+namespace.lower()+": <"+uri.lower()+"/>.\n"
    prefix += '\n'

    # Collect fragments in a list and join once, instead of growing one big
    # string with repeated concatenation.
    fragments = [prefix]
    for idx, root in enumerate(roots[:max_files]):
        # Files of another type contribute nothing to the output, so skip
        # them before doing any work (previously the bare prefix was parsed
        # into the graph for each of them, adding no triples).
        if xmlfiles[idx][-3:] != extension:
            continue

        params['output'] = StringIO()
        # Base URI: the file path without its first component and file name.
        params['base'] = "http://"+"/".join(xmlfiles[idx].split("/")[1:-1])
        params = process_function(root, params)

        # U+2264 is removed from the generated text before parsing.
        string = params['output'].getvalue().replace('\u2264', '')
        try:
            # The prefix is prepended so each fragment parses on its own.
            g.parse(data=prefix+string, format='turtle')
        except Exception as err:
            # Best effort: report the failing file and carry on with the
            # rest. Exception (not a bare except) so that Ctrl-C still
            # interrupts a long run.
            print("error in: " +str(idx) + ": "+ xmlfiles[idx])
            print(repr(err))
            print(string)
        else:
            fragments.append(string)

    # The with-block closes the file even if writing fails.
    with open(join(output_dir, output_file), "w", encoding = "utf-8") as fp:
        fp.write("".join(fragments))

# Run the conversion twice: schema files ('xsd') go to schema.ttl, then
# linkbase files ('xml') go to linkbase.ttl.
conversion_jobs = (
    (process_schema.processSchema, 'xsd', "schema.ttl"),
    (process_linkbase.processLinkBase, 'xml', "linkbase.ttl"),
)
for handler, suffix, target_name in conversion_jobs:
    process_files(handler, package_name, suffix, target_name)

In [None]:
# Report how many statements (triples) the graph holds.
statement_count = len(g)
print("graph has {} statements.".format(statement_count))