In [None]:
%pylab inline

In [None]:
# Detect whether the notebook runs inside Dataiku: the `dataiku` package can
# only be imported there. Only a failed import switches to the local setup;
# any other error is left to surface instead of being silently swallowed.
try:
    import dataiku
    dataiku_env = True
except ImportError:
    dataiku_env = False

In [None]:
from itertools import islice
from os import listdir
from os.path import isfile, join

import matplotlib.pyplot as plt
import networkx as nx
import pandas as pd
import rdflib
from networkx import Graph as NXGraph
from rdflib.extras.external_graph_libs import rdflib_to_networkx_multidigraph

In [None]:
# Resolve the directories that hold the Turtle files. Both branches must define
# taxonomy_file_path, instance_file_path, external_eiopa_path and
# external_gleif_path, because the loading cell reads from all four.
if dataiku_env:
    handle = dataiku.Folder("taxonomies")
    taxonomy_file_path = handle.get_path() + "/processed/"
    handle = dataiku.Folder("external")
    external_file_path = handle.get_path() + "/"
    # NOTE(review): assumes the "external" managed folder mirrors the local
    # layout with "eiopa" and "gleif" sub-folders -- TODO confirm.
    external_eiopa_path = join(external_file_path, "eiopa")
    external_gleif_path = join(external_file_path, "gleif")
    handle = dataiku.Folder("instances")
    instance_file_path = handle.get_path() + "/real/processed/"
else:
    taxonomy_file_path = join('..', 'data', 'rdf', 'xbrl', 'taxonomies')
    instance_file_path = join('..', 'data', 'rdf', 'xbrl', 'instances')
    external_eiopa_path = join('..', 'data', 'external', 'eiopa')
    external_gleif_path = join('..', 'data', 'external', 'gleif')

Load the taxonomy, the instances and the external EIOPA and GLEIF data into one graph

In [None]:
def _list_turtle_files(directory):
    """Return the sorted names of the regular files in `directory` whose name ends in 'ttl' (any case)."""
    return sorted(
        name for name in listdir(directory)
        if isfile(join(directory, name)) and name.lower().endswith('ttl')
    )


def _parse_turtle_files(graph, directory, file_names):
    """Parse every named Turtle file from `directory` into `graph` and report each one."""
    for ttl_file in file_names:
        with open(join(directory, ttl_file), 'rb') as file:
            graph.parse(data=file.read(), format="turtle")
        print('added {} to graph g'.format(ttl_file))


# All sources are merged into the single graph `g`.
g = rdflib.Graph()

# Only the optimized schema is loaded; 'linkbase2.ttl' is currently left out.
taxonomy_files = ['schema_optimized.ttl']
_parse_turtle_files(g, taxonomy_file_path, taxonomy_files)

instance_files = _list_turtle_files(instance_file_path)
_parse_turtle_files(g, instance_file_path, instance_files)

external_eiopa_files = _list_turtle_files(external_eiopa_path)
_parse_turtle_files(g, external_eiopa_path, external_eiopa_files)

external_gleif_files = _list_turtle_files(external_gleif_path)
_parse_turtle_files(g, external_gleif_path, external_gleif_files)

In [None]:
# Report how many triples the combined graph holds.
statement_count = len(g)
print("graph has {} statements.".format(statement_count))

Select an arbitrary context and make a new graph by selecting outwards from the context

In [None]:
def select_part_graph(g, begin_triple, from_connections, to_connections):
    """
    Build a sub-graph of `g` by walking outwards from the subject of `begin_triple`.

    The walk takes one step per entry of the two connection lists. In each
    step, for every node reached in the previous step, it copies

    * at most ``from_connections[step]`` triples that have the node as object
      (upstream); their subjects are explored in the next step, and
    * at most ``to_connections[step]`` triples that have the node as subject
      (downstream); their objects are explored in the next step.

    g:
        the graph to explore
    begin_triple:
        the triple from which the graph is explored; only its subject is used
    from_connections :
        is a list containing, per step, the max number of upstream triples
        which should be explored per node
    to_connections:
        is a list containing, per step, the max number of downstream triples
        which should be explored per node

    Returns a new rdflib.Graph holding the copied triples.
    Raises AssertionError when the two lists differ in length.
    """
    if len(from_connections) != len(to_connections):
        raise AssertionError('length of lists are not equal')

    g_lite = rdflib.Graph()

    frontier = [begin_triple[0]]
    for max_up, max_down in zip(from_connections, to_connections):
        reached = set()
        for node in frontier:
            # islice stops pulling from the triple iterator once the limit is
            # hit, so a limit of 0 costs nothing even for a node with very many
            # incoming triples. max(..., 0) makes a negative limit select
            # nothing instead of raising.
            for found in islice(g.triples((None, None, node)), max(max_up, 0)):
                reached.add(found[0])
                g_lite.add(found)

            for found in islice(g.triples((node, None, None)), max(max_down, 0)):
                reached.add(found[2])
                g_lite.add(found)

        # The set removes duplicates, so each node is expanded once per step.
        frontier = list(reached)

    return g_lite

In [None]:
# Parameters of the sub-graph selection.
CONTEXT_TYPE = rdflib.term.URIRef('https://w3id.org/vocab/xbrll/Context')
START_INDEX = 26             # which of the matching triples to start from
N_STEPS = 7                  # how many steps to walk outwards
MAX_DOWNSTREAM_PER_STEP = 8  # downstream triples followed per node and step

list_a = list(g.triples((None, None, CONTEXT_TYPE)))
# list_a = list(g.triples((None, rdflib.term.URIRef('https://www.gleif.org/ontology/l1/LEI'), rdflib.term.Literal('72450051YQLIROHV2228'))))
if not list_a:
    raise IndexError('no triples with object {} found in graph g'.format(CONTEXT_TYPE))
# Wrap the index so that graphs with fewer matches than START_INDEX + 1 still
# yield a starting triple instead of failing.
triple = list_a[START_INDEX % len(list_a)]
# Only downstream triples are followed; the upstream limit is 0 at every step.
g_lite = select_part_graph(g, triple, [0] * N_STEPS, [MAX_DOWNSTREAM_PER_STEP] * N_STEPS)
print("graph has {} statements.".format(len(g_lite)))

Convert the graph to a networkx graph and define the edge and node labels

In [None]:
# Convert the rdflib sub-graph into a networkx multidigraph; the cells below
# derive the labels and the layout from it.
nxgraph = rdflib_to_networkx_multidigraph(g_lite)

In [None]:
# Label every (source, target) pair with the last path segment of the edge key.
# The dictionary is keyed by node pair only, so when several edges link the
# same two nodes the one iterated last provides the label.
edge_labels = {
    (source, target): str(key).split('/')[-1]
    for source, target, key in nxgraph.edges(keys=True)
}

In [None]:
# URI nodes are labelled with their last path segment (without the word
# "instance"), blank nodes get no label, and every other node is shown with
# its text broken into one word per line.
node_labels = {}
for term in nxgraph.nodes:
    if isinstance(term, rdflib.term.URIRef):
        last_segment = str(term).split("/")[-1]
        node_labels[term] = last_segment.replace("instance", "")
    elif isinstance(term, rdflib.term.BNode):
        continue
    else:
        node_labels[term] = str(term).replace(" ", "\n")

Plot the network

In [None]:
# Directed shortest-path lengths for every pair of nodes joined by a path.
path_lengths = dict(nx.shortest_path_length(nxgraph))

# Pairs without a directed path get the largest finite distance in the graph,
# so the layout keeps them apart without needing infinite values.
max_distance = max(
    (max(lengths.values()) for lengths in path_lengths.values()),
    default=0,
)

# Two-level distance dictionary for the layout: the outer key is the target
# node, the inner key is the source node.
nodes = list(nxgraph.nodes())
dist = {
    target: {
        source: path_lengths[source].get(target, max_distance)
        for source in nodes
    }
    for target in nodes
}

# The same distances as a table (rows = source, columns = target) for inspection.
df = pd.DataFrame(dist, index=nodes, columns=nodes)

pos = nx.kamada_kawai_layout(nxgraph, dist=dist)
#pos = nx.spring_layout(nxgraph)

# Plot
print("Visualizing the graph:")

plt.figure(1, figsize = (30, 20))

nx.draw_networkx_edge_labels(nxgraph, pos = pos, edge_labels = edge_labels, font_color='black', font_size = 10)
nx.draw_networkx_labels(nxgraph, pos, labels = node_labels, font_size = 10)
nx.draw(nxgraph,
        pos,
        edge_color = 'black',
        with_labels = False,
        node_color='black',
        node_size = 3250,
        arrowsize=40,
        alpha= 0.1,
        width= 1,
        font_weight= 'regular')
ax = plt.gca()
# Recolour the outline of the first artist collection on the axes
# (presumably the node markers drawn by nx.draw -- confirm).
ax.collections[0].set_edgecolor("#555555")
plt.show()