In [None]:
%pylab inline

In [None]:
import dataiku
from dataiku import pandasutils as pdu
import pandas as pd
import rdflib
from rdflib.extras.external_graph_libs import rdflib_to_networkx_multidigraph
import networkx as nx
from networkx import Graph as NXGraph
import matplotlib.pyplot as plt
from os.path import isfile
from os import listdir

Load both the taxonomy and the instance files into a single RDF graph

In [None]:
# Parse every Turtle file (taxonomy first, then instances) into one rdflib graph.
g = rdflib.Graph()

# Taxonomy files, read from the "processed" sub-directory of the "taxonomies" folder.
taxonomy_files = ['schema2.ttl']  # 'linkbase2.ttl' is currently not loaded
handle = dataiku.Folder("taxonomies")
file_path = handle.get_path() + "/processed/"
for taxonomy_file in taxonomy_files:
    # The with-block closes the file even when parsing raises.
    with open(file_path + taxonomy_file, encoding = 'utf-8') as input_file:
        g.parse(file=input_file, format="turtle")
    print('added {} to graph g'.format(taxonomy_file))

# Instance files: every regular file in "real/processed" of the "instances"
# folder whose name ends in "ttl" (case-insensitive).
handle = dataiku.Folder("instances")
file_path = handle.get_path() + "/real/processed/"
ttl_files = [f for f in listdir(file_path)
             if isfile(file_path + f) and f.lower().endswith('ttl')]
for ttl_file in ttl_files:
    with open(file_path + ttl_file, encoding = 'utf-8') as instance_file:
        g.parse(file=instance_file, format="turtle")
    print('added {} to graph g'.format(ttl_file))

In [None]:
# Report how many triples the combined graph g holds.
statement_count = len(g)
print("graph has {} statements.".format(statement_count))

Select one context (the first one the graph returns, so effectively an arbitrary one) and build a new, smaller graph by selecting outwards from that context

In [None]:
# Build a small sub-graph (g_lite) of g around one context so it can be plotted.

# Every triple whose object is this URI is treated as declaring a context
# (presumably these are rdf:type statements -- confirm against the data).
CONTEXT_CLASS = rdflib.term.URIRef('https://w3id.org/vocab/xbrll/Context')
# Number of contexts to expand; the first ones returned by the graph are used.
N_CONTEXTS = 1
# Maximum number of triples followed per node at each successive hop when
# walking outwards (subject -> object) from the context node.
OUTGOING_LIMITS = (6, 8, 8, 8)
# First entry: maximum number of triples pointing AT the context node that are
# kept; remaining entries: outgoing limits applied from each such referrer.
INCOMING_LIMITS = (6, 8)


def _add_outgoing(source, target, node, limits):
    """Copy a bounded neighbourhood of outgoing triples from source into target.

    At most limits[0] triples with subject `node` are copied; for each copied
    triple the walk continues from its object with the remaining limits.
    The walk stops when `limits` is exhausted.
    """
    if not limits:
        return
    for triple in list(source.triples((node, None, None)))[:limits[0]]:
        target.add(triple)
        _add_outgoing(source, target, triple[2], limits[1:])


g_lite = rdflib.Graph()
context_triples = list(g.triples((None, None, CONTEXT_CLASS)))
if not context_triples:
    # Fail with an explicit message instead of an IndexError further down.
    raise ValueError("graph g contains no triple with object {}".format(CONTEXT_CLASS))

for context_triple in context_triples[:N_CONTEXTS]:
    g_lite.add(context_triple)
    context_node = context_triple[0]
    # Outwards: what the context points to, several hops deep.
    _add_outgoing(g, g_lite, context_node, OUTGOING_LIMITS)
    # Inwards: what points to the context, plus one hop out from each referrer.
    for incoming in list(g.triples((None, None, context_node)))[:INCOMING_LIMITS[0]]:
        g_lite.add(incoming)
        _add_outgoing(g, g_lite, incoming[0], INCOMING_LIMITS[1:])
print("graph has {} statements.".format(len(g_lite)))

Convert the graph to a networkx graph and define the edge and node labels

In [None]:
# Turn the reduced rdflib graph into a networkx multi-digraph so that it can be
# laid out and drawn with networkx.
nxgraph = rdflib_to_networkx_multidigraph(graph=g_lite)

In [None]:
# One label per (source, target) pair: the text after the last "/" of the edge
# key. Parallel edges between the same two nodes share a dictionary key, so the
# edge iterated last determines the label.
edge_labels = {
    (source, target): str(key).split('/')[-1]
    for source, target, key, _attrs in nxgraph.edges(keys=True, data=True)
}

In [None]:
# Display text per node:
#   * URI nodes     -> last path segment, with the substring "instance" removed
#   * blank nodes   -> no label at all
#   * anything else -> its string form, spaces turned into line breaks, cut to 30 chars
node_labels = {}
for graph_node in nxgraph.nodes:
    node_text = str(graph_node)
    if isinstance(graph_node, rdflib.term.URIRef):
        last_segment = node_text.rsplit("/", 1)[-1]
        node_labels[graph_node] = last_segment.replace("instance", "")
        continue
    if isinstance(graph_node, rdflib.term.BNode):
        continue
    node_labels[graph_node] = node_text.replace(" ", "\n")[:30]

Plot the network

In [None]:
# Build a square node-by-node table of shortest-path lengths; it is passed to
# the Kamada-Kawai layout below as the desired distances between nodes.
df = pd.DataFrame(index=nxgraph.nodes(), columns=nxgraph.nodes())
for row, data in nx.shortest_path_length(nxgraph):
    for col, dist in data.items():
        df.loc[row,col] = dist

# Cells that received no path length above are still empty; fill them with the
# largest length present in the table.
df = df.fillna(df.max().max())

# NOTE(review): df.to_dict() is keyed column-first ({col: {row: value}}) --
# confirm this orientation is the one intended for `dist`.
pos = nx.kamada_kawai_layout(nxgraph, dist=df.to_dict())

# Plot
print("Visualizing the graph:")

plt.figure(1, figsize = (30, 20)) 

# Edge and node labels are drawn explicitly from the dictionaries built
# earlier, which is why nx.draw itself is called with with_labels=False.
nx.draw_networkx_edge_labels(nxgraph, pos = pos, edge_labels = edge_labels, font_color='black', font_size = 10)
nx.draw_networkx_labels(nxgraph, pos, labels = node_labels, font_size = 10)
nx.draw(nxgraph, 
        pos,
        edge_color = 'black', 
        with_labels = False, 
        node_color='black',
        node_size = 3250, 
        alpha= 0.1,
        width= 1,
        font_weight= 'regular')
ax = plt.gca()
# NOTE(review): presumably collections[0] is the node collection created by
# nx.draw, so this recolours the node outlines -- this depends on draw order; confirm.
ax.collections[0].set_edgecolor("#555555") 
plt.show()