# Rich Context: Knowledge Graph Visualization

This notebook loads the Rich Context knowledge graph from the `tmp.jsonld` JSON-LD file prepared by the `RCGraph` workflow.
It runs graph analytics on the KG using the `NetworkX` library, then creates an interactive visualization using the `PyVis` library.

The following installations are needed, if these libraries haven't already been installed:

In [1]:
!pip install pyvis
!pip install networkx
!pip install pandas
!pip install numpy



Load the KG from the `tmp.jsonld` file…

In [1]:
import json
import sys

# string ids of the corpus elements, in the order `parse_metadata()` appends
# them; `get_id()` turns a string id into its position in this list
IDS = []
# numeric id -> title of the element
LABELS = {}

# lookup tables, one per element type: string id -> dict describing the element
publications = {}
providers = {}
datasets = {}
journals = {}


def get_id (id):
    """ translate the string id of an element into its numeric id, which is
    the position of the first occurrence of that string in `IDS`

    raises `ValueError` when the id has not been registered in `IDS`
    """
    position = IDS.index(id)
    return int(position)


def parse_metadata (elem):
    """ parse the required metadata items from one element in the graph

    The string id is the part of `elem["@id"]` that follows the first "#".
    It is registered in `IDS` only if it is not there yet, so calling this
    function again for the same element does not pile up duplicate entries.
    The title is stored in `LABELS` under the element's numeric id.

    elem: one dict from the "@graph" list; must provide "@type", "@id"
          (containing a "#"), and "dct:title" with an "@value"

    returns the tuple `(id, kind, title)`
    """
    kind = elem["@type"]
    title = elem["dct:title"]["@value"]
    id = elem["@id"].split("#")[1]

    # register each id once; `get_id()` resolves to the first occurrence, so
    # the numeric id is the same whether or not duplicates were appended
    if id not in IDS:
        IDS.append(id)

    LABELS[get_id(id)] = title

    return id, kind, title


# input the corpus from the JSON-LD file

filename = "tmp.jsonld"

# decode explicitly as UTF-8 instead of relying on the platform's default
# text encoding, which is not UTF-8 everywhere
with open(filename, "r", encoding="utf-8") as f:
    jld_corpus = json.load(f)

# the elements of the knowledge graph are listed under the "@graph" key
corpus = jld_corpus["@graph"]

# report summary stats

print(f"{len(corpus)} corpus elements")

3424 corpus elements


In [2]:
# load the providers, datasets, and journals
#
# `parse_metadata()` is called once per element, in corpus order.
# Publications are set aside and handled in a second step, because they flag
# the datasets, providers, and journals they reference as "used" and so need
# those tables to be fully loaded first.

pub_elems = []

for elem in corpus:
    id, kind, title = parse_metadata(elem)

    if kind == "Provider":
        # the identifier is optional
        if "dct:identifier" in elem:
            ror = elem["dct:identifier"]["@value"]
        else:
            ror = ""

        providers[id] = {
            "id": id,
            "title": title,
            "ror": ror
        }

    elif kind == "Dataset":
        prov_id = elem["dct:publisher"]["@value"]

        datasets[id] = {
            "id": id,
            "title": title,
            "provider": prov_id
        }

    elif kind == "Journal":
        # the identifier is optional
        if "dct:identifier" in elem:
            issn = elem["dct:identifier"]["@value"]
        else:
            issn = ""

        journals[id] = {
            "id": id,
            "title": title,
            "issn": issn
        }

    elif kind == "ResearchPublication":
        pub_elems.append((id, title, elem))

# load the publications

for id, title, elem in pub_elems:
    cited = elem["cito:citesAsDataSource"]

    # a lone citation arrives as a dict; normalize it to a one-item list
    if isinstance(cited, dict):
        cited = [cited]

    dat_list = []

    for d in cited:
        dat_id = d["@id"].split("#")[1]
        datasets[dat_id]["used"] = True
        dat_list.append(dat_id)

        # a provider is in use as soon as one of its datasets is cited
        prov_id = datasets[dat_id]["provider"]
        providers[prov_id]["used"] = True

    if "dct:identifier" in elem:
        doi = elem["dct:identifier"]["@value"]
    else:
        doi = ""

    if "dct:publisher" in elem:
        jour_id = elem["dct:publisher"]["@id"].split("#")[1]
        journals[jour_id]["used"] = True
    else:
        # reset explicitly: otherwise the journal of the previously handled
        # publication would be recorded for this one
        jour_id = None

    publications[id] = {
        "id": id,
        "title": title,
        "doi": doi,
        "journal": jour_id,
        "datasets": dat_list
    }

# report summary stats

print(f"{len(publications)} publications")
print(f"{len(journals)} journals")
print(f"{len(providers)} providers")
print(f"{len(datasets)} datasets")

1473 publications
1027 journals
284 providers
640 datasets


Calculate graph analytics…

In [3]:
import networkx as nx

# undirected graph whose nodes are the numeric ids of the elements in use
nxg = nx.Graph()

# providers flagged as used
for p in providers.values():
    if "used" not in p:
        continue

    nxg.add_node(get_id(p["id"]))

# datasets flagged as used, each linked to its provider
for d in datasets.values():
    if "used" not in d:
        continue

    d_node = get_id(d["id"])
    nxg.add_node(d_node)
    nxg.add_edge(d_node, get_id(d["provider"]))

# journals flagged as used
for j in journals.values():
    if "used" not in j:
        continue

    nxg.add_node(get_id(j["id"]))

# every publication, linked to its journal (when it has one) and to each
# dataset it cites
for p in publications.values():
    p_node = get_id(p["id"])
    nxg.add_node(p_node)

    if p["journal"]:
        nxg.add_edge(p_node, get_id(p["journal"]))

    for d in p["datasets"]:
        nxg.add_edge(p_node, get_id(d))
    
#graph.add_edge(node0, node1, weight=self.edge_weight)
#graph.edge_betweenness_centrality

Run quantile analysis on the centrality results, to assess the relative impact of each element in the KG…

In [4]:
import numpy as np
import pandas as pd

def calc_quantiles (metrics, num_q):
    """ calculate `num_q` quantile boundaries for the given list

    metrics: sequence of numeric values
    num_q: number of probability points, spaced evenly over [0, 1] with both
           endpoints included

    returns a list with one value per probability point; "nearest"
    interpolation is used, so for non-empty input each value is one of the
    entries of `metrics`
    """
    bins = np.linspace(0, 1, num=num_q, endpoint=True)
    q = pd.Series(metrics).quantile(bins, interpolation="nearest")

    # `Series.iteritems()` was removed in pandas 2.0; `tolist()` gives the
    # same values in the same order on old and new pandas alike
    return q.tolist()

In [5]:
from operator import itemgetter
from scipy.stats import percentileofscore

# score every node of the graph with PageRank; `result` maps node -> score
result = nx.pagerank(nxg)
#result = nx.edge_betweenness_centrality(nxg)
ranks = [score for score in result.values()]

# quantile boundaries of the scores, and how many boundaries there are
quant = calc_quantiles(ranks, num_q=10)
num_quant = len(quant)

In [6]:
# sanity check: report any ranked node that is missing from the graph's node
# set or that has no label
nxg_set = {int(node) for node in nxg.nodes}

ranked = sorted(result.items(), key=itemgetter(1), reverse=True)

for id, rank in ranked:
    if id not in nxg_set:
        print(f"{id} not in nxg_set")
    if id not in LABELS:
        print(f"{IDS[id]} not in LABELS")

In [7]:
# numeric id -> [node size for the visualization, percentile as a fraction]
IDS_SCALE = {}

ranked = sorted(result.items(), key=itemgetter(1), reverse=True)

for id, rank in ranked:
    # percentile of this score among all scores
    impact = percentileofscore(ranks, rank)
    fraction = impact / 100.0

    # node size derived from the percentile
    scale = ((impact / num_quant) + 5) * 2
    IDS_SCALE[id] = [int(round(scale)), fraction]

    print("{:^5}\t{:.4f}\t{:.4f}\t{}".format(id, rank, fraction, LABELS[id]))

3163 	0.0368	1.0000	National Health and Nutrition Examination Survey
1108 	0.0257	0.9996	Supplemental Nutrition Assistance Program
 561 	0.0234	0.9991	Women, Infants, and Children
1942 	0.0141	0.9987	Food Security Survey Module
2047 	0.0131	0.9982	National Longitudinal Study of Adolescent to Adult Health
1552 	0.0072	0.9978	SSRN Electronic Journal
2998 	0.0067	0.9974	Public Health Nutr
1492 	0.0067	0.9969	PloS ONE
2713 	0.0062	0.9965	Current Population Survey Food Security Supplement
 395 	0.0055	0.9960	Microdatabase Direct investment
3327 	0.0050	0.9956	IRI Infoscan
 285 	0.0045	0.9951	Monthly balance sheet statistics
2777 	0.0039	0.9947	Pediatrics
1069 	0.0032	0.9943	BMC Public Health
1762 	0.0030	0.9938	Frankfurt a. M.: Deutsche Bundesbank
 825 	0.0030	0.9934	Diabetes care
  6  	0.0027	0.9929	Higher Education Research and Development Survey
2550 	0.0027	0.9925	Survey of Earned Doctorates
1561 	0.0027	0.9921	Three-City Study
3168 	0.0025	0.9916	Maryland Unemployment Insurance
 940 	0

1728 	0.0003	0.6235	Am J Hematol
1785 	0.0003	0.6235	J Clin Med Res
2257 	0.0003	0.6235	BMC Med Genet
2811 	0.0003	0.6235	J Am Soc Nephrol
2880 	0.0003	0.6235	Nutrition and Metabolism
1475 	0.0003	0.6210	Plasma A42 and Total Tau Predict Cognitive Decline in Amnestic Mild Cognitive Impairment
2464 	0.0003	0.6210	Neuroprotective effects of oleic acid in rodent models of cerebral ischaemia
3151 	0.0003	0.6204	No More Credit Score': Employer Credit Check Bans and Signal Substitution
3383 	0.0003	0.6199	Geographic concentration and high tech firm survival
1845 	0.0003	0.6195	J Consum Aff
 318 	0.0003	0.6190	Fam Soc
 675 	0.0003	0.6184	Food insecurity and breastfeeding
2190 	0.0003	0.6184	Relation between household food insecurity and breastfeeding in Canada
 86  	0.0003	0.6177	Sector concentration in loan portfolios and economic capital
2703 	0.0003	0.6173	Not in my backyard? Not so fast. The effect of marijuana legalization on neighborhood crime
1665 	0.0003	0.6166	Monitoring job search ef

 297 	0.0003	0.2454	Blood mercury reporting in NHANES: identifying Asian, Pacific Islander, Native American, and multiracial groups
 433 	0.0003	0.2454	Bisphenol A and Peripheral Arterial Disease: Results from the NHANES
 519 	0.0003	0.2454	Impact of smoking and thiocyanate on perchlorate and thyroid hormone associations in the 2001-2002 National Health and Nutrition Examination Survey
 645 	0.0003	0.2454	Serum selenium concentrations and diabetes in U.S. adults: National Health and Nutrition Examination Survey (NHANES) 2003-2004
 731 	0.0003	0.2454	Association of environmental cadmium exposure with periodontal disease in U.S. adults
 955 	0.0003	0.2454	Bisphenol A data in NHANES suggest longer than expected half-life, substantial nonfood exposure, or both
1257 	0.0003	0.2454	Computational toxicology of chloroform: Reverse dosimetry using Bayesian inference, Markov chain Monte Carlo simulation, and human biomonitoring data
1550 	0.0003	0.2454	Cadmium exposure and hypertension in the 19

Use the `constrain()` function to constrain the graph to the neighborhood of a specified node. This is based on a breadth-first search, with a `limit` parameter that caps the search depth, i.e. the maximum number of hops from the specified node.

In [8]:
# default selection: every node of the graph (this binds the same set object
# as `nxg_set`; it is not a copy)
SUBGRAPH = nxg_set

def constrain (limit, search_term):
    """ restrict `SUBGRAPH` to the neighborhood of the node(s) whose label
    equals `search_term`

    limit: depth limit for the breadth-first search that starts at each
           matching node
    search_term: label to look for; compared for exact equality

    When several nodes carry the same label, `SUBGRAPH` becomes the union of
    their neighborhoods. Matching nodes that are not part of the graph are
    skipped. When nothing matches, `SUBGRAPH` is left as it was.
    """
    global SUBGRAPH

    neighborhood = set()

    for node_id, label in LABELS.items():
        # labels exist for elements that never made it into the graph, and a
        # search cannot start from those
        if label != search_term or node_id not in nxg:
            continue

        neighborhood.add(node_id)

        for _, neighbor in nx.bfs_edges(nxg, source=node_id, depth_limit=limit):
            neighborhood.add(neighbor)

    if neighborhood:
        SUBGRAPH = neighborhood


# uncomment to narrow the selection to the neighborhood of one node, e.g.:
#constrain(limit=4, search_term="US Department of Agriculture")
# number of nodes currently selected
print(len(SUBGRAPH))

2268


Generate an interactive visualization…

In [11]:
from pyvis.network import Network

# interactive network view
g = Network(notebook=True, height="1000px", width="100%")
g.force_atlas_2based()

# Nodes are always added before the edges that reference them. The "rank"
# shown in each tooltip is the percentile fraction stored in `IDS_SCALE`.

# providers in use -> orange nodes
for p in providers.values():
    if "used" not in p:
        continue

    p_id = get_id(p["id"])

    if p_id not in SUBGRAPH:
        continue

    scale, impact = IDS_SCALE[p_id]
    title = "{}<br/>rank: {:.4f}<br/>{}".format(p["title"], impact, p["ror"])
    g.add_node(p_id, label=p["title"], title=title, color="orange", size=scale)

# datasets in use -> red nodes, linked to their provider
for d in datasets.values():
    if "used" not in d:
        continue

    d_id = get_id(d["id"])

    if d_id not in SUBGRAPH:
        continue

    p_id = get_id(d["provider"])
    scale, impact = IDS_SCALE[d_id]
    title = "{}<br/>rank: {:.4f}<br/>provider: {}".format(d["title"], impact, LABELS[p_id])
    g.add_node(d_id, label=d["title"], title=title, color="red", size=scale)

    if p_id in SUBGRAPH:
        g.add_edge(d_id, p_id, color="gray")

# journals in use -> green nodes
for j in journals.values():
    if "used" not in j:
        continue

    j_id = get_id(j["id"])

    if j_id not in SUBGRAPH:
        continue

    scale, impact = IDS_SCALE[j_id]
    title = "{}<br/>rank: {:.4f}<br/>{}".format(j["title"], impact, j["issn"])
    g.add_node(j_id, label=j["title"], title=title, color="green", size=scale)

# publications -> blue nodes, linked to their journal and cited datasets
for p in publications.values():
    p_id = get_id(p["id"])

    if p_id not in SUBGRAPH:
        continue

    scale, impact = IDS_SCALE[p_id]
    title = "{}<br/>rank: {:.4f}<br/>{}".format(p["title"], impact, p["doi"])
    g.add_node(p_id, label=p["title"], title=title, color="blue", size=scale)

    if p["journal"]:
        j_id = get_id(p["journal"])

        if j_id in SUBGRAPH:
            g.add_edge(p_id, j_id, color="gray")

    for d in p["datasets"]:
        d_id = get_id(d)

        if d_id in SUBGRAPH:
            g.add_edge(p_id, d_id, color="gray")

# show the configuration controls, then write and display the HTML page
g.show_buttons()
g.show("corpus.html")