# Rich Context: Knowledge Graph Visualization

This notebook loads the Rich Context knowledge graph from the `tmp.jsonld` JSON-LD file prepared by the `RCGraph` workflow.
It runs graph analytics on the KG using the `NetworkX` library, then creates an interactive visualization using the `PyVis` library.

The following installations are needed, if these libraries haven't already been installed:

In [1]:
!pip install pyvis
!pip install networkx
!pip install pandas
!pip install numpy



Load the KG from the `tmp.jsonld` file…

In [41]:
import json
import sys

# element identifiers in registration order; get_id() uses the list position
# of an identifier as its numeric ID
IDS = []
# numeric ID -> title, filled in by parse_metadata()
LABELS = {}

# per-type lookup tables of "view" dicts, keyed by element identifier
publications = {}
providers = {}
datasets = {}
journals = {}

def get_id (id):
    """ translate an element identifier into its numeric ID, i.e. the
    position of its first occurrence in `IDS`; an identifier which has
    not been registered raises `ValueError`
    """
    position = IDS.index(id)
    return int(position)


def parse_metadata (elem):
    """ parse the required metadata items from one element in the graph

    The element's identifier (the fragment after "#" in its "@id") is
    registered in `IDS` the first time it is seen, and its title is stored
    in `LABELS` under the resulting numeric ID. Parsing the same element
    again reuses the existing numeric ID instead of appending a duplicate,
    so `IDS` does not grow when the corpus is scanned several times.

    elem: one dict from the "@graph" list; must provide "@id", "@type"
        and "dct:title"/"@value"
    returns: the tuple (id, kind, title)
    raises: KeyError if a required key is absent, IndexError if "@id"
        contains no "#"
    """
    kind = elem["@type"]
    title = elem["dct:title"]["@value"]
    elem_id = elem["@id"].split("#")[1]

    try:
        # already registered: keep the numeric ID assigned on first sight
        num_id = IDS.index(elem_id)
    except ValueError:
        # first sight: the next free list position becomes the numeric ID
        num_id = len(IDS)
        IDS.append(elem_id)

    LABELS[num_id] = title

    return elem_id, kind, title


# input the corpus from the JSON-LD file

filename = "tmp.jsonld"

# decode explicitly as UTF-8 rather than relying on the platform's locale
# default, which differs between systems
with open(filename, "r", encoding="utf-8") as f:
    jld_corpus = json.load(f)

# the list of graph elements is stored under the top-level "@graph" key
corpus = jld_corpus["@graph"]

# report summary stats

print(f"{len(corpus)} corpus elements")

3386 corpus elements


In [42]:
# load the providers

for elem in corpus:
    id, kind, title = parse_metadata(elem)

    # every element is parsed, but only providers are kept in this pass
    if kind != "Provider":
        continue

    providers[id] = {"id": id, "title": title}

# load the datasets

for elem in corpus:
    id, kind, title = parse_metadata(elem)

    # every element is parsed, but only datasets are kept in this pass
    if kind != "Dataset":
        continue

    # the publisher value of a dataset is the identifier of its provider
    prov_id = elem["dct:publisher"]["@value"]

    datasets[id] = {"id": id, "title": title, "provider": prov_id}

# load the journals

for elem in corpus:
    id, kind, title = parse_metadata(elem)

    # every element is parsed, but only journals are kept in this pass
    if kind != "Journal":
        continue

    # the ISSN is optional; journals without one get an empty string
    issn = elem["dct:identifier"]["@value"] if "dct:identifier" in elem else ""

    journals[id] = {"id": id, "title": title, "issn": issn}

# load the publications

for elem in corpus:
    id, kind, title = parse_metadata(elem)

    if kind != "ResearchPublication":
        continue

    # a single dataset link arrives as a dict, several links as a list
    c = elem["cito:citesAsDataSource"]

    if isinstance(c, dict):
        c = [c]

    dat_list = []

    for d in c:
        dat_id = d["@id"].split("#")[1]
        dat_list.append(dat_id)

        # flag the dataset and its provider as referenced, so that only
        # these are drawn in the graph later on
        datasets[dat_id]["used"] = True
        prov_id = datasets[dat_id]["provider"]
        providers[prov_id]["used"] = True

    # reset for each publication, so that a missing identifier can never
    # inherit the DOI of the previous publication
    doi = ""

    if "dct:identifier" in elem:
        doi = elem["dct:identifier"]["@value"]

        # only identifiers starting with "10." are kept as a DOI
        if not doi.startswith("10."):
            doi = ""

    # reset for each publication, so that a missing publisher yields no
    # journal instead of the journal of the previous publication
    jour_id = None

    if "dct:publisher" in elem:
        jour_id = elem["dct:publisher"]["@id"].split("#")[1]
        journals[jour_id]["used"] = True

    view = {
        "id": id,
        "title": title,
        "doi": doi,
        "journal": jour_id,
        "datasets": dat_list
    }

    publications[id] = view

# report summary stats: one line per element type

print(
    f"{len(publications)} publications",
    f"{len(journals)} journals",
    f"{len(providers)} providers",
    f"{len(datasets)} datasets",
    sep="\n",
)

1473 publications
1027 journals
274 providers
612 datasets


Calculate graph analytics…

In [49]:
import networkx as nx

# undirected graph whose nodes are the numeric IDs of the corpus elements
nxg = nx.Graph()

# providers flagged as "used"
for prov in providers.values():
    if "used" not in prov:
        continue

    nxg.add_node(get_id(prov["id"]))

# datasets flagged as "used", each linked to its provider
for dat in datasets.values():
    if "used" not in dat:
        continue

    dat_node = get_id(dat["id"])
    nxg.add_node(dat_node)
    nxg.add_edge(dat_node, get_id(dat["provider"]))

# journals flagged as "used"
for jour in journals.values():
    if "used" not in jour:
        continue

    nxg.add_node(get_id(jour["id"]))

# every publication, linked to its journal (when it has one) and to each
# dataset it references
for pub in publications.values():
    pub_node = get_id(pub["id"])
    nxg.add_node(pub_node)

    if pub["journal"]:
        nxg.add_edge(pub_node, get_id(pub["journal"]))

    for dat_id in pub["datasets"]:
        nxg.add_edge(pub_node, get_id(dat_id))

Run quantile analysis on the centrality results, to assess the relative impact of each element in the KG…

In [50]:
import numpy as np
import pandas as pd

def calc_quantiles (metrics, num_q):
    """ calculate `num_q` quantiles for the given list

    metrics: list of numeric scores
    num_q: number of quantile points, spaced evenly from 0 to 1 inclusive
    returns: list holding one quantile value per point, computed with
        "nearest" interpolation
    """
    bins = np.linspace(0, 1, num=num_q, endpoint=True)
    s = pd.Series(metrics)
    q = s.quantile(bins, interpolation="nearest")

    # sanity check only: the bucket indexes themselves are not needed, but
    # `np.digitize` rejects bin edges it cannot use by raising ValueError
    try:
        np.digitize(metrics, q)
    except ValueError as e:
        print("ValueError:", str(e), metrics, s, q, bins)
        sys.exit(-1)

    # `Series.items()` replaces `Series.iteritems()`, which pandas 2.0 removed
    return [q_hi for _, q_hi in q.items()]

In [51]:
from operator import itemgetter
from scipy.stats import percentileofscore

# PageRank score for each node of the graph, keyed by numeric node ID
# (alternative metric: nx.edge_betweenness_centrality(nxg))
result = nx.pagerank(nxg)
ranks = [score for score in result.values()]

# quantile points over the scores; their count is used later on when
# scaling the node sizes
quant = calc_quantiles(ranks, num_q=10)
num_quant = len(quant)

In [52]:
# integer keys of all nodes currently in the graph
nxg_set = {int(node) for node in nxg.nodes}

# walk the ranked nodes from highest to lowest score and report any node
# which is missing from the graph or has no label
for node_id, _score in sorted(result.items(), key=itemgetter(1), reverse=True):
    if node_id not in nxg_set:
        print(f"{node_id} not in nxg_set")
    if node_id not in LABELS:
        print(f"{IDS[node_id]} not in LABELS")

In [53]:
# numeric node ID -> [node size, impact as a fraction between 0 and 1]
IDS_SCALE = {}

ranked = sorted(result.items(), key=itemgetter(1), reverse=True)

for node_id, score in ranked:
    # percentile (0..100) of this score among all scores
    percentile = percentileofscore(ranks, score)
    fraction = percentile / 100.0

    # node size derived from the percentile, rounded to an integer
    size = int(round(((percentile / num_quant) + 5) * 2))

    IDS_SCALE[node_id] = [size, fraction]
    print("{:^5}\t{:.4f}\t{:.4f}\t{}".format(node_id, score, fraction, LABELS[node_id]))

2106 	0.0368	1.0000	National Health and Nutrition Examination Survey
1845 	0.0257	0.9996	Supplemental Nutrition Assistance Program
 821 	0.0234	0.9991	Women, Infants, and Children
 615 	0.0141	0.9987	Food Security Survey Module
2968 	0.0131	0.9982	National Longitudinal Study of Adolescent to Adult Health
1563 	0.0072	0.9978	SSRN Electronic Journal
2954 	0.0067	0.9974	Public Health Nutr
1358 	0.0067	0.9969	PloS ONE
 395 	0.0062	0.9965	Current Population Survey Food Security Supplement
1474 	0.0055	0.9960	Microdatabase Direct investment
2900 	0.0050	0.9956	IRI Infoscan
3172 	0.0045	0.9952	Monthly balance sheet statistics
 28  	0.0039	0.9947	Pediatrics
2916 	0.0032	0.9943	BMC Public Health
2837 	0.0030	0.9938	Frankfurt a. M.: Deutsche Bundesbank
 156 	0.0030	0.9934	Diabetes care
3326 	0.0027	0.9929	Higher Education Research and Development Survey
 25  	0.0027	0.9925	Survey of Earned Doctorates
 962 	0.0027	0.9921	Three-City Study
1157 	0.0025	0.9916	Maryland Unemployment Insurance
2597 	0

1270 	0.0003	0.6166	Not in my backyard? Not so fast. The effect of marijuana legalization on neighborhood crime
 550 	0.0003	0.6159	You can't always get what you want: The impact of the UK Jobseeker's Allowance
1648 	0.0003	0.6159	Monitoring job search effort: An evaluation based on a regression discontinuity design
2465 	0.0003	0.6152	Terms of endearment: An equilibrium model of sex and matching
1623 	0.0003	0.6148	Divided we reform? Evidence from US welfare policies
1301 	0.0003	0.6144	BAN THE BOX, CONVICTIONS, AND PUBLIC EMPLOYMENT
3146 	0.0003	0.6139	Making parents pay: The unintended consequences of charging parents for foster care
1635 	0.0003	0.6135	Changes in association between school meals and children's dietary quality during implementation of the Healthy, Hunger-Free Kids Act of 2010
3026 	0.0003	0.6130	The European Journal of Finance
1072 	0.0003	0.6126	The bachelor's to Ph.D. STEM pipeline no longer leaks more women than men: a 30-year analysis
1699 	0.0003	0.6122	RSF: Th

 99  	0.0003	0.2539	Sharing data for public health research by members of an international online diabetes social network
 207 	0.0003	0.2539	Direct measurement of perchlorate exposure biomarkers in a highly exposed population: a pilot study
 302 	0.0003	0.2539	Differences in self-reported health in the Osteoarthritis Initiative (OAI) and Third National Health and Nutrition Examination Survey (NHANES-III)
 404 	0.0003	0.2539	Milk intake and total dairy consumption: Associations with early menarche in NHANES 1999-2004
 657 	0.0003	0.2539	Variation in LPA is associated with Lp(a) levels in three populations from the Third National Health and Nutrition Examination Survey
1000 	0.0003	0.2539	Evidence for a peak shift in a humoral response to helminths: Age profiles of IgE in the Shuar of Ecuador, the Tsimane of Bolivia, and the U.S. NHANES
1162 	0.0003	0.2539	Hypertension in women of reproductive age in the United States: NHANES 1999-2008
1245 	0.0003	0.2539	Association of urinary bispheno

Generate an interactive visualization…

In [57]:
from pyvis.network import Network

# PyVis network rendered inside the notebook, using the library's
# `force_atlas_2based` layout settings
g = Network(notebook=True, height="1000px", width="100%")
g.force_atlas_2based()

# providers flagged as "used" are drawn in orange
for prov in providers.values():
    if "used" not in prov:
        continue

    node = get_id(prov["id"])
    size, pct = IDS_SCALE[node]
    tooltip = "{}<br/>rank: {:.4f}".format(prov["title"], pct)

    g.add_node(node, label=prov["title"], title=tooltip, color="orange", size=size)

# datasets flagged as "used" are drawn in red and linked to their provider
for dat in datasets.values():
    if "used" not in dat:
        continue

    node = get_id(dat["id"])
    size, pct = IDS_SCALE[node]
    prov_label = LABELS[get_id(dat["provider"])]
    tooltip = "{}<br/>provider: {}<br/>rank: {:.4f}".format(dat["title"], prov_label, pct)

    g.add_node(node, label=dat["title"], title=tooltip, color="red", size=size)
    g.add_edge(get_id(dat["id"]), get_id(dat["provider"]), color="gray")

# journals flagged as "used" are drawn in green
for jour in journals.values():
    if "used" not in jour:
        continue

    node = get_id(jour["id"])
    size, pct = IDS_SCALE[node]
    tooltip = "{}<br/>issn: {}<br/>rank: {:.4f}".format(jour["title"], jour["issn"], pct)

    g.add_node(node, label=jour["title"], title=tooltip, color="green", size=size)

# every publication is drawn in blue, linked to its journal (when it has
# one) and to each dataset it references
for pub in publications.values():
    node = get_id(pub["id"])
    size, pct = IDS_SCALE[node]
    tooltip = "{}<br/>doi: {}<br/>rank: {:.4f}".format(pub["title"], pub["doi"], pct)

    g.add_node(node, label=pub["title"], title=tooltip, color="blue", size=size)

    if pub["journal"]:
        g.add_edge(get_id(pub["id"]), get_id(pub["journal"]), color="gray")

    for dat_id in pub["datasets"]:
        g.add_edge(get_id(pub["id"]), get_id(dat_id), color="gray")

# show the configuration buttons, then write and display the HTML page
g.show_buttons()
g.show("corpus.html")