# Rich Context: Knowledge Graph Visualization

This notebook loads the Rich Context knowledge graph from the `tmp.jsonld` JSON-LD file prepared by the `RCGraph` workflow.
It runs graph analytics on the KG using the `NetworkX` library, then creates an interactive visualization using the `PyVis` library.

The following installations are needed, if these libraries haven't already been installed:

In [29]:
!pip install pyvis
!pip install networkx
!pip install pandas
!pip install numpy



Load the KG from the `tmp.jsonld` file…

In [30]:
import json
import sys

# registry of element IDs: the position of a string ID within `IDS` is its
# numeric ID, and `LABELS` maps that numeric ID to the element's title
IDS = []
LABELS = {}

# private reverse index for `IDS` (string ID => numeric ID), kept in step by
# `parse_metadata()` so that `get_id()` does not need to scan the list
_ID_INDEX = {}

publications = {}
providers = {}
datasets = {}
journals = {}


def get_id (id):
    """ lookup the numeric ID for an element

    `id` is the string ID returned by `parse_metadata()`.
    Returns the integer position at which `id` was registered in `IDS`.
    Raises `ValueError` if `id` has not been registered.
    """
    if id not in _ID_INDEX:
        # same exception type that `list.index()` raises on a miss
        raise ValueError(f"{id!r} is not in IDS")

    return _ID_INDEX[id]


def parse_metadata (elem):
    """ parse the required metadata items from one element in the graph

    `elem` is a dict that has `"@id"`, `"@type"`, and `"dct:title"` members.
    The string ID is the part of `"@id"` that follows the first `#`.

    Registers the string ID in `IDS` the first time it is seen, and records
    the element's title in `LABELS` under its numeric ID. Parsing the same
    element again leaves `IDS` unchanged, so the loaders may each make their
    own pass over the corpus.

    Returns the tuple `(id, kind, title)`.
    """
    kind = elem["@type"]
    title = elem["dct:title"]["@value"]
    elem_id = elem["@id"].split("#")[1]

    if elem_id not in _ID_INDEX:
        # first sighting: the next free position becomes the numeric ID
        _ID_INDEX[elem_id] = len(IDS)
        IDS.append(elem_id)

    LABELS[get_id(elem_id)] = title

    return elem_id, kind, title


# input the corpus from the JSON-LD file

filename = "tmp.jsonld"

# JSON text is UTF-8, so decode it explicitly instead of relying on the
# platform's default encoding
with open(filename, "r", encoding="utf-8") as f:
    jld_corpus = json.load(f)

# the "@graph" member holds the collection of KG elements
corpus = jld_corpus["@graph"]

# report summary stats

print(f"{len(corpus)} corpus elements")

2521 corpus elements


In [31]:
# load the providers

for elem in corpus:
    id, kind, title = parse_metadata(elem)

    if kind != "Provider":
        continue

    # the identifier is optional; an empty string stands in when it is absent
    ror = elem["dct:identifier"]["@value"] if "dct:identifier" in elem else ""

    providers[id] = {
        "id": id,
        "title": title,
        "ror": ror,
    }

# load the datasets

for elem in corpus:
    id, kind, title = parse_metadata(elem)

    if kind != "Dataset":
        continue

    # the publisher is read unconditionally, so a dataset without one raises
    # KeyError; its value is the key used to find the provider in `providers`
    prov_id = elem["dct:publisher"]["@value"]

    datasets[id] = {
        "id": id,
        "title": title,
        "provider": prov_id,
    }

# load the journals

for elem in corpus:
    id, kind, title = parse_metadata(elem)

    if kind != "Journal":
        continue

    # the identifier is optional; an empty string stands in when it is absent
    issn = elem["dct:identifier"]["@value"] if "dct:identifier" in elem else ""

    journals[id] = {
        "id": id,
        "title": title,
        "issn": issn,
    }

# load the publications
#
# This pass also marks every dataset cited by a publication, the provider of
# each such dataset, and every journal that published a publication as "used",
# which is what the later cells test before adding those elements to the graph.

for elem in corpus:
    id, kind, title = parse_metadata(elem)

    if kind != "ResearchPublication":
        continue

    dat_list = []
    cited = elem["cito:citesAsDataSource"]

    # a lone citation arrives as a dict rather than a list of dicts
    if isinstance(cited, dict):
        cited = [cited]

    for d in cited:
        dat_id = d["@id"].split("#")[1]
        datasets[dat_id]["used"] = True
        dat_list.append(dat_id)

        prov_id = datasets[dat_id]["provider"]
        providers[prov_id]["used"] = True

    # the identifier is optional; an empty string stands in when it is absent
    doi = elem["dct:identifier"]["@value"] if "dct:identifier" in elem else ""

    if "dct:publisher" in elem:
        jour_id = elem["dct:publisher"]["@id"].split("#")[1]
        journals[jour_id]["used"] = True
    else:
        # no journal: reset explicitly, otherwise the value left over from the
        # previous publication would be stored for this one
        jour_id = None

    publications[id] = {
        "id": id,
        "title": title,
        "doi": doi,
        "journal": jour_id,
        "datasets": dat_list,
    }

# report summary stats

print(f"{len(publications)} publications")
print(f"{len(journals)} journals")
print(f"{len(providers)} providers")
print(f"{len(datasets)} datasets")

1491 publications
568 journals
287 providers
175 datasets


Calculate graph analytics…

In [32]:
import networkx as nx

# undirected graph whose nodes are the numeric IDs of the KG elements
nxg = nx.Graph()

# providers, datasets and journals only join the graph when flagged "used"
for prov in providers.values():
    if "used" not in prov:
        continue

    nxg.add_node(get_id(prov["id"]))

# each dataset is linked to its provider
for dat in datasets.values():
    if "used" not in dat:
        continue

    dat_node = get_id(dat["id"])
    nxg.add_node(dat_node)
    nxg.add_edge(dat_node, get_id(dat["provider"]))

for jour in journals.values():
    if "used" not in jour:
        continue

    nxg.add_node(get_id(jour["id"]))

# every publication joins the graph, linked to its journal (when it has one)
# and to each dataset it cites
for pub in publications.values():
    pub_node = get_id(pub["id"])
    nxg.add_node(pub_node)

    if pub["journal"]:
        nxg.add_edge(pub_node, get_id(pub["journal"]))

    for dat_key in pub["datasets"]:
        nxg.add_edge(pub_node, get_id(dat_key))

Run quantile analysis on the centrality results, to assess the relative impact of each element in the KG…

In [33]:
import numpy as np
import pandas as pd

def calc_quantiles (metrics, num_q):
    """ calculate `num_q` quantile boundaries for the given list

    `metrics` is a sequence of numeric values and `num_q` is the number of
    boundaries wanted.

    The boundaries sit at `num_q` evenly spaced quantiles, from 0.0 (the
    minimum) through 1.0 (the maximum) inclusive.

    Returns them as a list of `num_q` values, in ascending order.
    """
    bins = np.linspace(0, 1, num=num_q, endpoint=True)

    # "nearest" selects actual observations instead of interpolating
    # between neighboring ones
    q = pd.Series(metrics).quantile(bins, interpolation="nearest")

    # `Series.iteritems()` was removed in pandas 2.0, so take the values
    # directly; this works on old and new pandas alike
    return q.tolist()

In [34]:
from operator import itemgetter
from scipy.stats import percentileofscore

# score every node with PageRank; `result` maps node ID => score
# (alternative metric: nx.edge_betweenness_centrality)
result = nx.pagerank(nxg)
ranks = [score for score in result.values()]

# quantile boundaries over the scores; their count feeds the node-size scaling
quant = calc_quantiles(ranks, num_q=10)
num_quant = len(quant)

In [35]:
# the node IDs present in the graph, as plain integers
nxg_set = {int(node) for node in nxg.nodes}

# sanity check, walking from the highest rank to the lowest: report any
# ranked node that is missing from the graph or that has no label
ranked = sorted(result.items(), key=itemgetter(1), reverse=True)

for id, rank in ranked:
    if id not in nxg_set:
        print(f"{id} not in nxg_set")

    if id not in LABELS:
        print(f"{IDS[id]} not in LABELS")

In [36]:
# node ID => [display size, impact], where impact is the percentile of the
# node's rank among all ranks, expressed as a fraction
IDS_SCALE = {}

by_rank = sorted(result.items(), key=itemgetter(1), reverse=True)

for node, score in by_rank:
    pct = percentileofscore(ranks, score)
    fraction = pct / 100.0

    # spread the percentile over the quantile count, then offset and double
    # it to obtain the size used when drawing the node
    size = int(round(((pct / num_quant) + 5) * 2))

    IDS_SCALE[node] = [size, fraction]
    print("{:^5}\t{:.4f}\t{:.4f}\t{}".format(node, score, fraction, LABELS[node]))

 517 	0.0353	1.0000	National Health and Nutrition Examination Survey
2520 	0.0278	0.9996	Supplemental Nutrition Assistance Program
1118 	0.0232	0.9991	Women, Infants, and Children
 310 	0.0128	0.9987	National Longitudinal Study of Adolescent to Adult Health
2056 	0.0125	0.9983	Food Security Survey Module
 481 	0.0074	0.9979	SSRN Electronic Journal
 119 	0.0064	0.9974	PloS ONE
2242 	0.0064	0.9970	Public Health Nutr
1740 	0.0054	0.9966	Microdatabase Direct investment
1884 	0.0053	0.9961	Current Population Survey Food Security Supplement
 690 	0.0046	0.9957	IRI Infoscan
1820 	0.0044	0.9953	Monthly balance sheet statistics
 838 	0.0037	0.9948	Pediatrics
2442 	0.0032	0.9944	BMC Public Health
2366 	0.0030	0.9940	Survey of Doctorate Recipients
1498 	0.0029	0.9936	Frankfurt a. M.: Deutsche Bundesbank
1346 	0.0028	0.9931	Diabetes care
1471 	0.0026	0.9927	Three-City Study
1144 	0.0024	0.9923	Survey of Earned Doctorates
 623 	0.0024	0.9918	Maryland Unemployment Insurance
1017 	0.0024	0.9914	Quart

 142 	0.0003	0.6103	No More Credit Score': Employer Credit Check Bans and Signal Substitution
 106 	0.0003	0.6097	Massachusetts Department of Revenue Longitudinal Database
 224 	0.0003	0.6097	Massachusetts Department of Revenue Longitudinal Employer Filings
1586 	0.0003	0.6090	Changes in association between school meals and children's dietary quality during implementation of the Healthy, Hunger-Free Kids Act of 2010
  8  	0.0003	0.6086	Terms of endearment: An equilibrium model of sex and matching
 732 	0.0003	0.6079	Policy should change to improve invited speaker diversity and reflect trainee diversity
2460 	0.0003	0.6079	A data-based guide to the North American ecology faculty job market
2465 	0.0003	0.6073	The European Journal of Finance
1357 	0.0003	0.6067	Notes from the Field: Environmental Contamination from E-cigarette, Cigarette, Cigar, and Cannabis Products at 12 High Schools - San Francisco Bay Area, 2018-2019
1382 	0.0003	0.6067	Characteristics of School-Associated Youth Homi

1500 	0.0003	0.2180	SNAP-Ed (Supplemental Nutrition Assistance Program-Education) Increases Long-Term Food Security among Indiana Households with Children in a Randomized Controlled Study
1992 	0.0003	0.2180	Psychological Distress Mediates the Association between Food Insecurity and Suboptimal Sleep Quality in Latinos with Type 2 Diabetes Mellitus
1177 	0.0003	0.2167	Development and validation of an Arab family food security scale
 189 	0.0003	0.2163	Eur Econ Rev
 25  	0.0003	0.2139	Food insecurity and emotional health in the USA: a systematic narrative review of longitudinal research
 661 	0.0003	0.2139	Risk factors associated with the presence and severity of food insecurity in rural Honduras
 736 	0.0003	0.2139	Associations between socio-economic status and school-day dietary intake in a sample of grade 5-8 students in Vancouver, Canada
 861 	0.0003	0.2139	Food insecurity, overweight and obesity among low-income African-American families in Baltimore City: associations with food-rel

Use the `constrain()` function to constrain the graph to the neighborhood of a specified node. This is based on a breadth-first search, with a `limit` parameter to constrain the diameter of the neighborhood in the graph.

In [37]:
# until `constrain()` is called, the subgraph is the whole graph
SUBGRAPH = nxg_set


def constrain (limit, search_term):
    """ narrow `SUBGRAPH` to the neighborhood of the node labeled `search_term`

    Each node whose label equals `search_term` exactly replaces the global
    `SUBGRAPH` with a new set holding that node plus every node reached by a
    breadth-first search from it, at most `limit` edges deep.

    When several nodes share the label, the last one visited wins.
    When no label matches, `SUBGRAPH` is left untouched.
    """
    global SUBGRAPH

    for node_id, label in LABELS.items():
        if label != search_term:
            continue

        edges = nx.bfs_edges(nxg, source=node_id, depth_limit=limit)
        SUBGRAPH = {node_id}
        SUBGRAPH.update(neighbor for _, neighbor in edges)


#constrain(limit=4, search_term="NOAA")
print(len(SUBGRAPH))

2330


Generate an interactive visualization…

In [38]:
from pyvis.network import Network

g = Network(notebook=True, height="1000px", width="100%")
g.force_atlas_2based()

# Only elements inside SUBGRAPH are drawn, and an edge is drawn only when the
# node at its far end is inside SUBGRAPH too. Node size comes from IDS_SCALE;
# the "rank" shown in each hover text is the impact stored alongside it.

# providers => orange nodes
for prov in providers.values():
    if "used" not in prov:
        continue

    prov_node = get_id(prov["id"])

    if prov_node not in SUBGRAPH:
        continue

    size, impact = IDS_SCALE[prov_node]
    hover = "{}<br/>rank: {:.4f}<br/>{}".format(prov["title"], impact, prov["ror"])
    g.add_node(prov_node, label=prov["title"], title=hover, color="orange", size=size)

# datasets => red nodes, each linked to its provider
for dat in datasets.values():
    if "used" not in dat:
        continue

    dat_node = get_id(dat["id"])

    if dat_node not in SUBGRAPH:
        continue

    prov_node = get_id(dat["provider"])
    size, impact = IDS_SCALE[dat_node]
    hover = "{}<br/>rank: {:.4f}<br/>provider: {}".format(dat["title"], impact, LABELS[prov_node])
    g.add_node(dat_node, label=dat["title"], title=hover, color="red", size=size)

    if prov_node in SUBGRAPH:
        g.add_edge(dat_node, prov_node, color="gray")

# journals => green nodes
for jour in journals.values():
    if "used" not in jour:
        continue

    jour_node = get_id(jour["id"])

    if jour_node not in SUBGRAPH:
        continue

    size, impact = IDS_SCALE[jour_node]
    hover = "{}<br/>rank: {:.4f}<br/>{}".format(jour["title"], impact, jour["issn"])
    g.add_node(jour_node, label=jour["title"], title=hover, color="green", size=size)

# publications => blue nodes, linked to their journal and to cited datasets
for pub in publications.values():
    pub_node = get_id(pub["id"])

    if pub_node not in SUBGRAPH:
        continue

    size, impact = IDS_SCALE[pub_node]
    hover = "{}<br/>rank: {:.4f}<br/>{}".format(pub["title"], impact, pub["doi"])
    g.add_node(pub_node, label=pub["title"], title=hover, color="blue", size=size)

    if pub["journal"]:
        jour_node = get_id(pub["journal"])

        if jour_node in SUBGRAPH:
            g.add_edge(pub_node, jour_node, color="gray")

    for dat_key in pub["datasets"]:
        dat_node = get_id(dat_key)

        if dat_node in SUBGRAPH:
            g.add_edge(pub_node, dat_node, color="gray")

g.show_buttons()
g.show("corpus.html")

In [39]:
# `IPython.display` is the public home of `display` and `HTML`; importing them
# from `IPython.core.display` is deprecated
from IPython.display import display, HTML

html_frag = "<div><span style=\"color:{}; font-size:20px;\">&#x25CF;</span> <strong>{}</strong> {}</div>"

# count the elements of each kind whose numeric ID lies inside SUBGRAPH
num_prov, num_data, num_jour, num_pubs = [
    len(SUBGRAPH.intersection(get_id(key) for key in group))
    for group in (providers, datasets, journals, publications)
]

# one legend row per kind, in the same colors used for the nodes
legend = [
    ("orange", num_prov, "providers"),
    ("red", num_data, "datasets"),
    ("green", num_jour, "journals"),
    ("blue", num_pubs, "publications"),
]

frags = [html_frag.format(color, count, kind) for color, count, kind in legend]

display(HTML("".join(frags)))