# Rich Context: Knowledge Graph Visualization

This notebook loads the Rich Context knowledge graph from the `tmp.jsonld` JSON-LD file prepared by the `RCGraph` workflow.
It runs graph analytics on the KG using the `NetworkX` library, then creates an interactive visualization using the `PyVis` library.

The following installations are needed, if these libraries haven't already been installed:

In [1]:
!pip install pyvis
!pip install networkx
!pip install pandas
!pip install numpy



Load the KG from the `tmp.jsonld` file…

In [4]:
import json
import sys

# IDS[n] is the string id of the element whose numeric id is n;
# LABELS maps a numeric id to the element's title
IDS = []
LABELS = {}

# reverse index of IDS (string id -> numeric id), kept in sync by
# parse_metadata() so that get_id() does not have to scan the list
_ID_INDEX = {}

publications = {}
providers = {}
datasets = {}
journals = {}


def get_id (id):
    """ lookup the numeric ID for an element

    `id` is the string identifier returned by `parse_metadata()`.
    Returns its position in `IDS` as an int; raises `ValueError` when
    the identifier has never been registered.
    """
    try:
        return _ID_INDEX[id]
    except KeyError:
        # not registered through parse_metadata(); fall back to scanning
        # the list, which raises ValueError for an unknown id
        return int(IDS.index(id))


def parse_metadata (elem):
    """ parse the required metadata items from one element in the graph

    Reads the `@type`, `dct:title` and `@id` entries of `elem`, registers
    the id fragment (the text after `#`) in `IDS` and records the title in
    `LABELS` under the numeric id.

    The notebook calls this several times for the same element, so an
    identifier is only appended to `IDS` the first time it is seen; later
    calls reuse the existing numeric id and just refresh the label.

    Returns the tuple `(id, kind, title)`.
    """
    kind = elem["@type"]
    title = elem["dct:title"]["@value"]
    id = elem["@id"].split("#")[1]

    num_id = _ID_INDEX.get(id)

    if num_id is None:
        # first sighting: the new numeric id is the next free list position
        num_id = len(IDS)
        IDS.append(id)
        _ID_INDEX[id] = num_id

    LABELS[num_id] = title

    return id, kind, title


# input the corpus from the JSON-LD file

filename = "tmp.jsonld"

# decode explicitly as UTF-8 so that non-ASCII titles load the same way
# regardless of the platform's default locale encoding
with open(filename, "r", encoding="utf-8") as f:
    jld_corpus = json.load(f)

# the graph elements live under the top-level "@graph" key
corpus = jld_corpus["@graph"]

# report summary stats

print(f"{len(corpus)} corpus elements")

4893 corpus elements


In [5]:
# load the providers

for elem in corpus:
    id, kind, title = parse_metadata(elem)

    # only provider elements are collected in this pass
    if kind != "Provider":
        continue

    # the identifier entry is optional; an empty string stands in when absent
    ror = elem["dct:identifier"]["@value"] if "dct:identifier" in elem else ""

    view = dict(id=id, title=title, ror=ror)
    providers[id] = view

# load the datasets

for elem in corpus:
    id, kind, title = parse_metadata(elem)

    # only dataset elements are collected in this pass
    if kind != "Dataset":
        continue

    # the publisher value is stored as the key of the dataset's provider
    prov_id = elem["dct:publisher"]["@value"]

    view = dict(id=id, title=title, provider=prov_id)
    datasets[id] = view

# load the journals

for elem in corpus:
    id, kind, title = parse_metadata(elem)

    # only journal elements are collected in this pass
    if kind != "Journal":
        continue

    # the identifier entry is optional; an empty string stands in when absent
    issn = elem["dct:identifier"]["@value"] if "dct:identifier" in elem else ""

    view = dict(id=id, title=title, issn=issn)
    journals[id] = view

# load the publications
#
# This pass must run after the providers, datasets and journals have been
# loaded: it marks the entries each publication refers to as "used".

for elem in corpus:
    id, kind, title = parse_metadata(elem)

    if kind == "ResearchPublication":
        dat_list = []
        c = elem["cito:citesAsDataSource"]

        # a lone dict is wrapped so the loop below handles both shapes uniformly
        if isinstance(c, dict):
            c = [c]

        for d in c:
            dat_id = d["@id"].split("#")[1]
            datasets[dat_id]["used"] = True
            dat_list.append(dat_id)

            # a cited dataset also makes its provider part of the graph
            prov_id = datasets[dat_id]["provider"]
            providers[prov_id]["used"] = True

        if "dct:identifier" in elem:
            doi = elem["dct:identifier"]["@value"]
        else:
            doi = ""

        if "dct:publisher" in elem:
            jour_id = elem["dct:publisher"]["@id"].split("#")[1]
            journals[jour_id]["used"] = True
        else:
            # reset explicitly: otherwise the journal of the previously
            # processed publication would leak into this one
            jour_id = None

        view = {
            "id": id,
            "title": title,
            "doi": doi,
            "journal": jour_id,
            "datasets": dat_list
        }

        publications[id] = view

# report summary stats: one line per entity table, giving its size

print("\n".join([
    f"{len(publications)} publications",
    f"{len(journals)} journals",
    f"{len(providers)} providers",
    f"{len(datasets)} datasets",
]))

3238 publications
1009 journals
287 providers
359 datasets


Calculate graph analytics…

In [6]:
import networkx as nx

nxg = nx.Graph()

# providers that back at least one cited dataset
nxg.add_nodes_from(
    get_id(p["id"]) for p in providers.values() if "used" in p
)

# cited datasets, each linked to its provider
for d in datasets.values():
    if "used" not in d:
        continue

    dataset_node = get_id(d["id"])
    nxg.add_node(dataset_node)
    nxg.add_edge(dataset_node, get_id(d["provider"]))

# journals that published at least one publication
nxg.add_nodes_from(
    get_id(j["id"]) for j in journals.values() if "used" in j
)

# every publication, linked to its journal (if any) and to its datasets
for p in publications.values():
    pub_node = get_id(p["id"])
    nxg.add_node(pub_node)

    if p["journal"]:
        nxg.add_edge(pub_node, get_id(p["journal"]))

    for d in p["datasets"]:
        nxg.add_edge(pub_node, get_id(d))
    
#graph.add_edge(node0, node1, weight=self.edge_weight)
#graph.edge_betweenness_centrality

Run quantile analysis on the centrality results, to assess the relative impact of each element in the KG…

In [7]:
import numpy as np
import pandas as pd

def calc_quantiles (metrics, num_q):
    """ calculate `num_q` quantile boundaries for the given list

    The boundaries are taken at `num_q` evenly spaced probabilities from
    0 to 1 inclusive, using "nearest" interpolation.

    metrics: sequence of numeric scores
    num_q: number of boundaries to compute

    Returns a list of `num_q` boundary values in ascending probability
    order. Raises `ValueError` when the boundaries cannot be used to bin
    `metrics`.
    """
    bins = np.linspace(0, 1, num=num_q, endpoint=True)
    q = pd.Series(metrics).quantile(bins, interpolation="nearest")

    try:
        # validation only: the bin indices themselves are not needed
        np.digitize(metrics, q)
    except ValueError as e:
        # raise rather than call sys.exit(), so the notebook kernel gets an
        # ordinary traceback
        raise ValueError(f"cannot bin the metrics into {num_q} quantiles: {e}") from e

    # Series.items() replaces iteritems(), which no longer exists in pandas 2.x
    return [q_hi for _, q_hi in q.items()]

In [8]:
from operator import itemgetter
from scipy.stats import percentileofscore

# score every node in the graph; `result` maps node id -> PageRank score
result = nx.pagerank(nxg)
#result = nx.edge_betweenness_centrality(nxg)
ranks = [*result.values()]

# quantile boundaries of the scores, and how many boundaries there are
quant = calc_quantiles(ranks, num_q=10)
num_quant = len(quant)

In [9]:
# the numeric ids of all nodes present in the graph
nxg_set = {int(n) for n in nxg.nodes}

# sanity check: every ranked id should be a graph node and have a label
for id, rank in sorted(result.items(), key=itemgetter(1), reverse=True):
    if not (id in nxg_set):
        print(f"{id} not in nxg_set")

    if not (id in LABELS):
        print(f"{IDS[id]} not in LABELS")

In [10]:
# numeric id -> [display size, impact as a fraction]
IDS_SCALE = {}

# walk the nodes from the highest to the lowest score
for id, rank in sorted(result.items(), key=itemgetter(1), reverse=True):
    impact = percentileofscore(ranks, rank)
    scale = 2 * (impact / num_quant + 5)
    fraction = impact / 100.0

    IDS_SCALE[id] = [int(round(scale)), fraction]
    print("{:^5}\t{:.4f}\t{:.4f}\t{}".format(id, rank, fraction, LABELS[id]))

 587 	0.0292	1.0000	Women, Infants, and Children
2908 	0.0266	0.9998	Supplemental Nutrition Assistance Program
4883 	0.0251	0.9996	unknown
2701 	0.0182	0.9994	National Health and Nutrition Examination Survey
1730 	0.0089	0.9992	Food Security Survey Module
2382 	0.0086	0.9990	Current Population Survey Food Security Supplement
1357 	0.0061	0.9987	National Longitudinal Study of Adolescent to Adult Health
1973 	0.0054	0.9985	IRI Infoscan
 782 	0.0053	0.9983	SSRN Electronic Journal
1611 	0.0052	0.9981	J Nutr Educ Behav
2657 	0.0048	0.9979	Microdatabase Direct investment
3982 	0.0036	0.9977	Survey of Earned Doctorates
3034 	0.0031	0.9975	Public Health Nutr
2938 	0.0030	0.9973	PloS ONE
3929 	0.0030	0.9971	Monthly balance sheet statistics
 988 	0.0027	0.9969	J Acad Nutr Diet
 805 	0.0027	0.9967	Survey of Doctorate Recipients
3149 	0.0021	0.9964	Higher Education Research and Development Survey
 358 	0.0021	0.9962	Pediatrics
4130 	0.0020	0.9960	J Am Diet Assoc
4104 	0.0019	0.9958	FoodAPS Nationa

 737 	0.0002	0.8627	Gestational age, kindergartenlevel literacy, and effect modification by maternal socioeconomic and demographic factors
1100 	0.0002	0.8627	Life Is a Stage: Autistic Perspectives on Neurotypicality
1136 	0.0002	0.8627	Rethinking the School Closure Research: School Closure as Spatial Injustice
1921 	0.0002	0.8627	Social Validity and Teachers' Use of Evidence-Based Practices for Autism
2719 	0.0002	0.8627	Young Adolescents' Digital Technology Use and Mental Health Symptoms: Little Evidence of Longitudinal or Daily Linkages
3176 	0.0002	0.8627	A case example of one state's efforts to measure and address the critical shortage of school psychologists
3938 	0.0002	0.8627	Schooling 'Truant' Tribes: British Colonial Compulsions and Educational Evolution in Chhotanagpur, 1870-1930
4195 	0.0002	0.8627	Understanding literacy adoption policies across contexts: a multi-state examination of literacy curriculum decision-making
4296 	0.0002	0.8627	Child Maltreatment Knowledge and Re

4415 	0.0002	0.7442	Modeling safe infant sleep in the hospital
4444 	0.0002	0.7442	Knowledge and practices of Ohio nurse practitioners regarding food access of patients
4579 	0.0002	0.7442	Going outside the neighborhood: The shopping patterns and adaptations of disadvantaged consumers living in the lower eastside neighborhoods of Detroit, Michigan
4649 	0.0002	0.7442	Women from racial or ethnic minority and low socioeconomic backgrounds receive more prenatal education: Results from the 2012 to 2014 Pregnancy Risk Assessment Monitoring System
4658 	0.0002	0.7442	Food safety education attitude and practice among health professionals in China, Peru, and the U.S.
4660 	0.0002	0.7442	Dietary standards and future developments
4701 	0.0002	0.7442	Dietary Assimilation among Mexican Children in Immigrant Households: Code-switching and Healthy Eating across Social Institutions
4720 	0.0002	0.7442	14 Case Study of a National Supplementary Feeding Program: The WIC Program in the United States**Wit

1861 	0.0001	0.5873	Chapter Ten. The Growing Trend of Farmers' Markets in the United States (6-10)
2469 	0.0001	0.5873	Impact of Participation in Home-Delivered Meals on Nutrient Intake, Dietary Patterns, and Food Insecurity of Older Persons in New York State
2492 	0.0001	0.5873	Food Insecurity, Food and Nutrition Programs, and Aging: Experiences from Georgia
2584 	0.0001	0.5873	Are Our Babies Hungry? Food Insecurity Among Infants in Urban Clinics
2777 	0.0001	0.5873	Immunizations
3029 	0.0001	0.5873	Exploring Digital Ecosystems, Organizational and Human Challenges
3218 	0.0001	0.5873	Prenatal Nutrition: A Practical Guide for Assessment and Counseling
3415 	0.0001	0.5873	Risk Factors for Low Birth Weight in New York State Counties
3516 	0.0001	0.5873	Ecological Approaches to Creating Healthy Local Food Environments in the United States: Push and Pull Forces
3627 	0.0001	0.5873	Special Supplemental Nutrition Program for Women, Infants and Children (WIC): non-discretionary funding provis

3511 	0.0001	0.4308	Investment dispersion and the business cycle
1029 	0.0001	0.4295	Widow(er) Poverty and Out-of-Pocket Medical Expenditures Near the End of Life
1033 	0.0001	0.4295	Worker Adaptation and Employer Accommodation Following the Onset of a Health Impairment
1159 	0.0001	0.4295	Parent care and the stress process: Findings from panel data
1525 	0.0001	0.4295	Baseline health, socioeconomic status, and 10-year mortality among older middle-aged Americans: Findings from the Health and Retirement Study, 1992-2002
2181 	0.0001	0.4295	The Significance of Nonmarital Cohabitation: Marital Status and Mental Health Benefits among Middle-Aged and Older Adults
2581 	0.0001	0.4295	Honeymoons and Joint Lunches: Effects of Retirement and Spouse's Employment on Depressive Symptoms
3157 	0.0001	0.4295	Trends in Scores on Tests of Cognitive Ability in the Elderly U.S. Population, 1993-2000
3341 	0.0001	0.4295	Disparities Among Older Adults in Measures of Cognitive Function by Race or Ethnicity

4719 	0.0001	0.2893	Food cost is the least of my worries: a qualitative study exploring food and beverage purchasing decisions among parents enrolled in the WIC program
 374 	0.0001	0.2876	Examining Food Store Scanner Data: A Comparison of the IRI InfoSCan Data with Other Data Sets, 2008-2012
 572 	0.0001	0.2876	How Much Does It Matter How Sick You Get? Consumers' Responses to Foodborne Disease Outbreaks of Different Severities
1123 	0.0001	0.2876	America's Eating Habits: Food Away From Home
1348 	0.0001	0.2876	Interstate Variation in WIC Food Package Costs: The Role of Food Prices, Caseload Composition, and Cost-Containment Practices
1489 	0.0001	0.2876	Infant Formula Prices and Availability: Final Report to Congress
1953 	0.0001	0.2876	Measuring the Impacts of Off-Season Berry Imports
2618 	0.0001	0.2876	The Cost of Satisfying Fruit and Vegetable Recommendations in the Dietary Guidelines
3694 	0.0001	0.2876	Infant Formula Prices and Availability: An Interim Report to Congress
4324 	0

3640 	0.0001	0.1091	Int J Offender Ther Comp Criminol
3728 	0.0001	0.1091	J Crim Justice
 272 	0.0001	0.1077	Abstracts in Anthropology
1021 	0.0001	0.1077	Wohlfahrtsstaat — Transformation und Perspektiven
1821 	0.0001	0.1077	Eire Irel
2065 	0.0001	0.1077	Contemp Drug Probl
2930 	0.0001	0.1077	Routledge
3021 	0.0001	0.1077	J Econ Asymmetries
3799 	0.0001	0.1077	Internalizing Globalization
4509 	0.0001	0.1077	The Countryside in the Age of the Modern State
4653 	0.0001	0.1077	International Political Economy Series
 718 	0.0001	0.1060	Lancet Public Health
1122 	0.0001	0.1060	Homicide Stud
1346 	0.0001	0.1060	Rev Econ Dyn
1990 	0.0001	0.1060	2019 Spring Simulation Conference (SpringSim)
2089 	0.0001	0.1060	World J Urol
2287 	0.0001	0.1060	J Crime Justice
2614 	0.0001	0.1060	Soc Leg Stud
3131 	0.0001	0.1060	Int Crim Justice Rev
1700 	0.0001	0.1050	Boston Police Department
1059 	0.0001	0.1048	Washington State Department of Corrections
 37  	0.0001	0.1032	Nat. Biotechnol.
 138 	0.0001	0.1032	S

Use the `constrain()` function to constrain the graph to the neighborhood of a specified node. This is based on a breadth-first search, with a `limit` parameter to constrain the diameter of the neighborhood in the graph.

In [11]:
# by default the whole graph is shown
SUBGRAPH = nxg_set

def constrain (limit, search_term):
    """ restrict `SUBGRAPH` to the neighborhood of the nodes labeled `search_term`

    limit: maximum depth of the breadth-first search from each matching node
    search_term: label to look for; it must match a label in `LABELS` exactly

    Rebinds the global `SUBGRAPH` to the set of matching nodes plus every
    node reached from them within `limit` steps. When several nodes share
    the label, the union of their neighborhoods is used. `SUBGRAPH` is left
    unchanged when no node in the graph carries the label.
    """
    global SUBGRAPH

    neighborhood = set()

    for node_id, label in LABELS.items():
        # LABELS covers every corpus element, but only the "used" ones were
        # added to the graph; searching from a missing node would fail
        if label != search_term or node_id not in nxg:
            continue

        neighborhood.add(node_id)

        for _, neighbor in nx.bfs_edges(nxg, source=node_id, depth_limit=limit):
            neighborhood.add(neighbor)

    if neighborhood:
        SUBGRAPH = neighborhood


#constrain(limit=4, search_term="NOAA")
print(len(SUBGRAPH))

4780


Generate an interactive visualization…

In [12]:
from pyvis.network import Network

g = Network(notebook=True, height="1000px", width="100%")
g.force_atlas_2based()


def _draw_node (node_id, label, tooltip, detail, color):
    """ add one node to the visualization

    The node size and the rank shown in the tooltip both come from
    `IDS_SCALE`; `tooltip` is a format string taking the label, the rank
    and `detail`, in that order.
    """
    scale, impact = IDS_SCALE[node_id]
    g.add_node(node_id, label=label, title=tooltip.format(label, impact, detail), color=color, size=scale)


# providers (orange)
for p in providers.values():
    if "used" not in p:
        continue

    p_id = get_id(p["id"])

    if p_id in SUBGRAPH:
        _draw_node(p_id, p["title"], "{}<br/>rank: {:.4f}<br/>{}", p["ror"], "orange")

# datasets (red), each linked to its provider
for d in datasets.values():
    if "used" not in d:
        continue

    d_id = get_id(d["id"])

    if d_id not in SUBGRAPH:
        continue

    p_id = get_id(d["provider"])
    _draw_node(d_id, d["title"], "{}<br/>rank: {:.4f}<br/>provider: {}", LABELS[p_id], "red")

    if p_id in SUBGRAPH:
        g.add_edge(d_id, p_id, color="gray")

# journals (green)
for j in journals.values():
    if "used" not in j:
        continue

    j_id = get_id(j["id"])

    if j_id in SUBGRAPH:
        _draw_node(j_id, j["title"], "{}<br/>rank: {:.4f}<br/>{}", j["issn"], "green")

# publications (blue), linked to their journal and to the datasets they cite
for p in publications.values():
    p_id = get_id(p["id"])

    if p_id not in SUBGRAPH:
        continue

    _draw_node(p_id, p["title"], "{}<br/>rank: {:.4f}<br/>{}", p["doi"], "blue")

    if p["journal"]:
        j_id = get_id(p["journal"])

        if j_id in SUBGRAPH:
            g.add_edge(p_id, j_id, color="gray")

    for d in p["datasets"]:
        d_id = get_id(d)

        if d_id in SUBGRAPH:
            g.add_edge(p_id, d_id, color="gray")

g.show_buttons()
g.show("corpus.html")