# Rich Context: Knowledge Graph Visualization

This notebook loads the Rich Context knowledge graph from the `tmp.jsonld` JSON-LD file prepared by the `RCGraph` workflow.
It runs graph analytics on the KG using the `NetworkX` library, then creates an interactive visualization using the `PyVis` library.

The following installations are needed, if these libraries haven't already been installed:

In [None]:
!pip install pyvis
!pip install networkx
!pip install pandas
!pip install numpy

Load the KG from the `tmp.jsonld` file…

In [1]:
import json
import sys

# registry of element ids: the position of an id in this list serves as
# its numeric node id (see `get_id`)
IDS = list()

# numeric node id -> title of the element
LABELS = dict()

# one lookup table per element kind, each keyed by the element id
publications = dict()
providers = dict()
datasets = dict()
journals = dict()
authors = dict()


def get_id (id):
    """ translate an element id into its numeric ID, i.e. the position
    of its first occurrence in `IDS`; raises `ValueError` if unknown
    """
    position = IDS.index(id)
    return int(position)


def parse_metadata (elem):
    """ parse the required metadata items from one element in the graph

    Registers the element in `IDS` (once, however often it is parsed) and
    records its title in `LABELS` under its numeric ID.

    elem: one entry of the JSON-LD `@graph` list; must provide `@type`,
        `@id` (with a `#fragment`) and `dct:title` / `@value`
    returns: the tuple `(id, kind, title)`
    """
    global IDS, LABELS

    kind = elem["@type"]
    title = elem["dct:title"]["@value"]

    # the element id is the fragment after the "#" of the `@id` IRI
    id = elem["@id"].split("#")[1]

    # the corpus is scanned several times, so the same element gets parsed
    # repeatedly; only register an id the first time, otherwise `IDS` fills
    # up with duplicates
    try:
        numeric_id = IDS.index(id)
    except ValueError:
        numeric_id = len(IDS)
        IDS.append(id)

    LABELS[numeric_id] = title

    return id, kind, title


# input the corpus from the JSON-LD file

filename = "tmp.jsonld"

# JSON text is exchanged as UTF-8; state the encoding explicitly so that
# non-ASCII titles and author names decode the same way on every platform
# instead of depending on the locale's default encoding
with open(filename, "r", encoding="utf-8") as f:
    jld_corpus = json.load(f)

# the elements of the knowledge graph live under the `@graph` key
corpus = jld_corpus["@graph"]

# report summary stats

print(f"{len(corpus)} corpus elements")

6712 corpus elements


In [2]:
# load the providers

for elem in corpus:
    # every element is registered through `parse_metadata`, whatever its kind
    id, kind, title = parse_metadata(elem)

    if kind != "Provider":
        continue

    # `dct:identifier` is optional; fall back to an empty string
    ror = elem["dct:identifier"]["@value"] if "dct:identifier" in elem else ""

    providers[id] = {"id": id, "title": title, "ror": ror}

# load the datasets

for elem in corpus:
    # every element is registered through `parse_metadata`, whatever its kind
    id, kind, title = parse_metadata(elem)

    if kind != "Dataset":
        continue

    # `dct:publisher` / `@value` is stored as the dataset's provider reference
    datasets[id] = {
        "id": id,
        "title": title,
        "provider": elem["dct:publisher"]["@value"],
    }

# load the journals

for elem in corpus:
    # every element is registered through `parse_metadata`, whatever its kind
    id, kind, title = parse_metadata(elem)

    if kind != "Journal":
        continue

    # `dct:identifier` is optional; fall back to an empty string
    issn = elem["dct:identifier"]["@value"] if "dct:identifier" in elem else ""

    journals[id] = {"id": id, "title": title, "issn": issn}

# load the authors

for elem in corpus:
    # every element is registered through `parse_metadata`, whatever its kind
    id, kind, title = parse_metadata(elem)

    if kind != "Author":
        continue

    # `dct:identifier` is optional; fall back to an empty string
    orcid = elem["dct:identifier"]["@value"] if "dct:identifier" in elem else ""

    authors[id] = {"id": id, "title": title, "orcid": orcid}

# load the publications
#
# This pass must run after the providers, datasets, journals and authors
# have been loaded: it flags each of them as "used" when a publication
# refers to it.

for elem in corpus:
    id, kind, title = parse_metadata(elem)

    if kind == "ResearchPublication":
        # link the datasets
        data_list = []
        data_refs = elem["cito:citesAsDataSource"]

        # a single reference arrives as a bare dict rather than a list
        if isinstance(data_refs, dict):
            data_refs = [data_refs]

        for d in data_refs:
            data_id = d["@id"].split("#")[1]
            datasets[data_id]["used"] = True
            data_list.append(data_id)

            # a cited dataset also marks its provider as used
            prov_id = datasets[data_id]["provider"]
            providers[prov_id]["used"] = True

        # link the authors
        auth_list = []
        auth_refs = elem["dct:creator"] if "dct:creator" in elem else []

        # a single reference arrives as a bare dict rather than a list
        if isinstance(auth_refs, dict):
            auth_refs = [auth_refs]

        for a in auth_refs:
            auth_id = a["@id"].split("#")[1]
            authors[auth_id]["used"] = True
            auth_list.append(auth_id)

        # add DOI
        if "dct:identifier" in elem:
            doi = elem["dct:identifier"]["@value"]
        else:
            doi = ""

        # link the journal; reset `jour_id` explicitly when there is none,
        # otherwise the value left over from the previous publication would
        # be stored in this view
        if "dct:publisher" in elem:
            jour_id = elem["dct:publisher"]["@id"].split("#")[1]
            journals[jour_id]["used"] = True
        else:
            jour_id = None

        view = {
            "id": id,
            "title": title,
            "doi": doi,
            "journal": jour_id,
            "datasets": data_list,
            "authors": auth_list
        }

        publications[id] = view

# report summary stats: one line per element kind

for count, noun in [
    (len(publications), "publications"),
    (len(journals), "journals"),
    (len(providers), "providers"),
    (len(datasets), "datasets"),
    (len(authors), "authors"),
]:
    print(f"{count} {noun}")

1604 publications
629 journals
310 providers
208 datasets
3961 authors


Calculate graph analytics…

In [3]:
import networkx as nx

# Build an undirected graph whose nodes are numeric element ids.
# The order of the insertions below is kept deliberately: it fixes the
# node order of the graph, and with it the order of tied results later on.
nxg = nx.Graph()

# providers, restricted to those flagged as used
for prov in providers.values():
    if "used" not in prov:
        continue

    nxg.add_node(get_id(prov["id"]))

# used datasets, each connected to its provider
for dat in datasets.values():
    if "used" not in dat:
        continue

    dat_node = get_id(dat["id"])
    nxg.add_node(dat_node)
    nxg.add_edge(dat_node, get_id(dat["provider"]))

# used authors
for auth in authors.values():
    if "used" not in auth:
        continue

    nxg.add_node(get_id(auth["id"]))

# used journals
for jour in journals.values():
    if "used" not in jour:
        continue

    nxg.add_node(get_id(jour["id"]))

# every publication, connected to its journal, datasets and authors
for pub in publications.values():
    pub_node = get_id(pub["id"])
    nxg.add_node(pub_node)

    if pub["journal"]:
        nxg.add_edge(pub_node, get_id(pub["journal"]))

    for cited in pub["datasets"]:
        nxg.add_edge(pub_node, get_id(cited))

    for creator in pub["authors"]:
        nxg.add_edge(pub_node, get_id(creator))
    
#graph.add_edge(node0, node1, weight=self.edge_weight)
#graph.edge_betweenness_centrality

Run quantile analysis on the centrality results, to assess the relative impact of each element in the KG…

In [4]:
import numpy as np
import pandas as pd

def calc_quantiles (metrics, num_q):
    """ calculate `num_q` quantile boundaries for the given list

    metrics: sequence of numeric scores
    num_q: how many evenly spaced probabilities between 0 and 1
        (both included) to evaluate
    returns: list with one value per probability, in order of increasing
        probability; "nearest" interpolation means every value is one of
        the observed metrics
    """
    bins = np.linspace(0, 1, num=num_q, endpoint=True)
    q = pd.Series(metrics).quantile(bins, interpolation="nearest")

    # `Series.iteritems()` no longer exists in pandas 2.x; `items()` is
    # available on old and new releases alike
    return [q_hi for _, q_hi in q.items()]

In [5]:
from operator import itemgetter
from scipy.stats import percentileofscore

# centrality score per node id
# (alternative measure: nx.edge_betweenness_centrality(nxg))
result = nx.pagerank(nxg)

# bare scores, in the node order of `result`
ranks = [score for score in result.values()]

# quantile boundaries of the score distribution
quant = calc_quantiles(ranks, num_q=10)
num_quant = len(quant)

In [6]:
# ids of all nodes in the graph, as plain ints
nxg_set = {int(n) for n in nxg.nodes}

# sanity check: report any ranked id that is missing from the graph or
# that has no label (highest rank first)
for id, rank in sorted(result.items(), key=itemgetter(1), reverse=True):
    if not (id in nxg_set):
        print(f"{id} not in nxg_set")

    if not (id in LABELS):
        print(f"{IDS[id]} not in LABELS")

In [7]:
SCALE_FACTOR = 3

# node id -> [node size, impact as a fraction between 0 and 1]
IDS_SCALE = {}

# walk the nodes from highest to lowest rank
for id, rank in sorted(result.items(), key=lambda item: item[1], reverse=True):
    # percentile (0..100) of this rank among all ranks
    impact = percentileofscore(ranks, rank)
    fraction = impact / 100.0

    scale = SCALE_FACTOR * (5 + (impact / num_quant))
    IDS_SCALE[id] = [int(round(scale)), fraction]

    print("{:^5}\t{:.4f}\t{:.4f}\t{}".format(id, rank, fraction, LABELS[id]))

1506 	0.0129	1.0000	National Health and Nutrition Examination Survey
5833 	0.0099	0.9998	Supplemental Nutrition Assistance Program
3870 	0.0085	0.9997	Women, Infants, and Children
 351 	0.0051	0.9995	Dietary outcomes within the study of novel approaches to weight gain prevention (SNAP) randomized controlled trial
3017 	0.0046	0.9994	National Longitudinal Study of Adolescent to Adult Health
1716 	0.0046	0.9992	Food Security Survey Module
 272 	0.0033	0.9991	Towards a more reliable historical reanalysis: Improvements for version 3 of the Twentieth Century Reanalysis system
 878 	0.0025	0.9989	PloS ONE
3712 	0.0025	0.9988	SSRN Electronic Journal
1390 	0.0024	0.9986	Public Health Nutr
4719 	0.0024	0.9985	Potentially Modifiable Determinants of Malnutrition in Older Adults: a Systematic Review
3528 	0.0020	0.9983	Accuracy and Precision of Tidal Wetland Soil Carbon Mapping in the Conterminous United States
5168 	0.0019	0.9982	Current Population Survey Food Security Supplement
4100 	0.0018	0.9

1040 	0.0004	0.9315	Impacts of North Pacific subtropical and subarctic oceanic frontal zones on the wintertime atmospheric large-scale circulations
4120 	0.0004	0.9313	Scanner Data-Based Panel Price Indexes
6699 	0.0004	0.9312	Household food insecurity is a risk factor for iron-deficiency anaemia in a multi-ethnic, low-income sample of infants and toddlers
2270 	0.0004	0.9310	Influences of the neighbourhood food environment on adiposity of low-income preschool-aged children in Los Angeles County: a longitudinal study
5406 	0.0004	0.9309	Amount, Preparation and Type of Formula Consumed and Its Association with Weight Gain in Infants Participating in the WIC Program in Hawaii and Puerto Rico
2167 	0.0004	0.9307	Cooking Matters for Adults Improves Food Resource Management Skills and Self-confidence Among Low-Income Participants
2775 	0.0004	0.9306	Food Deserts and the Causes of Nutritional Inequality
6307 	0.0004	0.9304	An Evaluation of the Effects of a Breastfeeding Support Program on He

 625 	0.0003	0.8686	Unconventional Monetary Policy, Bank Lending, and Security Holdings: The Yield-Induced Portfolio Rebalancing Channel
3549 	0.0003	0.8685	The Relationship Between Obesity and Participation in the Supplemental Nutrition Assistance Program (SNAP): Is Mental Health a Mediator?
4787 	0.0003	0.8683	Berkowitz, Seth A.
4222 	0.0003	0.8682	How will Basel II affect bank lending to emerging markets? An analysis based on German bank level data
1421 	0.0003	0.8680	Spatiotemporal models for big multinomial data using the conditional multivariate logitbeta distribution
5494 	0.0003	0.8679	Escaping Low Earnings: The Role of Employer Characteristics and Changes
2111 	0.0003	0.8677	The Estimated Amount, Value, and Calories of Postharvest Food Losses at the Retail and Consumer Levels in the United States
5996 	0.0003	0.8676	Testing functional forms of market share models using the Box-Cox transformation and the Lagrange multiplier approach
1970 	0.0003	0.8674	Land subsidence in Housto

6581 	0.0002	0.8018	Health Justice
2796 	0.0002	0.8017	Efficient, profitable and safe banking: an oxymoron? Evidence from a panel VAR approach
2387 	0.0002	0.8015	Beatty, Timothy K. M.
3732 	0.0002	0.8014	Transient, but Not Persistent, Adult Food Insecurity Influences Toddler Development
1045 	0.0002	0.8012	Testing the Economic Independence Hypothesis: The Effect of an Exogenous Increase in Child Support on Subsequent Marriage and Cohabitation
 488 	0.0002	0.8010	At Your Service! The Role of Tax Havens in International Trade with Services
4148 	0.0002	0.8009	O'Leary, Christopher J.
 981 	0.0002	0.8007	Household Food Insecurity and Psychosocial Dysfunction in Ecuadorian Elementary Schoolchildren
6127 	0.0002	0.8006	Evaluation of Global Fire Weather Database re-analysis and short-term forecast products
4782 	0.0002	0.8004	Food Security and Teenage Labor Supply
2528 	0.0002	0.8003	Bull Am Meteorol Soc
5266 	0.0002	0.8001	Determinants of German FDI: New Evidence from Micro-Data
5703 	0.000

  9  	0.0002	0.7656	Human Services
3298 	0.0002	0.7654	US Department of Housing and Urban Development
5243 	0.0002	0.7652	Nevo, Aviv
2814 	0.0002	0.7651	Reg Sci Urban Econ
2026 	0.0002	0.7649	Climate Resilience Screening Index and Domain Scores
1615 	0.0002	0.7648	Soc Serv Rev
 642 	0.0002	0.7646	Assessor and Real Estate Database
4338 	0.0002	0.7645	Blanck, Heidi Michels
5015 	0.0002	0.7643	J Hum Lact
3108 	0.0002	0.7642	Yuce, Huseyin
5065 	0.0002	0.7640	Finance and growth in a bank-based economy: is it quantity or quality that matters?
 535 	0.0002	0.7639	Zhang, Qianggong
6172 	0.0002	0.7637	Filges, Trine
4297 	0.0002	0.7636	decision support system
4938 	0.0002	0.7634	Business cycles and FDI: evidence from German sectoral data
2891 	0.0002	0.7633	Household debt information
 942 	0.0002	0.7631	Who Goes East? The Impact of Enlargement on the Patterns of German FDI
1859 	0.0002	0.7629	Paternal Incarceration and Children's Food Insecurity: A Consideration of Variation and Mechanisms
5304 

4882 	0.0001	0.7060	AIDS Behav
 289 	0.0001	0.7058	Oregon Unemployment Insurance
6135 	0.0001	0.7056	Rapid Refresh
2063 	0.0001	0.7055	J Higher Educ
 374 	0.0001	0.7053	Sun, Shufang
4224 	0.0001	0.7051	Campos, Maribel
6179 	0.0001	0.7051	Palacios, Cristina
5484 	0.0001	0.7049	Lumeng, Jennifer C. F.
6542 	0.0001	0.7047	J Agric Food Ind Organ
1254 	0.0001	0.7045	Elections to the United States House of Representatives
3736 	0.0001	0.7045	The Anatomical Tracings of Lesions after Stroke
4842 	0.0001	0.7043	Water Use Data for the Nation
 644 	0.0001	0.7041	Ishdorj, Ariun
4727 	0.0001	0.7039	Purtell, Kelly M.
3584 	0.0001	0.7038	J Bus Econ Stat
 114 	0.0001	0.7033	Twin Res Hum Genet
1291 	0.0001	0.7033	J Abnorm Child Psychol
2881 	0.0001	0.7033	Eur J Hum Genet
3563 	0.0001	0.7033	J Eat Disord
3713 	0.0001	0.7033	J Sleep Res
4289 	0.0001	0.7033	Sex Transm Dis
 726 	0.0001	0.7026	Ohio Temporary Assistance for Needy Families
5883 	0.0001	0.7026	State of Ohio Unemployment Insurance
3626 	0.0001	0

2334 	0.0001	0.6489	Serum phosphorus predicts incident chronic kidney disease and end-stage renal disease
2475 	0.0001	0.6489	Sedentary time and cardio-metabolic biomarkers in US adults: NHANES 2003-06
2539 	0.0001	0.6489	Dietary phosphorus intake and mortality in moderate chronic kidney disease: NHANES III
4163 	0.0001	0.6489	Kidney function, albuminuria and age-related macular degeneration in NHANES III
4281 	0.0001	0.6489	Incidence of cytomegalovirus infection among the general population and pregnant women in the United States
4739 	0.0001	0.6489	The burden of hepatitis C in the United States
4991 	0.0001	0.6489	Prevalence and risk factors of work related asthma by industry among United States workers: Data from the Third National Health and Nutrition Examination Survey (1988-94)
5415 	0.0001	0.6489	Urinary mercury concentrations associated with dental restorations in adult women aged 16-49 years: United States, 1999-2000
5512 	0.0001	0.6489	Herpes simplex virus type 2 seropositivi

5105 	0.0001	0.5949	The Significance of Nonmarital Cohabitation: Marital Status and Mental Health Benefits among Middle-Aged and Older Adults
5163 	0.0001	0.5949	Trends in Scores on Tests of Cognitive Ability in the Elderly U.S. Population, 1993-2000
5884 	0.0001	0.5949	Baseline health, socioeconomic status, and 10-year mortality among older middle-aged Americans: Findings from the Health and Retirement Study, 1992-2002
6042 	0.0001	0.5949	Who Expects to Continue Working After Age 62? The Retirement Plans of Couples
6513 	0.0001	0.5949	Worker Adaptation and Employer Accommodation Following the Onset of a Health Impairment
6624 	0.0001	0.5949	Disparities Among Older Adults in Measures of Cognitive Function by Race or Ethnicity
1108 	0.0001	0.5938	Samaila, Dominic
2590 	0.0001	0.5938	Chukwuemeka, Emeka Joshua
 965 	0.0001	0.5938	Contemp Educ Technol
2126 	0.0001	0.5934	Bachmann, Ruediger
3170 	0.0001	0.5934	Bayer, Cynthia Taft
1257 	0.0001	0.5932	Buettner, Thiess
5517 	0.0001	0.5930	Vill

6677 	0.0001	0.5278	García, Ofelia
5894 	0.0001	0.5278	Applied Linguistics Review
 315 	0.0001	0.5263	Saykin, Andrew J.
 737 	0.0001	0.5263	Pesini, Pedro
1124 	0.0001	0.5263	Fandos, Noelia
1531 	0.0001	0.5263	Tenenbaum, Alexander
1777 	0.0001	0.5263	Berkovich, Anat
2088 	0.0001	0.5263	Sherriff, Ian
3164 	0.0001	0.5263	Risacher, Shannon L.
3571 	0.0001	0.5263	Shlomai, Gadi
3943 	0.0001	0.5263	Romero, Judith
4690 	0.0001	0.5263	Geva, Mika
5019 	0.0001	0.5263	Grossman, Ehud
5068 	0.0001	0.5263	Maor, Elad
5805 	0.0001	0.5263	Apostolova, Liana G.
6340 	0.0001	0.5263	Leibowitz, Avshalom
3754 	0.0001	0.5263	Alzheimers Dement (Amst)
4779 	0.0001	0.5263	Cardiovasc Diabetol
1218 	0.0001	0.5247	Gates, Gary J.
2283 	0.0001	0.5247	Cutler, Christopher
5719 	0.0001	0.5247	Luallen, Jeremy Clayton
6455 	0.0001	0.5247	Int Crim Justice Rev
 712 	0.0001	0.5234	Desmet, Peter
1042 	0.0001	0.5234	Verlinden, Liesbeth
1127 	0.0001	0.5234	Leijnse, Hidde
1132 	0.0001	0.5234	Veen, Lourens
1379 	0.0001	0.5234	van 

1173 	0.0001	0.4575	Smith, Lisa M.
2286 	0.0001	0.4575	McLaughlin, Michelle
2909 	0.0001	0.4575	Bousquin, Justin
3267 	0.0001	0.4575	Harvey, James E.
3296 	0.0001	0.4575	Harwell, Linda C.
4914 	0.0001	0.4575	Buck, Kyle D.
5342 	0.0001	0.4575	Summers, James Kevin
 264 	0.0001	0.4568	Warren, E. Johnson
4037 	0.0001	0.4568	Curtis, Marah A.
3169 	0.0001	0.4568	Hous Stud
1910 	0.0001	0.4561	Sim, Lester
3760 	0.0001	0.4561	Graham, Jim
4040 	0.0001	0.4561	Grubesic, Tony H.
6006 	0.0001	0.4561	Nelson, Jake R.
6458 	0.0001	0.4561	Rose, Kelly
 169 	0.0001	0.4554	Gapor, Salfarina Abdul
1178 	0.0001	0.4554	Abdul Gapor, Salfarina
1227 	0.0001	0.4554	Masron, Tarmiji
2459 	0.0001	0.4554	Ismail, Norhasimah
4180 	0.0001	0.4554	Academic Journal of Interdisciplinary Studies
 244 	0.0001	0.4542	Markham, Christine M
1141 	0.0001	0.4542	Oceguera, Amanda
2094 	0.0001	0.4542	Farhat, Alicia Elena
3219 	0.0001	0.4542	Sharma, Shreela V
3954 	0.0001	0.4542	Patlovich, Krista
4924 	0.0001	0.4542	Bounds, Gregory W
6

6329 	0.0001	0.3855	Witt, Emitt C.
5673 	0.0001	0.3855	AIMS Environ Sci
4465 	0.0001	0.3851	Toledano-Toledano, Filiberto
6008 	0.0001	0.3851	Domínguez-Guedea, Miriam Teresa
 990 	0.0001	0.3845	Nance, Nerissa
1119 	0.0001	0.3845	Sosa-Rubí, Sandra G.
1199 	0.0001	0.3845	Salas-Ortiz, Andrea
4681 	0.0001	0.3845	Bautista-Arredondo, Sergio
6157 	0.0001	0.3845	La Hera-Fuentes, Gina
 206 	0.0001	0.3825	Sleep
 995 	0.0001	0.3825	Health (Irvine Calif)
1154 	0.0001	0.3825	BMC Pregnancy Childbirth
1265 	0.0001	0.3825	J Youth Adolesc
1280 	0.0001	0.3825	Nurs Res Pract
1981 	0.0001	0.3825	Alcohol Clin Exp Res
2285 	0.0001	0.3825	BMC Genet
2330 	0.0001	0.3825	PLoS Genet
2351 	0.0001	0.3825	Addiction
2486 	0.0001	0.3825	PeerJ
2834 	0.0001	0.3825	Bull World Health Organ
3341 	0.0001	0.3825	Inj Epidemiol
3499 	0.0001	0.3825	Behav Genet
3950 	0.0001	0.3825	Am J Drug Alcohol Abuse
4277 	0.0001	0.3825	Journal of the Royal Statistical Society Series A
4407 	0.0001	0.3825	The Journal of Law, Economics, and O

2128 	0.0001	0.3078	Bonafede, Machaon M K
3577 	0.0001	0.3078	Gandra, Shravanthi R.
4836 	0.0001	0.3078	Johnson, Barbara H.
 158 	0.0001	0.3064	McNish, Francesca Trevor Julia Dana Alyssa Michael Thelma Gary Jenni
 304 	0.0001	0.3064	Moran, Francesca Trevor Julia Dana Alyssa Michael Thelma Gary Jenni
1212 	0.0001	0.3064	Winkel, Gary
1920 	0.0001	0.3064	Crist, Francesca Trevor Julia Dana Alyssa Michael Thelma Gary Jenni
2694 	0.0001	0.3064	Ramirez, Julia
3573 	0.0001	0.3064	Winkel, Francesca Trevor Julia Dana Alyssa Michael Thelma Gary Jenni
3856 	0.0001	0.3064	Crist, Michael
3981 	0.0001	0.3064	Gany, Francesca
4528 	0.0001	0.3064	Massie, Francesca Trevor Julia Dana Alyssa Michael Thelma Gary Jenni
5356 	0.0001	0.3064	Ramirez, Francesca
6357 	0.0001	0.3064	Lee, Francesca Trevor Julia Dana Alyssa Michael Thelma Gary Jenni
6384 	0.0001	0.3064	McNish, Thelma
6436 	0.0001	0.3064	Leng, Francesca Trevor Julia Dana Alyssa Michael Thelma Gary Jenni
6664 	0.0001	0.3064	Massie, Dana
2735 	0.0001	0

6204 	0.0001	0.2281	Davern, Michael J.
2699 	0.0001	0.2278	Kutzbach, Mark J.
3407 	0.0001	0.2278	Hellerstein, Judith K.
 795 	0.0001	0.2274	Abdullah, Qassim A.
1856 	0.0001	0.2274	Nayegandhi, Amar
5221 	0.0001	0.2274	Winehouse, Jayna
1214 	0.0001	0.2269	Kino, Shiho
5706 	0.0001	0.2269	Sato, Koryu
3045 	0.0001	0.2269	Int J Equity Health
2002 	0.0001	0.2266	Craigie, Terry‐Ann
 595 	0.0001	0.2263	Pejavara, Anu
6317 	0.0001	0.2263	Wright, Demia S.
 778 	0.0001	0.2263	J Public Health Manag Pract
3375 	0.0001	0.2259	Long, Peter V
5921 	0.0001	0.2259	Garber, Judith M.
 824 	0.0001	0.2256	Peterson, Everett B.
2360 	0.0001	0.2256	Int J Ind Organ
2499 	0.0001	0.2254	New York State Department of Labor
1272 	0.0001	0.2249	Chapman, Gwen E.
3843 	0.0001	0.2249	Veenstra, Gerry
4731 	0.0001	0.2249	Black, Jennifer L.
5313 	0.0001	0.2249	Velazquez, Cayley E.
5687 	0.0001	0.2249	Ahmadi, Naseam
4919 	0.0001	0.2244	Susann, Sieber
4290 	0.0001	0.2244	Journal of Statistics Sweden
 890 	0.0001	0.2241	Fricke, 

6480 	0.0001	0.1497	Parnell, Winsome R.
3618 	0.0001	0.1493	Baicker, Katherine
5277 	0.0001	0.1493	Finkelstein, Amy
5475 	0.0001	0.1493	Taubman, Sarah
1256 	0.0001	0.1487	Teran-Garcia, Margarita
1509 	0.0001	0.1487	Musaad, Salma M. A.
1760 	0.0001	0.1487	Fiese, Barbara H.
1948 	0.0001	0.1487	Donovan, Sharon M.
3404 	0.0001	0.1487	Bost, Kelly K.
1832 	0.0001	0.1482	Costanigro, Marco
6648 	0.0001	0.1482	Agriculture and Agricultural Science Procedia
4126 	0.0001	0.1477	Ren, Xuejuan
4601 	0.0001	0.1477	Yang, Xiu-Qun
5343 	0.0001	0.1477	Hu, Haibo
 740 	0.0001	0.1477	J Clim
3276 	0.0001	0.1473	Rigby, Elizabeth A.
3511 	0.0001	0.1473	Kimbro, Rachel Tolbert
1075 	0.0001	0.1468	Leibtag, Ephraim S.
2818 	0.0001	0.1468	Karns, Shawn A.
4140 	0.0001	0.1468	Zhen, Cheng
4341 	0.0001	0.1468	Finkelstein, Eric Andrew
2757 	0.0001	0.1463	Marx, Benjamin M.
4044 	0.0001	0.1463	Turner, Lesley J.
3036 	0.0001	0.1460	HACKETHAL, ANDREAS
4918 	0.0001	0.1460	KARABULUT, YIGITCAN
 416 	0.0001	0.1454	Kruse, Catheri

3469 	0.0001	0.0610	Langellier, Brent A.
4796 	0.0001	0.0610	Glik, Deborah C.
6386 	0.0001	0.0610	Brookmeyer, Ron
2423 	0.0001	0.0606	Moretti, E. Horstman
5815 	0.0001	0.0606	Lochner, Lance J.
 568 	0.0001	0.0601	Çayır, Ebru
1427 	0.0001	0.0601	Meade, Randa L.
4405 	0.0001	0.0601	Cay r, E.
6301 	0.0001	0.0601	Hartline-Grafton, Heather L.
 30  	0.0001	0.0598	Campbell Systematic Reviews
 57  	0.0001	0.0592	Pezzia, Carla
3557 	0.0001	0.0592	Cole, Joanne
4200 	0.0001	0.0592	Cuate, Erica L.
6413 	0.0001	0.0592	Quirk, Lisa
2267 	0.0001	0.0592	Clin Transl Sci
4211 	0.0001	0.0592	Ann Anthropol Pract
 956 	0.0001	0.0587	Maryland Department of Labor
 151 	0.0001	0.0580	Cotton, William H.
1349 	0.0001	0.0580	Granado-Villar, Deise C.
2135 	0.0001	0.0580	Chilton, Lance A.
2309 	0.0001	0.0580	Paz-Soldán, Gonzalo J.
2628 	0.0001	0.0580	Brown, Jeffrey M.
2837 	0.0001	0.0580	Gambon, Thresia B.
5283 	0.0001	0.0580	Gorski, Peter A.
6550 	0.0001	0.0580	Zind, Barbara
3737 	0.0001	0.0573	Holben, David H.
 8

Use the `constrain()` function to restrict the graph to the neighborhood of a specified node, selected by its label. The neighborhood is collected with a breadth-first search, and the `limit` parameter bounds how many hops away from that node the search may reach.

In [20]:
# by default the visualization covers every node of the graph
SUBGRAPH = nxg_set

def constrain (limit, search_term):
    """ restrict `SUBGRAPH` to the neighborhood of the node whose label
    equals `search_term`

    limit: maximum depth of the breadth-first search from that node
    search_term: label to look for (exact match against `LABELS`)

    `SUBGRAPH` is left untouched when no node of the graph carries the
    label. If several nodes do, the last match in `LABELS` order wins.
    """
    global SUBGRAPH

    for node_id, label in LABELS.items():
        if label != search_term:
            continue

        # `LABELS` holds every element of the corpus, while the graph only
        # holds the ones in use; searching from a node that is not in the
        # graph would raise, so skip such matches
        if node_id not in nxg:
            continue

        neighborhood = set([node_id])

        for _, neighbor in nx.bfs_edges(nxg, source=node_id, depth_limit=limit):
            neighborhood.add(neighbor)

        SUBGRAPH = neighborhood


#constrain(limit=4, search_term="NOAA")
print(len(SUBGRAPH))

6509


In [21]:
# `IPython.display` is the public home of these names; importing `display`
# from `IPython.core.display` is deprecated
from IPython.display import display, Markdown

# one legend line: a color swatch image, then "<count> <caption>"
markdown_frag = " - ![#{}](https://placehold.it/15/{}/000000?text=+) `{} {}`"


def count_in_subgraph (elem_ids):
    """ count how many of the given element ids have their node in `SUBGRAPH`
    """
    return len(SUBGRAPH.intersection(set([get_id(elem_id) for elem_id in elem_ids])))


num_prov = count_in_subgraph(providers.keys())
num_data = count_in_subgraph(datasets.keys())
num_auth = count_in_subgraph(authors.keys())
num_jour = count_in_subgraph(journals.keys())
num_pubs = count_in_subgraph(publications.keys())

# (hex color, count, caption) for each element kind, in display order
frags = [
    markdown_frag.format(hex_color, hex_color, count, caption)
    for hex_color, count, caption in [
        ("ffa500", num_prov, "providers (orange)"),
        ("ff0000", num_data, "datasets (red)"),
        ("ff00ff", num_auth, "authors (purple)"),
        ("008000", num_jour, "journals (green)"),
        ("0000ff", num_pubs, "publications (blue)"),
    ]
]

LEGEND_MARKDOWN = "\n".join(frags)
print(LEGEND_MARKDOWN)

 - ![#ffa500](https://placehold.it/15/ffa500/000000?text=+) `107 providers (orange)`
 - ![#ff0000](https://placehold.it/15/ff0000/000000?text=+) `208 datasets (red)`
 - ![#ff00ff](https://placehold.it/15/ff00ff/000000?text=+) `3961 authors (purple)`
 - ![#008000](https://placehold.it/15/008000/000000?text=+) `629 journals (green)`
 - ![#0000ff](https://placehold.it/15/0000ff/000000?text=+) `1604 publications (blue)`


Generate an interactive visualization…

In [22]:
from pyvis.network import Network

g = Network(notebook=True, height="1000px", width="100%")
g.force_atlas_2based()

# Nodes are added kind by kind, and only for elements that are both flagged
# as used and part of SUBGRAPH. Each node is sized by its entry in IDS_SCALE
# and carries an HTML tooltip.

# providers (orange)
for prov in providers.values():
    if "used" not in prov:
        continue

    prov_node = get_id(prov["id"])

    if prov_node not in SUBGRAPH:
        continue

    size, pct = IDS_SCALE[prov_node]
    hover = "{}<br/>rank: {:.4f}<br/>{}".format(prov["title"], pct, prov["ror"])
    g.add_node(prov_node, label=prov["title"], title=hover, color="orange", size=size)

# datasets (red), each linked to its provider
for dat in datasets.values():
    if "used" not in dat:
        continue

    dat_node = get_id(dat["id"])

    if dat_node not in SUBGRAPH:
        continue

    prov_node = get_id(dat["provider"])
    size, pct = IDS_SCALE[dat_node]
    hover = "{}<br/>rank: {:.4f}<br/>provider: {}".format(dat["title"], pct, LABELS[prov_node])
    g.add_node(dat_node, label=dat["title"], title=hover, color="red", size=size)

    if prov_node in SUBGRAPH:
        g.add_edge(dat_node, prov_node, color="gray")

# authors (purple)
for auth in authors.values():
    if "used" not in auth:
        continue

    auth_node = get_id(auth["id"])

    if auth_node not in SUBGRAPH:
        continue

    size, pct = IDS_SCALE[auth_node]
    hover = "{}<br/>rank: {:.4f}<br/>{}".format(auth["title"], pct, auth["orcid"])
    g.add_node(auth_node, label=auth["title"], title=hover, color="purple", size=size)

# journals (green)
for jour in journals.values():
    if "used" not in jour:
        continue

    jour_node = get_id(jour["id"])

    if jour_node not in SUBGRAPH:
        continue

    size, pct = IDS_SCALE[jour_node]
    hover = "{}<br/>rank: {:.4f}<br/>{}".format(jour["title"], pct, jour["issn"])
    g.add_node(jour_node, label=jour["title"], title=hover, color="green", size=size)

# publications (blue), linked to their journal, datasets and authors
for pub in publications.values():
    pub_node = get_id(pub["id"])

    if pub_node not in SUBGRAPH:
        continue

    size, pct = IDS_SCALE[pub_node]
    hover = "{}<br/>rank: {:.4f}<br/>{}".format(pub["title"], pct, pub["doi"])
    g.add_node(pub_node, label=pub["title"], title=hover, color="blue", size=size)

    if pub["journal"]:
        jour_node = get_id(pub["journal"])

        if jour_node in SUBGRAPH:
            g.add_edge(pub_node, jour_node, color="gray")

    for cited in pub["datasets"]:
        dat_node = get_id(cited)

        if dat_node in SUBGRAPH:
            g.add_edge(pub_node, dat_node, color="gray")

    for creator in pub["authors"]:
        auth_node = get_id(creator)

        if auth_node in SUBGRAPH:
            g.add_edge(pub_node, auth_node, color="gray")

g.show_buttons()
g.show("corpus.html")

In [23]:
# show the legend twice: first as raw markdown source, then rendered
print(LEGEND_MARKDOWN)
display(Markdown(LEGEND_MARKDOWN))

 - ![#ffa500](https://placehold.it/15/ffa500/000000?text=+) `107 providers (orange)`
 - ![#ff0000](https://placehold.it/15/ff0000/000000?text=+) `208 datasets (red)`
 - ![#ff00ff](https://placehold.it/15/ff00ff/000000?text=+) `3961 authors (purple)`
 - ![#008000](https://placehold.it/15/008000/000000?text=+) `629 journals (green)`
 - ![#0000ff](https://placehold.it/15/0000ff/000000?text=+) `1604 publications (blue)`


 - ![#ffa500](https://placehold.it/15/ffa500/000000?text=+) `107 providers (orange)`
 - ![#ff0000](https://placehold.it/15/ff0000/000000?text=+) `208 datasets (red)`
 - ![#ff00ff](https://placehold.it/15/ff00ff/000000?text=+) `3961 authors (purple)`
 - ![#008000](https://placehold.it/15/008000/000000?text=+) `629 journals (green)`
 - ![#0000ff](https://placehold.it/15/0000ff/000000?text=+) `1604 publications (blue)`