# Rich Context: Knowledge Graph Visualization

This notebook loads the Rich Context knowledge graph from the `tmp.jsonld` JSON-LD file prepared by the `RCGraph` workflow.
It runs graph analytics on the KG using the `NetworkX` library, then creates an interactive visualization using the `PyVis` library.

The following installations are needed, if these libraries haven't already been installed:

In [29]:
!pip install pyvis
!pip install networkx
!pip install pandas
!pip install numpy



Load the KG from the `tmp.jsonld` file…

In [1]:
import json
import sys

# element IDs in the order they were parsed; `get_id()` maps an ID to its index in this list
IDS = []
# numeric ID (index into `IDS`) -> title of that element
LABELS = {}

# one lookup table per kind of element, keyed by element ID
publications = {}
providers = {}
datasets = {}
journals = {}
authors = {}


def get_id (id):
    """ translate an element ID into its numeric index within `IDS`

    Raises `ValueError` when the ID has not been registered.
    """
    position = IDS.index(id)
    return int(position)


def parse_metadata (elem):
    """ parse the required metadata items from one element in the graph

    The element's ID is the fragment after "#" in its `@id`. The first time
    an ID is seen it is appended to `IDS`; its title is then recorded in
    `LABELS` under the numeric index given by `get_id()`. Calling this again
    for the same element leaves `IDS` unchanged.

    elem: dict for one element; it must carry `@type`, `@id` and
        `dct:title` entries
    returns: the tuple `(id, kind, title)`
    """
    kind = elem["@type"]
    title = elem["dct:title"]["@value"]
    id = elem["@id"].split("#")[1]

    # register each ID only once: elements may be parsed repeatedly, and
    # duplicate entries would bloat `IDS` and slow down every lookup
    if id not in IDS:
        IDS.append(id)

    LABELS[get_id(id)] = title

    return id, kind, title


# input the corpus from the JSON-LD file

filename = "tmp.jsonld"

# the corpus holds non-ASCII titles, so name the encoding explicitly
# instead of relying on the platform default
with open(filename, "r", encoding="utf-8") as f:
    jld_corpus = json.load(f)

# the elements of the knowledge graph sit under the top-level "@graph" key
corpus = jld_corpus["@graph"]

# report summary stats

print(f"{len(corpus)} corpus elements")

5905 corpus elements


In [2]:
# load the providers, datasets, journals, and authors
#
# Each element is parsed once and dispatched on its kind. Publications are
# set aside and linked in a second pass, because they refer to the datasets,
# providers, authors, and journals, which must all be loaded first.

def _identifier (elem):
    """ return the `dct:identifier` value of an element, or "" when it has none
    """
    if "dct:identifier" in elem:
        return elem["dct:identifier"]["@value"]

    return ""


def _as_list (refs):
    """ wrap a lone reference (a bare dict) in a list, so that callers can always iterate
    """
    if isinstance(refs, dict):
        return [refs]

    return refs


pub_elems = []

for elem in corpus:
    elem_id, kind, title = parse_metadata(elem)

    if kind == "Provider":
        providers[elem_id] = {
            "id": elem_id,
            "title": title,
            "ror": _identifier(elem)
        }

    elif kind == "Dataset":
        datasets[elem_id] = {
            "id": elem_id,
            "title": title,
            "provider": elem["dct:publisher"]["@value"]
        }

    elif kind == "Journal":
        journals[elem_id] = {
            "id": elem_id,
            "title": title,
            "issn": _identifier(elem)
        }

    elif kind == "Author":
        authors[elem_id] = {
            "id": elem_id,
            "title": title,
            "orcid": _identifier(elem)
        }

    elif kind == "ResearchPublication":
        pub_elems.append((elem_id, title, elem))

# load the publications

for elem_id, title, elem in pub_elems:
    # link the datasets, flagging each one and its provider as "used"
    data_list = []

    for d in _as_list(elem["cito:citesAsDataSource"]):
        data_id = d["@id"].split("#")[1]
        datasets[data_id]["used"] = True
        data_list.append(data_id)

        prov_id = datasets[data_id]["provider"]
        providers[prov_id]["used"] = True

    # link the authors, if any are listed
    auth_list = []

    if "dct:creator" in elem:
        creators = _as_list(elem["dct:creator"])
    else:
        creators = []

    for a in creators:
        auth_id = a["@id"].split("#")[1]
        authors[auth_id]["used"] = True
        auth_list.append(auth_id)

    # add DOI
    doi = _identifier(elem)

    # link the journal
    if "dct:publisher" in elem:
        jour_id = elem["dct:publisher"]["@id"].split("#")[1]
        journals[jour_id]["used"] = True
    else:
        # this publication has no journal; reset the variable so that the
        # record does not pick up the journal of the previous publication
        jour_id = None

    publications[elem_id] = {
        "id": elem_id,
        "title": title,
        "doi": doi,
        "journal": jour_id,
        "datasets": data_list,
        "authors": auth_list
    }

# report summary stats

print(f"{len(publications)} publications")
print(f"{len(journals)} journals")
print(f"{len(providers)} providers")
print(f"{len(datasets)} datasets")
print(f"{len(authors)} authors")

1491 publications
568 journals
308 providers
175 datasets
3363 authors


Calculate graph analytics…

In [3]:
import networkx as nx

# build the graph; each node is the numeric ID of one element
nxg = nx.Graph()

def _used (entities):
    """ iterate over the entities that carry a "used" flag
    """
    return (entity for entity in entities.values() if "used" in entity)


# providers: nodes only
for provider in _used(providers):
    nxg.add_node(get_id(provider["id"]))

# datasets: a node, plus an edge to the dataset's provider
for dataset in _used(datasets):
    dataset_node = get_id(dataset["id"])
    nxg.add_node(dataset_node)
    nxg.add_edge(dataset_node, get_id(dataset["provider"]))

# authors: nodes only
for author in _used(authors):
    nxg.add_node(get_id(author["id"]))

# journals: nodes only
for journal in _used(journals):
    nxg.add_node(get_id(journal["id"]))

# publications: every one gets a node, with edges to its journal (if any),
# its datasets, and its authors
for pub in publications.values():
    pub_node = get_id(pub["id"])
    nxg.add_node(pub_node)

    if pub["journal"]:
        nxg.add_edge(pub_node, get_id(pub["journal"]))

    for data_id in pub["datasets"]:
        nxg.add_edge(pub_node, get_id(data_id))

    for auth_id in pub["authors"]:
        nxg.add_edge(pub_node, get_id(auth_id))

Run quantile analysis on the centrality results, to assess the relative impact of each element in the KG…

In [4]:
import numpy as np
import pandas as pd

def calc_quantiles (metrics, num_q):
    """ calculate `num_q` quantile boundaries for the given list

    metrics: sequence of numeric scores
    num_q: number of evenly spaced quantile levels between 0 and 1,
        both endpoints included
    returns: list of `num_q` boundary values in ascending order, each one
        an actual member of `metrics`; an empty list when `metrics` is empty
    """
    if len(metrics) == 0:
        return []

    bins = np.linspace(0, 1, num=num_q, endpoint=True)

    # "nearest" selects an observed value instead of interpolating between two
    q = pd.Series(metrics).quantile(bins, interpolation="nearest")

    # `Series.iteritems()` was removed in pandas 2.0; only the values are needed
    return q.tolist()

In [5]:
from operator import itemgetter
from scipy.stats import percentileofscore

# score every node of the graph with PageRank
result = nx.pagerank(nxg)
#result = nx.edge_betweenness_centrality(nxg)
ranks = [score for score in result.values()]

# quantile boundaries of the scores, and how many of them there are
quant = calc_quantiles(ranks, num_q=10)
num_quant = len(quant)

In [6]:
# sanity check: every ranked node should be a graph node and should have a label
nxg_set = {int(node) for node in nxg.nodes}

by_rank = sorted(result.items(), key=itemgetter(1), reverse=True)

for node_id, score in by_rank:
    if node_id not in nxg_set:
        print(f"{node_id} not in nxg_set")
    if node_id not in LABELS:
        print(f"{IDS[node_id]} not in LABELS")

In [12]:
IDS_SCALE = {}
SCALE_FACTOR = 2.5
row_format = "{:^5}\t{:.4f}\t{:.4f}\t{}"

# walk the nodes from the highest score to the lowest
for node_id, score in sorted(result.items(), key=itemgetter(1), reverse=True):
    # percentile (0-100) of this score among all of the scores
    impact = percentileofscore(ranks, score)
    fraction = impact / 100.0

    # node size: the percentile divided by the quantile count, plus a
    # constant offset of 5, times the scale factor
    scale = ((impact / num_quant) + 5) * SCALE_FACTOR
    IDS_SCALE[node_id] = [int(round(scale)), fraction]

    print(row_format.format(node_id, score, fraction, LABELS[node_id]))

1319 	0.0148	1.0000	National Health and Nutrition Examination Survey
1047 	0.0114	0.9998	Supplemental Nutrition Assistance Program
2855 	0.0097	0.9996	Women, Infants, and Children
4883 	0.0059	0.9995	Dietary outcomes within the study of novel approaches to weight gain prevention (SNAP) randomized controlled trial
2598 	0.0053	0.9993	Food Security Survey Module
1643 	0.0053	0.9991	National Longitudinal Study of Adolescent to Adult Health
 719 	0.0029	0.9989	SSRN Electronic Journal
 839 	0.0027	0.9988	Public Health Nutr
4872 	0.0027	0.9986	PloS ONE
4009 	0.0027	0.9984	Potentially Modifiable Determinants of Malnutrition in Older Adults: a Systematic Review
2960 	0.0022	0.9982	Current Population Survey Food Security Supplement
2865 	0.0020	0.9981	Microdatabase Direct investment
4813 	0.0018	0.9979	A comparative analysis of the Early Childhood Environment Rating Scalea'Revised and Early Childhood Environment Rating Scale, Third Edition
2747 	0.0017	0.9977	Monthly balance sheet statistics
15

 35  	0.0004	0.9164	Re-entry experiences of Black men living with HIV/AIDS after release from prison: Intersectionality and implications for care
3441 	0.0004	0.9162	Gender gaps in international research collaboration: a bibliometric approach
 315 	0.0004	0.9160	Employer reasons for failing to report eligible workers' compensation claims in the BLS survey of occupational injuries and illnesses
5542 	0.0004	0.9159	Identifying Barriers and Supports to Breastfeeding in the Workplace Experienced by Mothers in the New Hampshire Special Supplemental Nutrition Program for Women, Infants, and Children Utilizing the Total Worker Health Framework
4215 	0.0004	0.9157	Women in Academic Science: A Changing Landscape
 163 	0.0004	0.9155	Social determinants and lifestyle risk factors only partially explain the higher prevalence of food insecurity among Aboriginal and Torres Strait Islanders in the Australian state of Victoria: a cross-sectional study
3947 	0.0004	0.9153	Addressing the wicked problem 

5476 	0.0002	0.8187	Shanks, Carmen Byker
5348 	0.0002	0.8185	Are School Districts Allocating Resources Equitably? The Every Student Succeeds Act, Teacher Experience Gaps, and Equitable Resource Allocation
5796 	0.0002	0.8184	Frank, Deborah A.
2500 	0.0002	0.8182	Econ Hum Biol
1066 	0.0002	0.8180	Figure a Way: Teenage Mothers' Experiences in Shifting Social and Economic Contexts
4864 	0.0002	0.8178	Business Employment Dynamics Data
5552 	0.0002	0.8177	Do production subsidies have a wage incidence in wind power?
3408 	0.0002	0.8175	Moffitt, Robert A.
4899 	0.0002	0.8173	Strategic pricing in a differentiated product oligopoly model: fluid milk in Boston
 248 	0.0002	0.8171	Measuring effects of SNAP on obesity at the intensive margin
2915 	0.0002	0.8170	Good Jobs and Recidivism
 230 	0.0002	0.8168	Estimating local daytime population density from census and payroll data
3097 	0.0002	0.8166	Market timing, maturity mismatch, and risk management: Evidence from the banking industry
2937 	0.0002

4046 	0.0002	0.7472	Kansas Unemployment Insurance
4463 	0.0002	0.7472	Missouri Unemployment Insurance
 617 	0.0002	0.7468	Ettinger de Cuba, Stephanie
2543 	0.0002	0.7468	de Cuba, Stephanie A Ettinger
3044 	0.0002	0.7465	Wolfenden, Luke
 596 	0.0002	0.7464	Policy Interventions to Address Child Health Disparities: Moving Beyond Health Insurance
5886 	0.0002	0.7462	Kim, Youn Kyoung
2090 	0.0002	0.7460	Tax Influence on Financial Structures of M&As
 197 	0.0002	0.7458	Are Parental Welfare Work Requirements Good for Disadvantaged Children? Evidence From Age of Youngest Child Exemptions
 90  	0.0002	0.7457	Pfingsten, Andreas
4853 	0.0002	0.7455	Wolfe, Barbara L.
4300 	0.0002	0.7453	Building Capacity in Institutional Research and Decision Support in Higher Education
 709 	0.0002	0.7451	Ahmed, Saifuddin
1085 	0.0002	0.7449	Kuo, Alice A.
5772 	0.0002	0.7448	Schnitzer, Michele B.
3278 	0.0002	0.7445	Morenoff, Jeffrey D.
4248 	0.0002	0.7445	Harding, David John
5379 	0.0002	0.7442	Internalizing Glo

2230 	0.0001	0.6454	Andrews, Margaret
 357 	0.0001	0.6451	Steeves, Elizabeth T. Anderson
 668 	0.0001	0.6451	Trude, Angela Cristina Bizzotto
4834 	0.0001	0.6448	Yen, Steven T.
 861 	0.0001	0.6444	Dewailly, Éric
4292 	0.0001	0.6444	Ayotte, Pierre
4373 	0.0001	0.6444	Muckle, Gina
5686 	0.0001	0.6444	Jacobson, Sandra W.
3873 	0.0001	0.6439	Fitzgibbon, Marian L.
3182 	0.0001	0.6438	Coldwell, Susan E.
3754 	0.0001	0.6436	Jones-Smith, Jessica C
2604 	0.0001	0.6434	Low serum selenium is associated with anemia among older adults in the United States
 664 	0.0001	0.6432	Are neighbourhood food resources distributed inequitably by income and race in the USA? Epidemiological findings across the urban spectrum
5119 	0.0001	0.6431	Blood pressure over height ratios: Simple and accurate method of detecting elevated blood pressure in children
3631 	0.0001	0.6429	Jaenicke, Edward C.
 524 	0.0001	0.6427	Ahluwalia, Namanjeet
4266 	0.0001	0.6425	Allen, Barbara McFadden
1465 	0.0001	0.6424	Deb, Partha
3801 

4286 	0.0001	0.5553	Ogata, Beth N.
4768 	0.0001	0.5553	Hamilton, Kathryn K.
4789 	0.0001	0.5553	Robinson, Gretchen Y.
4976 	0.0001	0.5553	Chapel, Denise L.
5666 	0.0001	0.5553	Schmidt, Darrin W.
5889 	0.0001	0.5553	Walters, Nancy G.
1407 	0.0001	0.5528	Annual Distress Database of the Deutsche Bundesbank
 322 	0.0001	0.5516	Rahim, Robbi
1363 	0.0001	0.5516	Abdullah, Dahlan
1424 	0.0001	0.5516	Hidayat, Rahmat
1802 	0.0001	0.5516	Simarmata, Janner
1855 	0.0001	0.5516	Ikhsan Setiawan, Muhammad
1877 	0.0001	0.5516	Napitupulu, Darmawan
2634 	0.0001	0.5516	Rahman, Abdul
3399 	0.0001	0.5516	Nurdiyanto, Heri
3433 	0.0001	0.5516	Ahmar, Ansari Saleh
3795 	0.0001	0.5516	Setiawan, Muhammad Ikhsan
3853 	0.0001	0.5516	Saleh Ahmar, Ansari
4362 	0.0001	0.5516	Albra, Wahyudin
4080 	0.0001	0.5516	Proceedings of the Joint Workshop KO2PI and The 1st International Conference on Advance & Scientific Innovation
5325 	0.0001	0.5503	Ohls, Jana
5263 	0.0001	0.5501	Review of Finance
2183 	0.0001	0.5500	Foos, Dani

3145 	0.0001	0.4548	Quick, Keneisha
3177 	0.0001	0.4548	Kulik, Noel L
3194 	0.0001	0.4548	Frank, Jennifer
3197 	0.0001	0.4548	Lewis, Cora E.
3202 	0.0001	0.4548	Unick, Jessica L
3292 	0.0001	0.4548	Espeland, Mark A.
3536 	0.0001	0.4548	Trautvetter, Jennifer
3572 	0.0001	0.4548	Story, Kathryn
3683 	0.0001	0.4548	Fisher, Michelle
3691 	0.0001	0.4548	Polzien, Kristen Marie
3881 	0.0001	0.4548	Hatley, Karen E.
3919 	0.0001	0.4548	James, Brittany L.
4029 	0.0001	0.4548	Annis, Kristen
4036 	0.0001	0.4548	Gaur, Vinod Kumar
4089 	0.0001	0.4548	Zeigler, Erin
4120 	0.0001	0.4548	Ranslow-Robles, Deborah
4229 	0.0001	0.4548	Strohacker, Kelly
4293 	0.0001	0.4548	Marcovina, Santica M.
4481 	0.0001	0.4548	Zablonski, Stephen T.
4508 	0.0001	0.4548	King, Mark J
4561 	0.0001	0.4548	Neiberg, Rebecca H.
4562 	0.0001	0.4548	Whitehead, Kristen
4593 	0.0001	0.4548	Williams, Sarah Catherine
4596 	0.0001	0.4548	Hontz, Mary A.
4668 	0.0001	0.4548	Group, the Study of Novel Approaches to Weight Gain Prevention (S

3512 	0.0001	0.3463	Caillavet, France
1784 	0.0001	0.3463	Obes Facts
 537 	0.0001	0.3458	Pepin, Dawn
2491 	0.0001	0.3458	J Law Med Ethics
 14  	0.0001	0.3451	Noble, Kimberly G.
 153 	0.0001	0.3451	Marti, Maria Jose
1091 	0.0001	0.3451	Duch, Helena
1143 	0.0001	0.3451	Merz, Emily C.
3310 	0.0001	0.3451	Repka, Kelsey R.
5640 	0.0001	0.3451	Landers, Carine M.
 483 	0.0001	0.3444	Kelly, Diane
 890 	0.0001	0.3444	Ammons, S. W.
2450 	0.0001	0.3439	Hesselius, Patrik
3262 	0.0001	0.3439	Holmlund, Bertil
5718 	0.0001	0.3439	Engstroem, Per
 227 	0.0001	0.3432	Brown-Podgorski, Brittany L.
1237 	0.0001	0.3432	Golembiewski, Elizabeth H.
3628 	0.0001	0.3432	Holmes, Ann M.
3943 	0.0001	0.3432	Jackson, Joanna R.
4043 	0.0001	0.3432	Menachemi, Nir
5494 	0.0001	0.3427	Fairlie, Robert W.
2780 	0.0001	0.3423	Bryson, William C
4125 	0.0001	0.3423	Ronstant, Ola
5572 	0.0001	0.3423	Marshall, Gillian L.
5841 	0.0001	0.3423	Canham, Sarah L.
2574 	0.0001	0.3415	Friis, Karina
3163 	0.0001	0.3415	Larsen, Finn Bre

3370 	0.0001	0.2539	Struempler, Barb
3844 	0.0001	0.2539	Kellegrew, Krysta P.
1208 	0.0001	0.2531	Sosa, Julie Ann
3783 	0.0001	0.2531	Bowen, William G.
4511 	0.0001	0.2531	Lord, Graham M
 58  	0.0001	0.2525	Irvine, Mary K.
 96  	0.0001	0.2525	Thomas, Jacinthe A.
1650 	0.0001	0.2525	Gambone, Gina F.
4137 	0.0001	0.2525	Alexy, Emily R.
 777 	0.0001	0.2517	Cook, Michael Lee
1089 	0.0001	0.2517	Pun, Wik Hung
2790 	0.0001	0.2517	Farkas, George
5297 	0.0001	0.2517	Morgan, Paul L.
 560 	0.0001	0.2517	Contemp Educ Psychol
 564 	0.0001	0.2511	Bruns, Angela
1817 	0.0001	0.2511	Pilkauskas, Natasha V.
1528 	0.0001	0.2506	Meni, David
3522 	0.0001	0.2506	Bayaz Ozturk, Gulgun
3654 	0.0001	0.2506	Wiseman, Michael C
5003 	0.0001	0.2506	Ozturk, Gulgun Bayaz
1323 	0.0001	0.2495	Gibson, Marcia
1345 	0.0001	0.2495	Bambra, Clare
1869 	0.0001	0.2495	Thomson, Hilary
2200 	0.0001	0.2495	Martin, Susan P.
2730 	0.0001	0.2495	Fenton, Candida
3471 	0.0001	0.2495	Lutje, Vittoria
4141 	0.0001	0.2495	Banas, Kasia
578

3039 	0.0001	0.1384	Becot, Florence
4366 	0.0001	0.1384	Smith, Diane Dani
4692 	0.0001	0.1384	Chase, Lisa
5011 	0.0001	0.1384	Greco, Lauren C
5820 	0.0001	0.1384	Estrin, Hans
 572 	0.0001	0.1376	Kraak, Vivica I
3964 	0.0001	0.1376	Obes Rev
 93  	0.0001	0.1374	Brito, Tonya L.
 682 	0.0001	0.1367	Edwards Hall, Leigh Ann
1697 	0.0001	0.1367	Stell Crowley, Phyllis
2070 	0.0001	0.1367	Yakes Jimenez, Elizabeth
3293 	0.0001	0.1367	Fredericks, Doris C.
4395 	0.0001	0.1367	Udarbe, Adrienne Z.
4726 	0.0001	0.1367	Hall, Leigh Ann Edwards
5254 	0.0001	0.1367	Crowley, Phyllis Stell
 316 	0.0001	0.1360	State of California Employment Development Department
3147 	0.0001	0.1358	European Central Bank
3935 	0.0001	0.1356	J Eur Econ Assoc
 308 	0.0001	0.1353	Dharod, Jigna M.
1244 	0.0001	0.1353	Andrews, Jesse
3414 	0.0001	0.1353	Ball, Lanae
1777 	0.0001	0.1347	Talbert, Hope
1993 	0.0001	0.1347	Edin, H.
5045 	0.0001	0.1347	Talbert, Elizabeth
2412 	0.0001	0.1343	Oh, Kyungwon
3618 	0.0001	0.1343	Kim, Hyun Ja

 53  	0.0001	0.0382	Schwarz, Lou
1508 	0.0001	0.0382	Weiss, Roy E.
2262 	0.0001	0.0379	Williams, Mary Beth.
2057 	0.0001	0.0378	Denteh, Augustine
 88  	0.0001	0.0373	Park, Kyong
2954 	0.0001	0.0373	Himes, John H.
4476 	0.0001	0.0373	Geppert, Joni
5674 	0.0001	0.0373	Kersey, Margaret
 226 	0.0001	0.0368	Dobson, Cheyney C.
 377 	0.0001	0.0368	Wyse, Jessica J. B.
4715 	0.0001	0.0365	Worthington, Julie
1693 	0.0001	0.0363	Herrick, Kirsten
3141 	0.0001	0.0363	Paulose-Ram, Ryne
2381 	0.0001	0.0360	Can J Econ
 283 	0.0001	0.0358	Steiner, Abigail S.
5454 	0.0001	0.0357	Thévenot, Céline
3837 	0.0001	0.0354	Saldana, Santiago J
5390 	0.0001	0.0354	Grzywacz, Joseph G.
 826 	0.0001	0.0350	Mager, Ferdinand
2964 	0.0001	0.0350	Schmieder, Christian
1099 	0.0001	0.0348	Hromi-Fiedler, Amber J
5851 	0.0001	0.0346	Bachhuber, Marcus A.
2060 	0.0001	0.0343	Townsend, Marilyn S.
2907 	0.0001	0.0343	Keim, Nancy L.
5226 	0.0001	0.0343	Aaron, Grant J.
 441 	0.0001	0.0338	Li, Ruowei
2631 	0.0001	0.0338	Fein, Sara

Use the `constrain()` function to constrain the graph to the neighborhood of a specified node. This is based on a breadth-first search, with a `limit` parameter to constrain the diameter of the neighborhood in the graph.

In [23]:
# start with every node of the graph selected; `constrain()` rebinds this to a smaller set
SUBGRAPH = nxg_set

def constrain (limit, search_term):
    """ restrict `SUBGRAPH` to the neighborhood of the node labeled `search_term`

    limit: maximum depth of the breadth-first search from the matching node
    search_term: label to look for; it must equal the node's label exactly

    `SUBGRAPH` is rebound to the matching node plus every node reached by
    the search. When several graph nodes carry the label, the last one found
    wins. When no graph node carries it, `SUBGRAPH` is left as it was.
    """
    global SUBGRAPH

    for node_id, label in LABELS.items():
        # `LABELS` also covers elements that never became graph nodes; a
        # breadth-first search starting from one of those would raise an error
        if label != search_term or node_id not in nxg:
            continue

        neighborhood = {node_id}

        for _, neighbor in nx.bfs_edges(nxg, source=node_id, depth_limit=limit):
            neighborhood.add(neighbor)

        # rebind only once the search has completed
        SUBGRAPH = neighborhood


# uncomment to narrow the selection to the neighborhood of one element, for example:
#constrain(limit=5, search_term="IRI Infoscan")
# number of nodes currently selected
print(len(SUBGRAPH))

5693


In [24]:
from IPython.core.display import display, Markdown

markdown_frag = " - ![#{}](https://placehold.it/15/{}/000000?text=+) `{} {}`"

def _count_selected (entities):
    """ count the entities of one kind whose node belongs to `SUBGRAPH`
    """
    node_ids = {get_id(key) for key in entities.keys()}
    return len(SUBGRAPH & node_ids)


num_prov = _count_selected(providers)
num_data = _count_selected(datasets)
num_auth = _count_selected(authors)
num_jour = _count_selected(journals)
num_pubs = _count_selected(publications)

# one legend row per kind of element: (hex color, count, caption)
legend_rows = [
    ("ffa500", num_prov, "providers (orange)"),
    ("ff0000", num_data, "datasets (red)"),
    ("ff00ff", num_auth, "authors (purple)"),
    ("008000", num_jour, "journals (green)"),
    ("0000ff", num_pubs, "publications (blue)"),
]

frags = [
    markdown_frag.format(color, color, count, caption)
    for color, count, caption in legend_rows
]

LEGEND_MARKDOWN = "\n".join(frags)
print(LEGEND_MARKDOWN)

 - ![#ffa500](https://placehold.it/15/ffa500/000000?text=+) `96 providers (orange)`
 - ![#ff0000](https://placehold.it/15/ff0000/000000?text=+) `175 datasets (red)`
 - ![#ff00ff](https://placehold.it/15/ff00ff/000000?text=+) `3363 authors (purple)`
 - ![#008000](https://placehold.it/15/008000/000000?text=+) `568 journals (green)`
 - ![#0000ff](https://placehold.it/15/0000ff/000000?text=+) `1491 publications (blue)`


Generate an interactive visualization…

In [25]:
from pyvis.network import Network

g = Network(notebook=True, height="1000px", width="100%")
g.force_atlas_2based()

def _draw_node (node_id, entity, color, detail, template="{}<br/>rank: {:.4f}<br/>{}"):
    """ add one node to the visualization, sized by its scale and with a hover title
    """
    scale, impact = IDS_SCALE[node_id]
    hover = template.format(entity["title"], impact, detail)
    g.add_node(node_id, label=entity["title"], title=hover, color=color, size=scale)


def _draw_edge (source, target):
    """ add a gray edge from `source` to `target`, provided `target` is in `SUBGRAPH`
    """
    if target in SUBGRAPH:
        g.add_edge(source, target, color="gray")


def _selected (entities):
    """ yield `(node_id, entity)` for the used entities that fall within `SUBGRAPH`
    """
    for entity in entities.values():
        if "used" in entity:
            node_id = get_id(entity["id"])

            if node_id in SUBGRAPH:
                yield node_id, entity


# nodes are added before any edge that refers to them: providers first,
# then datasets, authors, and journals, and the publications last

for node_id, provider in _selected(providers):
    _draw_node(node_id, provider, "orange", provider["ror"])

for node_id, dataset in _selected(datasets):
    prov_node = get_id(dataset["provider"])
    _draw_node(
        node_id, dataset, "red", LABELS[prov_node],
        template="{}<br/>rank: {:.4f}<br/>provider: {}"
    )
    _draw_edge(node_id, prov_node)

for node_id, author in _selected(authors):
    _draw_node(node_id, author, "purple", author["orcid"])

for node_id, journal in _selected(journals):
    _draw_node(node_id, journal, "green", journal["issn"])

# publications are drawn whether or not they carry a "used" flag
for pub in publications.values():
    pub_node = get_id(pub["id"])

    if pub_node not in SUBGRAPH:
        continue

    _draw_node(pub_node, pub, "blue", pub["doi"])

    if pub["journal"]:
        _draw_edge(pub_node, get_id(pub["journal"]))

    for data_id in pub["datasets"]:
        _draw_edge(pub_node, get_id(data_id))

    for auth_id in pub["authors"]:
        _draw_edge(pub_node, get_id(auth_id))

g.show_buttons()
g.show("corpus.html")

In [26]:
# show the legend twice: first as raw Markdown source, then rendered
print(LEGEND_MARKDOWN)
display(Markdown(LEGEND_MARKDOWN))

 - ![#ffa500](https://placehold.it/15/ffa500/000000?text=+) `96 providers (orange)`
 - ![#ff0000](https://placehold.it/15/ff0000/000000?text=+) `175 datasets (red)`
 - ![#ff00ff](https://placehold.it/15/ff00ff/000000?text=+) `3363 authors (purple)`
 - ![#008000](https://placehold.it/15/008000/000000?text=+) `568 journals (green)`
 - ![#0000ff](https://placehold.it/15/0000ff/000000?text=+) `1491 publications (blue)`


 - ![#ffa500](https://placehold.it/15/ffa500/000000?text=+) `96 providers (orange)`
 - ![#ff0000](https://placehold.it/15/ff0000/000000?text=+) `175 datasets (red)`
 - ![#ff00ff](https://placehold.it/15/ff00ff/000000?text=+) `3363 authors (purple)`
 - ![#008000](https://placehold.it/15/008000/000000?text=+) `568 journals (green)`
 - ![#0000ff](https://placehold.it/15/0000ff/000000?text=+) `1491 publications (blue)`