# Goal of the notebook
End-to-end pipeline for searching articles of interest, extracting entities of interest, and building, accessing and deploying a knowledge graph and a co-mention graph.

In [12]:
import getpass
import os
import json
import pathlib
import time

import pandas as pd
import networkx as nx

import requests
import sqlalchemy
import ipywidgets

from bbsearch.widgets import ArticleSaver, SearchWidget, MiningWidget, SchemaRequest

In [13]:
# Turn off pandas' chained-assignment warning for the whole notebook.
pd.options.mode.chained_assignment = None

In [14]:
from jupyter_dash.comms import _send_jupyter_config_comm_request, _jupyter_config
from jupyter_dash import JupyterDash

import dash_cytoscape as cyto

from tqdm.notebook import tqdm

In [15]:
# JupyterDash configs
# Sends the Jupyter configuration request used by JupyterDash; presumably the
# answer arrives asynchronously, which is why the inference step runs in a
# later cell — confirm against the jupyter_dash documentation.
_send_jupyter_config_comm_request()

In [17]:
# Pause for three seconds before inferring the proxy configuration.
# NOTE(review): presumably this leaves time for the configuration request sent
# in the previous cell to be answered — confirm.
time.sleep(3)
JupyterDash.infer_jupyter_proxy_config()

In [18]:
# Load the extra layouts shipped with dash_cytoscape.
cyto.load_extra_layouts()

In [19]:
from cord_analytics.utils import (generate_curation_table,
                                  link_ontology,
                                  generate_comention_analysis,
                                  build_cytoscape_data,
                                  merge_with_ontology_linking,
                                  resolve_taxonomy_to_types)
            
from bbg_apps.curation_app import (curation_app)
from bbg_apps.visualization_app import (visualization_app)

In [55]:
# table_extractions = pd.read_csv("table_extract.csv")
# Read a previously curated table from disk and index it by entity name.
curated_table_extractions = pd.read_csv("curated_table.csv").set_index("entity")
# Hard-coded totals per co-occurrence factor that go with the table above.
factor_counts = {
    'paper': 18,
    'section': 1870,
    'paragraph': 8512,
}

# Set a Project

The user chooses / creates a project to host a KG.

* Use the [Nexus web application](https://bbp.epfl.ch/nexus/web) to get a token.
* Once a token is obtained then proceed to paste it below.

In [21]:
import getpass
# Prompt for the Nexus token without echoing it in the notebook.
TOKEN = getpass.getpass()

 ยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทย

In [22]:
from kgforge.core import KnowledgeGraphForge

In [23]:
# Configure a 'forge' to manage (create, access and deploy) the knowledge graph within a given Blue Brain Nexus Project.
FORGE_CONFIG_FILE = os.getenv("FORGE_CONFIG_FILE")
# Fail with an explicit, always-active error (asserts carry no message here and
# are stripped under -O) when the configuration path is not provided.
if FORGE_CONFIG_FILE is None:
    raise RuntimeError(
        "The FORGE_CONFIG_FILE environment variable must point to the forge configuration file."
    )
forge = KnowledgeGraphForge(FORGE_CONFIG_FILE, token=TOKEN, debug=True)

# Set topic
The user defines a topic.

In [24]:
import jwt
from IPython.display import display, HTML

In [25]:
# Currently selected topic and dataset resources; set by the widget callbacks below.
topic_resource = None
kg_resource = None
# Read the username claim from the token without verifying its signature.
# The `verify=False` keyword was removed in PyJWT 2.0; the `options` form is
# the documented replacement.
agent_username = jwt.decode(TOKEN, options={"verify_signature": False})['preferred_username']

def save_topic(b):
    """Register a new Topic built from the 'Create topic' form.

    Click handler for the 'Create' button. The topic name is validated
    *before* anything is sent to the store, so an empty name no longer
    registers a resource with an empty identifier.

    Args:
        b: The object passed by the button's click event (unused).
    """
    global topic_resource
    output.clear_output()
    output2.clear_output()
    output3.clear_output()

    topic_name = w1.value
    if not topic_name.strip():
        with output2:
            print("Please provide a topic name")
        return

    topic_to_save = {
        # The identifier is the topic name with spaces replaced by underscores.
        'id': str(topic_name).replace(' ', '_'),
        'type': 'Topic',
        'name': topic_name,
        'field': w2.value,
        'description': w3.value,
        'keywords': w4.value,
        'question': [question_field.value for question_field in (w5, w6, w7, w8)],
    }
    topic_resource = forge.from_json(topic_to_save)
    forge.register(topic_resource)
    with output2:
        print("Topic saved!")
    # Reset the form so that another topic can be entered.
    for form_field in (w1, w2, w3, w4, w5, w6, w7, w8):
        form_field.value = ""

def get_topics(b):
    """List the topics created by the current user in the topic dropdown.

    Click handler. Runs a SPARQL query for non-deprecated Topic resources
    created by ``agent_username``, stores the result in the global
    ``topics_df`` and shows the dropdown together with the edit form.

    Args:
        b: The object passed by the button's click event (unused).
    """
    global topics_df
    for out in (output, output2, output3):
        out.clear_output()
    query = f"""
    SELECT ?id ?name ?description ?keywords ?field ?question ?createdAt
    WHERE {{
        ?id a Topic ;
            name ?name ;
            description ?description ;
            keywords ?keywords ;
            field ?field ;
            question ?question ;
            <https://bluebrain.github.io/nexus/vocabulary/deprecated> false ;
            <https://bluebrain.github.io/nexus/vocabulary/createdAt> ?createdAt ;
            <https://bluebrain.github.io/nexus/vocabulary/createdBy> <{forge._store.endpoint}/realms/bbp/users/{agent_username}> .
    }}
    """
    resources = forge.sparql(query, limit=100)
    if len(resources) < 1:
        with output:
            print("No topics found!")
        return

    topics_df = forge.as_dataframe(resources)
    output.clear_output()
    with output:
        # Unique topic names in alphabetical order, preceded by an empty choice.
        w0.options = [""] + sorted(set(topics_df.name))
        w0.value = ""
        w0.placeholder = "Select topic"
        w0.observe(topics_change, names='value')
        display(w0)
        display(s12)

def topics_change(change):
    """Fill the edit form with the topic selected in the dropdown.

    Observer for the dropdown's ``value``. Clears the form, retrieves the
    selected topic into the global ``topic_resource`` and copies its
    attributes into the form widgets.

    Args:
        change: Change dictionary from the widget; only ``change['new']``
            (the newly selected topic name) is read.
    """
    global topic_resource
    output3.clear_output()
    with output:
        # Keep only the first displayed item and drop anything shown after it.
        if len(output.outputs) >= 1:
            output.outputs = (output.outputs[0],)
        for form_field in (s5, s6, s7, *sq.children):
            form_field.value = ""
        selected_name = change['new']
        if selected_name != "":
            matching_ids = set(topics_df[topics_df.name == selected_name].id)
            topic_resource = forge.retrieve(list(matching_ids)[0])
            s5.value = topic_resource.field
            s6.value = topic_resource.description
            s7.value = topic_resource.keywords
            question = topic_resource.question
            if isinstance(question, str):
                question = [question]
            if isinstance(question, list):
                # zip() stops at the shorter sequence, so a topic holding more
                # questions than there are input boxes no longer raises
                # IndexError; the extra questions are simply not shown.
                for question_field, question_text in zip(sq.children, question):
                    question_field.value = question_text
        display(s12)

def update_topic(b):
    """Write the edited form values back to the selected topic.

    Click handler. Does nothing beyond clearing ``output2`` when no topic
    is selected in the dropdown.

    Args:
        b: The object passed by the button's click event (unused).
    """
    output2.clear_output()
    if w0.value == "":
        return
    expanded_topic = forge.as_jsonld(topic_resource, form="expanded")
    topic_resource.id = expanded_topic['@id']
    topic_resource.field = s5.value
    topic_resource.description = s6.value
    topic_resource.keywords = s7.value
    topic_resource.question = [question_field.value for question_field in sq.children[0:4]]
    forge.update(topic_resource)
    with output:
        print("topic updated!")
        
def get_datasets(b):
    """Show the datasets attached to the selected topic.

    Click handler. Queries non-deprecated Dataset resources that are
    ``about`` the selected topic and were created by ``agent_username``,
    stores them in the global ``kg_resources`` and lists their names as
    radio buttons.

    Args:
        b: The object passed by the button's click event (unused).
    """
    global kg_resources
    output3.clear_output()
    if w0.value == "":
        return
    topic_resource_id = forge.as_jsonld(topic_resource, form="expanded")['@id']
    query = f"""
            SELECT ?id ?name ?description ?keywords ?field ?question ?createdAt
            WHERE {{
                ?id a Dataset ;
                    name ?name ;
                    about <{topic_resource_id}> ;
                    <https://bluebrain.github.io/nexus/vocabulary/deprecated> false ;
                    <https://bluebrain.github.io/nexus/vocabulary/createdAt> ?createdAt ;
                    <https://bluebrain.github.io/nexus/vocabulary/createdBy> <{forge._store.endpoint}/realms/bbp/users/{agent_username}> .
            }}
            """
    kg_resources = forge.sparql(query, limit=100, debug=True)
    print(len(kg_resources))
    if len(kg_resources) < 1:
        with output3:
            print("No datasets found!")
        return
    with output3:
        display(s2)
        s2.options = [dataset_resource.name for dataset_resource in kg_resources]
        display(s3)
        
def download_dataset(b):
    """Download the selected dataset and load its curated table.

    Click handler. Retrieves the dataset chosen in the radio buttons into
    the global ``kg_resource``, downloads its distributions to ``/tmp/``
    and reads every distribution whose name contains "curated" into the
    global ``table_extractions``.

    When no dataset matches the selection, or the dataset has no curated
    distribution, a message is shown and ``table_extractions`` is left
    untouched. (The original "no dataset" branch could never run because
    ``pd.read_csv`` does not return ``None``.)

    Args:
        b: The object passed by the button's click event (unused).
    """
    global kg_resource
    global table_extractions

    matching_ids = [r.id for r in kg_resources if r.name == s2.value]
    if not matching_ids:
        with output3:
            print("No dataset has been downloaded")
        return

    kg_resource = forge.retrieve(matching_ids[0])
    forge.download(kg_resource, "distribution.contentUrl", "/tmp/", overwrite=True)

    distributions = kg_resource.distribution
    # NOTE(review): a dataset with a single distribution may expose it as one
    # object rather than a list — wrapped defensively; confirm with kgforge.
    if not isinstance(distributions, (list, tuple)):
        distributions = [distributions]

    curated_distributions = [d for d in distributions if "curated" in d.name]
    if not curated_distributions:
        with output3:
            print("No dataset has been downloaded")
        return

    for distribution in curated_distributions:
        table_extractions = pd.read_csv(f"/tmp/{distribution.name}")
        message = f"Dataset '{distribution.name}' with {len(table_extractions)} entities ready to be reused. Its content has been assigned to the variable 'table_extractions'. Please continue with the interactive UI section to visualise this dataset."
        with output3:
            print(message)

s0 = ipywidgets.Button(
    description= '๐ฌ List all your topics',
    button_style='',
    layout=ipywidgets.Layout(width='300px', height='30px'),
    disabled=False)
s1 = ipywidgets.Button(
    description= "๐ Show datasets for selected topic",
    button_style='',
    layout=ipywidgets.Layout(width='300px', height='30px'),
    disabled=False)
s2 = ipywidgets.RadioButtons(
    description='Select:',
    disabled=False)
s3 = ipywidgets.Button(
    description= '๐ Reuse selected dataset',
    button_style='',
    layout=ipywidgets.Layout(width='300px', height='30px'),
    disabled=False)
s4 = ipywidgets.Button(
    description= 'โ๏ธ Update topic',
    button_style='',
    layout=ipywidgets.Layout(width='300px', height='30px'),
    disabled=False)
# --- Edit form shown once an existing topic is selected -------------------
s5 = ipywidgets.Text(description='Field:', disabled=False)
s6 = ipywidgets.Textarea(description='Description:', disabled=False)
s7 = ipywidgets.Textarea(description='Keywords:', disabled=False)
# Four unlabeled text boxes, one per question.
s8, s9, s10, s11 = (ipywidgets.Text(disabled=False) for _ in range(4))

sq = ipywidgets.VBox(children=[s8, s9, s10, s11])

s12 = ipywidgets.VBox(children=[s5, s6, s7, ipywidgets.Label('Questions:'), sq, s4])

# --- Topic selector and 'Create topic' form -------------------------------
w0 = ipywidgets.Dropdown(description='Select topic:', disabled=False)
w1 = ipywidgets.Text(
    placeholder='e.g. COVID-19', description='Topic name:', disabled=False)
w2 = ipywidgets.Text(
    placeholder='e.g. Neuroscience', description='Field:', disabled=False)
w3 = ipywidgets.Textarea(
    placeholder='Add a description of your topic', description='Description:', disabled=False)
w4 = ipywidgets.Textarea(
    placeholder='e.g. Coronavirus; COVID-19; SARS; risk factor; glycosylation; sugar; carbohydrates',
    description='Keywords:',
    disabled=False)
# Four identical question inputs.
w5, w6, w7, w8 = (
    ipywidgets.Text(placeholder='Add a question about your research topic', disabled=False)
    for _ in range(4)
)
w9 = ipywidgets.Button(
    description='Create', button_style='', tooltip='Create new topic', disabled=False)

# --- Output areas ----------------------------------------------------------
output, output2, output3 = (ipywidgets.Output() for _ in range(3))

# --- Layout: one tab to select a topic, one tab to create a topic ----------
buttons = ipywidgets.HBox(children=[s0, s1])
outputs = ipywidgets.HBox(children=[output, output3])
tab1 = ipywidgets.VBox(children=[buttons, outputs])
tab2 = ipywidgets.VBox(children=[
    w1, w2, w3, w4,
    ipywidgets.Label('Please express your research topic in a few questions:'),
    w5, w6, w7, w8, w9, output2,
])
widget = ipywidgets.Tab(children=[tab1, tab2])
widget.set_title(0, 'Select topic')
widget.set_title(1, 'Create topic')

# --- Wire the buttons to their handlers ------------------------------------
s0.on_click(get_topics)
s1.on_click(get_datasets)
s3.on_click(download_dataset)
s4.on_click(update_topic)
w9.on_click(save_topic)

display(widget)

Tab(children=(VBox(children=(HBox(children=(Button(description='๐ฌ List all your topics', layout=Layout(height=โฆ

# Data Import
The user loads data from a data source (CORD-19). The loaded data forms the corpus. The user searches the CORPUS in Blue Brain Search.

Search server URL

In [26]:
# Search server address: taken from the environment, with a built-in default.
SEARCH_ENGINE_URL = os.getenv("SEARCH_ENGINE_URL", "http://dgx1.bbp.epfl.ch:8850")
assert SEARCH_ENGINE_URL is not None

# Query the server's help endpoint and check that it identifies itself as a SearchServer.
response = requests.post(f"{SEARCH_ENGINE_URL}/help")
assert response.ok and response.json()['name'] == 'SearchServer', "The server is not accessible"
print("This server is using the database: {}".format(response.json()['database']))

This server is using the database: cord19_v47


MySQL URL and engine

In [27]:
# Host and port of the MySQL server: taken from the environment, with a built-in default.
MYSQL_DB_URI = os.getenv("MYSQL_DB_URI", "dgx1.bbp.epfl.ch:8853")
# SQLAlchemy engine for the 'cord19_v47' database, connecting as the 'guest' user.
bbs_mysql_engine = sqlalchemy.create_engine(f'mysql+pymysql://guest:guest@{MYSQL_DB_URI}/cord19_v47')

Article saver

In [28]:
# Article saver bound to the MySQL engine; it is shared by the search and mining widgets below.
article_saver = ArticleSaver(connection=bbs_mysql_engine)

Search widget

In [29]:
# Build the search widget on top of the search server, the MySQL engine and
# the shared article saver, showing three results per page.
search_widget = SearchWidget(
    bbs_search_url=SEARCH_ENGINE_URL,
    bbs_mysql_engine=bbs_mysql_engine,
    article_saver=article_saver,
    results_per_page=3)
# Last expression of the cell: renders the widget.
search_widget

SearchWidget(children=(RadioButtons(description='Model for Sentence Embedding', options=('BSV', 'Sent2Vec'), s…

Status of the Article Saver

In [30]:
# Display the article saver's summary table (one row per saved item, see output below).
article_saver.summary_table()

Unnamed: 0,article_id,paragraph_pos_in_article,option
0,146170,-1,Save full article
1,182010,-1,Save full article
2,227099,-1,Save full article
3,202464,-1,Save full article
4,228228,-1,Save full article
...,...,...,...
95,214754,-1,Save full article
96,187782,-1,Save full article
97,16072,-1,Save full article
98,139943,-1,Save full article


# Set schemas
The user defines the KG schema.

In [31]:
# Holder for the schema; its `schema` attribute is filled in the next cell.
schema_request = SchemaRequest()

In [32]:
# Columns of the schema table; only the entity type and ontology source are filled here.
columns = [
    'entity_type',
    'property',
    'property_type',
    'property_value_type',
    'ontology_source',
]

# (entity type, ontology source) pairs; None means no ontology source is given.
etypes_sources = [
    ('CELL_COMPARTMENT', None),
    ('CELL_TYPE', None),
    ('CHEMICAL', 'NCIT'),
    ('CONDITION', None),
    ('DISEASE', 'NCIT'),
    ('DRUG', None),
    ('ORGAN', 'NCIT'),
    ('ORGANISM', 'NCIT'),
    ('PATHWAY', 'Reactome'),
    ('PROTEIN', 'NCIT'),
]
schema_request_data = [
    dict(entity_type=etype, ontology_source=source)
    for etype, source in etypes_sources
]

schema_request.schema = pd.DataFrame(schema_request_data, columns=columns)
display(schema_request.schema)

Unnamed: 0,entity_type,property,property_type,property_value_type,ontology_source
0,CELL_COMPARTMENT,,,,
1,CELL_TYPE,,,,
2,CHEMICAL,,,,NCIT
3,CONDITION,,,,
4,DISEASE,,,,NCIT
5,DRUG,,,,
6,ORGAN,,,,NCIT
7,ORGANISM,,,,NCIT
8,PATHWAY,,,,Reactome
9,PROTEIN,,,,NCIT


# Create a knowledge graph according to schemas
The user extracts data from the text of a set of papers using selected Named Entity Recognizers and Relation Extractors from Blue Brain Search.
The user can preview the extracted data.
The user curates extracted data.
The user links the extracted entities and relations to ontologies.
The user saves data into Knowledge Graph.

- **input**: raw text
- **output**: csv table of extracted entities/relations

In [33]:
DEFAULT_TEXT = """Autophagy maintains tumour growth through circulating
arginine. Autophagy captures intracellular components and delivers them to
lysosomes, where they are degraded and recycled to sustain metabolism and to
enable survival during starvation. Acute, whole-body deletion of the essential 
autophagy gene Atg7 in adult mice causes a systemic metabolic defect that 
manifests as starvation intolerance and gradual loss of white adipose tissue, 
liver glycogen and muscle mass.  Cancer cells also benefit from autophagy. 
Deletion of essential autophagy genes impairs the metabolism, proliferation, 
survival and malignancy of spontaneous tumours in models of autochthonous 
cancer. Acute, systemic deletion of Atg7 or acute, systemic expression of a 
dominant-negative ATG4b in mice induces greater regression of KRAS-driven 
cancers than does tumour-specific autophagy deletion, which suggests that host 
autophagy promotes tumour growth.
""".replace('\n', ' ').replace('  ', ' ')

In [34]:
# Text mining server address: taken from the environment, with a built-in default.
TEXT_MINING_URL = os.getenv("TEXT_MINING_URL", "http://dgx1.bbp.epfl.ch:8852")
# Query the server's help endpoint and check that it identifies itself as a MiningServer.
response = requests.post(f"{TEXT_MINING_URL}/help")
assert response.ok and response.json()['name'] == 'MiningServer'
print("This server is using the database: {}".format(response.json()['database']))

This server is using the database: cord19_v47


In [35]:
# Build the mining widget on top of the mining server, the schema request and
# the shared article saver; DEFAULT_TEXT pre-fills its text area.
mining_widget = MiningWidget(
    mining_server_url=TEXT_MINING_URL,
    schema_request=schema_request,
    article_saver=article_saver,
    default_text=DEFAULT_TEXT)
# Last expression of the cell: renders the widget.
mining_widget

MiningWidget(children=(Textarea(value='Autophagy maintains tumour growth through circulating arginine. Autopha…

- **input**: csv table of extracted entities/relations
- **output**: knowledge graph

In [36]:
# Fetch the extractions produced by the mining widget.
table_extractions = mining_widget.get_extracted_table()

# Rows count as duplicates when they agree on every column except 'entity_type'.
columns_duplicates = list(table_extractions.columns)
columns_duplicates.remove('entity_type')

# Keep the first row of each duplicate group, then drop rows without an entity.
table_extractions = (
    table_extractions
    .drop_duplicates(subset=columns_duplicates, keep='first', ignore_index=True)
    .dropna(subset=["entity"])
)

## Curate the table with extracted entities

In [37]:
# Report how many extraction rows remain after de-duplication.
print(f'The table has {len(table_extractions)} rows.')

The table has 56655 rows.


In [38]:
# import pandas as pd
# import numpy as np
# import json
# import uuid

# import operator

# from typing import Iterator, Dict

In [39]:
# import jupyter_server_proxy
# import jupyter_dash
# import dash
# from dash.dependencies import Input, Output, State
# import dash_core_components as dcc
# import dash_table
# import plotly.express as px


# from pygments import highlight
# from pygments.lexers import JsonLdLexer, TurtleLexer
# from pygments.formatters import TerminalFormatter, TerminalTrueColorFormatter

# from tqdm.notebook import tqdm
# from dash.exceptions import PreventUpdate


# def pretty_print(a_json):
#     print(highlight(json.dumps(a_json, indent=2), JsonLdLexer(), TerminalFormatter()))

In [40]:
%%time

print("Setting default term filters: the user can remove them later on in the UI if need be ...")
# The term list mixes '; ' and ';' as separators ("ventilation;SARS"), so it is
# split on ';' and each term is stripped; splitting on '; ' fused those two
# terms into one that could never match an entity.
default_term_filters = [
    term.strip()
    for term in 'Glucose; Covid-19; SARS-CoV-2; Diabetes; IL-1; ACE2; glycosylation; hyperglycemia; shock; fatigue; CVD; vasoconstriction; lactate; insulin; SP-D; HbA1c; LDH; glycolysis; GLUT; macrophage; lymphocytes; ventilation;SARS; ARDS; Cytokine Storm; pneumonia; multi-organs failure; thrombosis; inflammation; IL-6; CRP; D-Dimer; Ferritin; Lung Disease; Hypertension; Aging; COPD; angiotensin 2 (or angiotensin II or AngII); Obesity; ICU (intensive care unit); ventilation; ketogenic diet'.split(";")
    if term.strip()
]
filtered_table_extractions = table_extractions.copy()

# Lower-case the entity column once instead of once per term.
lowercase_entities = filtered_table_extractions["entity"].str.lower()

# For every default term, collect the spellings (case variants) found in the table.
default_found_term_filters = set()
for term_filter in default_term_filters:
    entities_to_keep = filtered_table_extractions.loc[
        lowercase_entities == term_filter.lower(), "entity"].unique()
    if len(entities_to_keep) > 0:
        default_found_term_filters.add(tuple(entities_to_keep))
# Use the first spelling found for each term as the filter option.
term_filter_options = [term_filter[0] for term_filter in default_found_term_filters]
print("Done.")

print("Prepating curatation data...")
curation_input_table, factor_counts = generate_curation_table(filtered_table_extractions)
print("Done.")

print("Loading the ontology linking data...")
linking = pd.read_pickle("/gpfs/bbp.cscs.ch/project/proj116/network_analytics/data/cord_47_linking.pkl")

print("Loading default ontology type mapping...")
with open('/gpfs/bbp.cscs.ch/project/proj116/bbg/ontology-linking/ncit_to_mltypes_mapping.json', "rb") as f:
    default_type_mapping = json.load(f)

print("Done.")

Setting default term filters: the user can remove them later on in the UI if need be ...
Done.
Prepating curatation data...
Cleaning up the entities...
Aggregating occurrences of entities....
Done.
Loading the ontology linking data...
Loading default ontology types...
Done.
CPU times: user 12.1 s, sys: 1.46 s, total: 13.5 s
Wall time: 13.5 s


Run the curation app

In [56]:
# Pre-select the default terms found in the extractions.
curation_app.set_default_terms_to_include(term_filter_options)
# Hand the app its own copy of the curation table.
curation_app.set_table(curation_input_table.copy())
# Ontology linking is delegated to link_ontology with the linking data and type mapping loaded above.
curation_app.set_ontology_linking_callback(lambda x: link_ontology(linking, default_type_mapping, x))

curation_app.run(port=8077)

Merging the occurrence data with the ontology linking...


## Create a co-mention graph from curated entities

In [46]:
# Fetch the table as curated in the curation app; this replaces any previously loaded curated table.
curated_table_extractions = curation_app.get_curated_table()

In [47]:
# Turn the occurrence collections of every factor column into sets.
for factor_column in ("paper", "paragraph", "section"):
    curated_table_extractions[factor_column] = curated_table_extractions[factor_column].transform(
        lambda occurrences: set(occurrences))

# Entity types, exposed under the column name 'type'.
type_data = curated_table_extractions[["entity_type"]].rename(columns={"entity_type": "type"})

# Paper-level co-mention analysis: entities with at least 5 occurrences,
# restricted to the 100 most frequent ones.
graphs, trees = generate_comention_analysis(
    curated_table_extractions,
    factor_counts,
    type_data=type_data,
    min_occurrences=5,
    n_most_frequent=100,
    factors=["paper"],
    cores=10)

-------------------------------
Factor: paper
-------------------------------
Fitering data.....
Selected 100 most frequent terms
Examining 4950 pairs of terms for co-occurrence...
Generated 4941 edges                    
Created a co-occurrence graph:
	number of nodes:  100
	number of edges:  4941
Saving the edges...
Creating a graph object...
Computing degree centrality statistics....
Top n nodes by frequency:
	infectious disorder (1732)
	virus (1656)
	lung (1651)
	heart (1495)
	blood (1479)
	dead (1461)
	cardiovascular (1456)
	diabetes mellitus (1431)
	hypertension (1377)
	pulmonary (1365)

Computing PageRank centrality statistics....
Top n nodes by frequency:
	infectious disorder (0.02)
	virus (0.02)
	lung (0.02)
	heart (0.02)
	blood (0.02)
	dead (0.02)
	cardiovascular (0.02)
	diabetes mellitus (0.02)
	hypertension (0.02)
	pulmonary (0.02)

Computing betweenness centrality statistics....
Detecting communities...
Best network partition:
	 Number of communities: 3
	 Modularity: 0.043

In [48]:
# for n in graphs["paper"].nodes():
#     print(graphs["paper"].nodes[n])

In [49]:
# Cytoscape representation of the spanning tree and of the full graph, per factor.
cytoscape_graphs = {
    factor: {
        "tree": build_cytoscape_data(trees[factor]),
        "graph": build_cytoscape_data(graphs[factor]),
    }
    for factor in ["paper"]
}

In [50]:
# # Build knowledge graph from enriched annotations
# import json
# from typing import Iterable, Dict
# from rdflib import Graph
# from rdflib.extras.external_graph_libs import rdflib_to_networkx_digraph
# import networkx as nx
# from rdflib.namespace import RDF, RDFS, SKOS


# from kganalytics_utils import generate_comention_analysis
# # Generate a paper-based network from a mentions data frame:
# # - we select entities that are mentioned at least 5 times
# # - and we then take only 100 most frequent


In [51]:
# curated_table_extractions_grouped = curated_table_extractions.rename(columns={"entity": "entity_raw"})

# Validate the knowledge graph
Content of the Knowledge Graph is validated. In this version, syntactic validation (i.e., are the identifiers correct, ...) is performed when building the knowledge graph. If the knowledge graph is successfully built, then the validation passes. In case of a warning (e.g., because of an unusual character such as `+` in an extracted entity), the user can go back to the curation step and further curate the extracted entities.

# Correct knowledge graph
Correction involves going back to the extraction and/or curation steps.

# Access the knowledge graph
The user can search, visualize, and export the knowledge graph.

In [52]:
%%time
print("Loading pre-generated 3000 graphs...")
# Spanning tree: flat element list (nodes then edges) plus a lookup by 'uid'.
with open("/gpfs/bbp.cscs.ch/project/proj116/cytoscape_3000/paper_spanning_tree_3000.cyjs") as tree_file:
    paper_spanning_tree = json.load(tree_file)

paper_spanning_tree_list = (
    paper_spanning_tree["elements"]["nodes"] + paper_spanning_tree["elements"]["edges"])
paper_spanning_tree_dict = {}
for elt in paper_spanning_tree_list:
    if "uid" in elt['data']:
        paper_spanning_tree_dict[elt['data']['uid']] = elt

# Clustered spanning tree: same treatment.
with open("/gpfs/bbp.cscs.ch/project/proj116/cytoscape_3000/paper_clusters_3000.cyjs", "r") as clusters_file:
    paper_spanning_clusters = json.load(clusters_file)

paper_spanning_clusters_list = (
    paper_spanning_clusters["elements"]["nodes"] + paper_spanning_clusters["elements"]["edges"])
paper_spanning_clusters_dict = {}
for elt in paper_spanning_clusters_list:
    if "uid" in elt['data']:
        paper_spanning_clusters_dict[elt['data']['uid']] = elt
print("Done.")

print("Loading pre-generated 3000 styles...")
# Each style file holds a list; the 'style' entry of its first item is used.
with open("/gpfs/bbp.cscs.ch/project/proj116/cytoscape_3000/paper_spanning_tree_3000_styles.json", "r") as tree_styles_file:
    paper_spanning_tree_styles = json.load(tree_styles_file)[0]['style']

with open("/gpfs/bbp.cscs.ch/project/proj116/cytoscape_3000/paper_clusters_3000_styles.json", "r") as clusters_styles_file:
    paper_spanning_clusters_styles = json.load(clusters_styles_file)[0]['style']
print("Done.")

Loading pre-generated 3000 graphs...
Done.
Loading pre-generated 3000 styles...
Done.
CPU times: user 104 ms, sys: 28.1 ms, total: 132 ms
Wall time: 131 ms


In [53]:
# Register the two graphs computed in this notebook; items 0 and 1 of each
# build_cytoscape_data result are passed as the second and third arguments.
visualization_app.set_graph(
    "Full co-mention graph", cytoscape_graphs["paper"]["graph"][0], cytoscape_graphs["paper"]["graph"][1])
visualization_app.set_graph(
    "Co-mention spanning tree", cytoscape_graphs["paper"]["tree"][0], cytoscape_graphs["paper"]["tree"][1])
# Register the two pre-computed graphs: element list, lookup by 'uid', and styles.
visualization_app.set_graph(
    "Pre-computed spanning tree (3000)", paper_spanning_tree_list,
    paper_spanning_tree_dict, paper_spanning_tree_styles)
visualization_app.set_graph(
    "Pre-computed clustered spanning tree (3000)", paper_spanning_clusters_list,
    paper_spanning_clusters_dict, paper_spanning_clusters_styles)

# Select the graph to use as the current one.
visualization_app.set_current_graph("Full co-mention graph")

In [54]:
# Start the visualization app.
# NOTE(review): the port is given as a string here but as an integer for the curation app — confirm which form run() expects.
visualization_app.run(port="8072")

In [17]:
# Debugging aid: read the app's private `_current_graph` attribute (presumably the name of the selected graph).
visualization_app._current_graph

'Full knowledge graph'

In [24]:
# %%time
# import pickle
# print("Loading precomputed co-mention graphs for 3000 extracted entities...")

# factors = ["paper", "section", "paragraph"]
# weights = ["npmi", "ppmi"]
# trees = {}

# precomputed_nodes_df = {}
# precomputed_edges_df = {}
# # open graphs if they where already generated
# graphs = {}
# for factor in tqdm(factors):
#     with open("/gpfs/bbp.cscs.ch/project/proj116/network_analytics/data/graphs/cord_47/full_{}_3000_edge_list.pkl".format(factor), "rb") as f:
#         edges = pickle.load(f)
#     precomputed_edges_df[factor] = edges

#     graph = nx.from_pandas_edgelist(
#         edges,
#          edge_attr=[
#             "frequency",
#             "ppmi",
#             "npmi",
#             "distance_ppmi",
#             "distance_npmi"

#          ])
#     with open("/gpfs/bbp.cscs.ch/project/proj116/network_analytics/data/graphs/cord_47/full_{}_3000_node_list.pkl".format(factor), "rb") as f:
#         nodes = pickle.load(f)
#     nx.set_node_attributes(graph, nodes.to_dict("index"))
#     precomputed_nodes_df[factor] = nodes
#     graphs[factor] = graph
    
#     trees[factor] = {}
#     with open("/gpfs/bbp.cscs.ch/project/proj116/network_analytics/data/graphs/cord_47/full_{}_3000_edge_list.pkl".format(factor), "rb") as f:
#         tree_edges = pickle.load(f)
#         tree_edges = tree_edges.rename(columns={"Source": "source", "Target": "target"})
#         tree = nx.from_pandas_edgelist(tree_edges)
#         trees[factor] = tree
# print("Done")

In [None]:
def create_edge(id, from_id, to_id, label=None, label_size=10, label_color="black", thickness=2, edge_color="grey", edge_style="solid", frequency=1, papers=None):
    """Build the dictionary describing one edge (``data`` and ``style``).

    Args:
        id: Edge identifier; stored as a string.
        from_id: Source node identifier; stored as a lower-cased string.
        to_id: Target node identifier; stored as a lower-cased string.
        label: Edge label; an empty string is used when it is missing or empty.
        label_size, label_color, edge_color, edge_style: Accepted for
            interface compatibility but not used by this function.
        thickness: Edge width; a value of 0 is replaced by 2.
        frequency: Stored as-is in the edge data.
        papers: Stored as-is in the edge data. Defaults to a new empty list
            for every call, so edges never share one default list.

    Returns:
        A dict with the keys ``"data"`` and ``"style"``.
    """
    if papers is None:
        papers = []
    if thickness == 0:
        thickness = 2
    return {
        "data": {
            "id": str(id),
            "source": str(from_id).lower(),
            "target": str(to_id).lower(),
            "frequency": frequency,
            "papers": papers,
        },
        "style": {
            "label": label if label else '',
            "width": thickness,
        },
    }

def create_node(id, node_type=None, label=None, label_size=10, label_color="black", radius=30, node_color='grey', frequency=None, definition="", papers=None):
    """Build the dictionary describing one node (``data`` and ``style``).

    Args:
        id: Node identifier; stored as a lower-cased string.
        node_type: Stored under the ``"type"`` key of the node data.
        label: Displayed label (lower-cased). When missing, the label is the
            lower-cased identifier's last fragment after ``/`` and ``#``.
        label_size, label_color, radius, node_color: Accepted for interface
            compatibility but not used by this function.
        frequency: Mapping that may hold ``"frequency"``,
            ``"degree_frequency"`` and ``"pagerank_frequency"``. A missing
            ``"frequency"`` counts as 1; the other two fall back to that value.
        definition: Stored as-is in the node data.
        papers: Stored as-is in the node data. Defaults to a new empty list
            for every call, so nodes never share one default list.

    Returns:
        A dict with the keys ``"data"`` and ``"style"``.
    """
    if frequency is None:
        frequency = {}
    if papers is None:
        papers = []

    if label is not None:
        actual_label = label.lower()
    else:
        # Keep only the last fragment of an IRI-like identifier.
        actual_label = str(id).lower().split("/")[-1].split("#")[-1]

    frequency_raw = frequency['frequency'] if 'frequency' in frequency else 1
    return {
        "data": {
            "id": str(id).lower(),
            "frequency": frequency_raw,
            "degree_frequency": frequency['degree_frequency'] if 'degree_frequency' in frequency else frequency_raw,
            "pagerank_frequency": frequency['pagerank_frequency'] if 'pagerank_frequency' in frequency else frequency_raw,
            "definition": definition,
            "papers": papers,
            "type": node_type,
        },
        "style": {
            "label": actual_label,
        },
    }

In [None]:
import json
import os

import dash
from dash.dependencies import Input, Output, State
import dash_html_components as html
import dash_bootstrap_components as dbc
from kganalytics.paths import top_n_paths, top_n_tripaths
from collections import OrderedDict
from networkx.readwrite.json_graph.cytoscape import cytoscape_data

import dash_cytoscape as cyto

from dash.exceptions import PreventUpdate

In [None]:
def load_json(st):
    """Load JSON from an HTTP(S) URL or from a local file.

    Args:
        st: An ``http://`` or ``https://`` URL, or a path to a local file.

    Returns:
        The decoded JSON content.

    Raises:
        requests.HTTPError: If the URL answers with an error status.
    """
    location = str(st)
    # Only a real URL scheme selects the network branch; a substring test would
    # also send local paths that merely contain "http" to the network.
    if location.startswith(("http://", "https://")):
        response = requests.get(location)
        # Report HTTP errors as such instead of failing later while decoding
        # an error page as JSON.
        response.raise_for_status()
        return response.json()
    with open(location, 'rb') as f:
        return json.load(f)
    
# Load extra layouts


# app_tab.config['suppress_callback_exceptions']=True
# width = 
# app_tab.height = "800px"
# app_tab.run_server(mode="jupyterlab", width="100%", port="8072")

# Version the knowledge graph
The user can save a knowledge graph with a version.

In [None]:
import time
# One timestamp shared by both file names.
timestr = time.strftime("%Y%m%d-%H%M%S")

# Temporarily save the extracted entities as a local CSV file.
table_extractions_filename = f"./table_extractions_{timestr}.csv"
table_extractions.to_csv(table_extractions_filename)

# Temporarily save the curated list of extracted entities as a local CSV file.
curated_table_extractions_filename = f"./curated_table_extractions_{timestr}.csv"
curated_table_extractions.to_csv(curated_table_extractions_filename)

In [None]:
import jwt
from kgforge.core import Resource
from kgforge.specializations.resources import Dataset

# Read the claims of the token without verifying its signature. The
# `verify=False` keyword was removed in PyJWT 2.0; the `options` form is the
# documented replacement.
agent = jwt.decode(TOKEN, options={"verify_signature": False})

# Keep only the identifying claims and describe the user as a Person.
agent = forge.reshape(forge.from_json(agent), keep=["name","email","sub","preferred_username"])
agent.id = agent.sub
agent.type = "Person"

# Dataset holding both CSV files, attributed to the current user.
# NOTE(review): `about` is set to the topic *name*, while get_datasets looks
# datasets up by the topic's expanded identifier — confirm that the two match.
dataset = Dataset(forge, name="A dataset", about=topic_resource.name)
dataset.add_distribution(table_extractions_filename, content_type="application/csv")
dataset.add_distribution(curated_table_extractions_filename, content_type="application/csv")
dataset.add_contribution(agent)
dataset.contribution.hadRole = "Scientists"

In [None]:
# Default tag offered in the UI: "<username>_<timestamp>".
version = "_".join([agent.preferred_username, timestr])

def register_dataset(b):
    """Register the dataset under the name and description typed in the form.

    Click handler. Shows either a confirmation or the message of the
    failed action in ``output4``.

    Args:
        b: The object passed by the button's click event (unused).
    """
    for out in (output4, output5):
        out.clear_output()
    dataset.name = t1.value
    dataset.description = t2.value
    forge.register(dataset)
    registered = dataset._last_action.succeeded == True
    with output4:
        print("Dataset registered!" if registered else dataset._last_action.message)

def version_dataset(b):
    """Tag the dataset with the value of the 'Tag' field.

    Click handler. Shows the tag on success; on failure it now shows the
    message of the failed action, as register_dataset does, instead of
    printing nothing.

    Args:
        b: The object passed by the button's click event (unused).
    """
    output5.clear_output()
    tag = t3.value
    forge.tag(dataset, tag)
    with output5:
        if dataset._last_action.succeeded:
            print(f"Tagged with: {str(tag)}")
        else:
            print(dataset._last_action.message)
    
# Feedback areas for the register (output4) and tag (output5) actions.
output4, output5 = ipywidgets.Output(), ipywidgets.Output()

b1 = ipywidgets.Button(
    description= '๐พ  Register Dataset',
    button_style='',
    layout=ipywidgets.Layout(width='300px', height='30px'),
    disabled=False)

b2 = ipywidgets.Button(
    description= '๐ Tag Dataset',
    button_style='',
    layout=ipywidgets.Layout(width='300px', height='30px'),
    disabled=False)

# Input fields of the dataset form.
t1 = ipywidgets.Text(
    placeholder='Add a name for your dataset', description='Name:', disabled=False)
t2 = ipywidgets.Textarea(
    placeholder='Add a description of your dataset', description='Description:', disabled=False)
# The tag field starts with the default version string.
t3 = ipywidgets.Text(description='Tag:', value=version, disabled=False)

# Wire the buttons to their handlers.
b2.on_click(version_dataset)
b1.on_click(register_dataset)

# Register form first, then the tag form, each followed by its feedback area.
save_widget = ipywidgets.VBox(children=[t1, t2, b1, output4, t3, b2, output5])

display(save_widget)