# Goal of the notebook
End-to-end pipeline for searching articles of interest, extracting entities of interest, and building, accessing and deploying a knowledge graph and a co-mention graph.

In [1]:
import getpass
import json
import os
import pathlib
import requests
import time

import ipywidgets
import networkx as nx
import pandas as pd
import sqlalchemy

import jwt

from bbsearch.widgets import ArticleSaver, MiningSchema, MiningWidget, SearchWidget

In [2]:
# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
pd.set_option("mode.chained_assignment", None)

In [3]:
from jupyter_dash.comms import _send_jupyter_config_comm_request, _jupyter_config
from jupyter_dash import JupyterDash

import dash_cytoscape as cyto

from tqdm.notebook import tqdm

In [4]:
# JupyterDash configs
# Send JupyterDash's (private) configuration request to the notebook front-end.
# NOTE(review): presumably this must run in its own cell so that the reply can
# arrive before `JupyterDash.infer_jupyter_proxy_config()` is called in the next
# cell — confirm against the installed jupyter_dash version.
_send_jupyter_config_comm_request()

In [None]:
# Wait before reading the proxy configuration requested in the previous cell.
# NOTE(review): 3 seconds is a fixed guess — confirm it is long enough on a slow
# JupyterHub connection.
time.sleep(3)
JupyterDash.infer_jupyter_proxy_config()
# Make dash_cytoscape's extra layouts available to the Dash apps used below.
cyto.load_extra_layouts()

In [6]:
from cord_analytics.utils import (generate_curation_table,
                                  link_ontology,
                                  generate_comention_analysis,
                                  build_cytoscape_data,
                                  merge_with_ontology_linking,
                                  resolve_taxonomy_to_types,
                                  list_papers)
            
from bbg_apps.curation_app import (curation_app)
from bbg_apps.visualization_app import (visualization_app)
from bbg_apps.topic_app import (TopicWidget, DataSaverWidget)

from kganalytics.export import load_network

In [7]:
from kgforge.core import KnowledgeGraphForge

In [8]:
# Load the ontology linking table; it is used further down for ontology linking
# in the curation app and for the entity definitions in the visualization app.
# NOTE(review): `read_pickle` can execute arbitrary code on load — acceptable only
# because this is a project-owned file; the absolute GPFS path ties the notebook
# to that file system.
print("Loading the ontology linking data...")
linking = pd.read_pickle("/gpfs/bbp.cscs.ch/project/proj116/network_analytics/data/cord_47_linking.pkl")
print("Done.")
# Registry of named graphs handed to the visualization app; filled in by later cells.
GRAPH_OBJECTS = {}

Loading the ontology linking data...
Done.


# Set a Project

The user chooses / creates a project to host a KG.

* Use the [Nexus web application](https://bbp.epfl.ch/nexus/web) to get a token.
* Once a token is obtained then proceed to paste it below.

In [9]:
# Ask for the Nexus token without echoing it into the notebook.
# `getpass` is already imported in the first cell, so it is not re-imported here.
TOKEN = getpass.getpass()

 ·······································································································································································································································································································································································································································································································································································································································································································································································································································································································································································································································

In [11]:
# Configure a 'forge' to manage (create, access and deploy) the knowledge graph within a given Blue Brain Nexus Project.
FORGE_CONFIG_FILE = os.getenv("FORGE_CONFIG_FILE")
assert FORGE_CONFIG_FILE is not None, "The environment variable FORGE_CONFIG_FILE is not set"
forge = KnowledgeGraphForge(FORGE_CONFIG_FILE, token=TOKEN, debug=True)

# Read the user name from the token payload without validating the token.
# The former `verify=False` keyword is deprecated in PyJWT 1.x and no longer
# honoured by PyJWT 2.x (which then demands `algorithms`); the `options`
# dictionary is the supported way to skip validation. The claim checks are
# switched off explicitly so that older PyJWT releases behave the same way.
agent_username = jwt.decode(
    TOKEN,
    options={
        "verify_signature": False,
        "verify_exp": False,
        "verify_nbf": False,
        "verify_iat": False,
        "verify_aud": False,
    },
)['preferred_username']

# Set topic
The user defines a topic.

In [12]:
# Widget for listing/creating topics on behalf of the user identified by the token.
widget = TopicWidget(forge, agent_username)
widget.display()

Tab(children=(VBox(children=(HBox(children=(Button(description='🔬 List all your topics', layout=Layout(height=…

In [13]:
# Pull everything the topic widget holds for the selected topic.
# Several of these names (`table_extractions`, `curated_table_extractions`,
# `curation_meta_data`, `loaded_graphs`) are assigned again by later cells, so the
# values obtained here are overwritten when those cells are run.
(
    table_extractions,
    curated_table_extractions,
    curation_meta_data,
    loaded_graphs,
    visualization_configs,
    topic_resource_id
) = widget.get_all()

# Data Import
The user loads data from a data source (CORD-19). The loaded data forms the corpus. The user searches the CORPUS in Blue Brain Search.

Search server URL

In [14]:
# Address of the search server; the environment variable overrides the default.
SEARCH_ENGINE_URL = os.getenv("SEARCH_ENGINE_URL", "http://dgx1.bbp.epfl.ch:8850")

# Query the /help endpoint and check that the answer comes from a search server.
help_url = "{}/help".format(SEARCH_ENGINE_URL)
response = requests.post(help_url)
assert response.ok, "The server is not accessible"
server_info = response.json()
assert server_info['name'] == 'SearchServer', "The server is not accessible"
print(f"This server is using the database: {server_info['database']}")

This server is using the database: cord19_v47


MySQL URL and engine

In [15]:
# SQLAlchemy engine for the `cord19_v47` MySQL database (host overridable via MYSQL_DB_URI).
# NOTE(review): the database name and the guest credentials are hard-coded, whereas
# the search and mining servers report their database at runtime — confirm they
# are meant to stay in sync.
MYSQL_DB_URI = os.getenv("MYSQL_DB_URI", "dgx1.bbp.epfl.ch:8853")
bbs_mysql_engine = sqlalchemy.create_engine(f'mysql+pymysql://guest:guest@{MYSQL_DB_URI}/cord19_v47')

Article saver

In [16]:
# Shared between the search widget and the mining widget below.
article_saver = ArticleSaver(connection=bbs_mysql_engine)

Search widget

In [17]:
# Interactive search UI; it is given `article_saver` so selections can be read back later.
search_widget = SearchWidget(
    bbs_search_url=SEARCH_ENGINE_URL,
    bbs_mysql_engine=bbs_mysql_engine,
    article_saver=article_saver,
    results_per_page=3)
# Bare expression: renders the widget as the cell output.
search_widget

SearchWidget(children=(Textarea(value='Glucose is a risk factor for COVID-19', description='Query', layout=Lay…

Show saved articles and paragraphs

In [18]:
# Table of what has been saved through the search widget so far.
df_results = search_widget.saved_results()
df_results

Unnamed: 0,Article ID,Paragraph #,Paragraph,Article,Title
0,184360,,,✓,DPP4 and ACE2 in Diabetes and COVID-19: Therap...
1,147333,,,✓,Blood glucose levels and COVID-19. Reply to Sa...
2,214728,,,✓,Blood glucose levels and COVID-19. Reply to Sa...
3,214924,,,✓,COVID-19 and diabetes mellitus: how one pandem...
4,179426,,,✓,"Diabetes in COVID-19: Prevalence, pathophysiol..."
5,139943,,,✓,COVID-19 outbreak and pediatric diabetes: Perc...
6,78018,,,✓,Usefulness and safety of remote continuous glu...
7,102473,,,✓,In response: Diabetes is a risk factor for the...
8,211373,,,✓,SARS-CoV-2 and DPP4 inhibition: is it time to ...
9,35198,,,✓,Hyperglycemia at Hospital Admission Is Associa...


In [19]:
# Boolean masks over the saved-results table: whole article saved / only a paragraph saved.
saved_as_article = df_results['Article'] == '✓'
saved_as_paragraph_only = (df_results['Paragraph'] == '✓') & ~saved_as_article

# Counts reported below.
n_listed_articles = len(df_results['Article ID'].unique())
n_saved_paragraphs = int(saved_as_paragraph_only.sum())
n_saved_articles = len(df_results.loc[saved_as_article, 'Article ID'].unique())

print(f"""For information: \n 
      - The query showed {n_listed_articles} different articles.
      - Saved {n_saved_paragraphs} paragraph(s)
      - Saved {n_saved_articles} article(s)""")

For information: 
 
      - The query showed 20 different articles.
      - Saved 0 paragraph(s)
      - Saved 20 article(s)


# Set schemas
The user defines the KG schema.

In [20]:
# Entity types to mine, in registration order; the second item carries the
# optional keyword arguments (here only the ontology source) for `add_entity`.
entity_specs = [
    ("CELL_COMPARTMENT", {}),
    ("CELL_TYPE", {}),
    ("CHEMICAL", {"ontology_source": "NCIT"}),
    ("CONDITION", {}),
    ("DISEASE", {"ontology_source": "NCIT"}),
    ("DRUG", {}),
    ("ORGAN", {"ontology_source": "NCIT"}),
    ("ORGANISM", {"ontology_source": "NCIT"}),
    ("PATHWAY", {"ontology_source": "Reactome"}),
    ("PROTEIN", {"ontology_source": "NCIT"}),
]

mining_schema = MiningSchema()
for entity_type, entity_options in entity_specs:
    # Entities without an ontology are registered with no extra keyword at all.
    mining_schema.add_entity(entity_type, **entity_options)

mining_schema.df

Unnamed: 0,entity_type,property,property_type,property_value_type,ontology_source
0,CELL_COMPARTMENT,,,,
1,CELL_TYPE,,,,
2,CHEMICAL,,,,NCIT
3,CONDITION,,,,
4,DISEASE,,,,NCIT
5,DRUG,,,,
6,ORGAN,,,,NCIT
7,ORGANISM,,,,NCIT
8,PATHWAY,,,,Reactome
9,PROTEIN,,,,NCIT


# Create a knowledge graph according to schemas
The user extracts data from the text of a set of papers using selected Named Entity Recognizers and Relation Extractors from Blue Brain Search.
The user can preview the extracted data.
The user curates extracted data.
The user links the extracted entities and relations to ontologies.
The user saves data into Knowledge Graph.

- **input**: raw text
- **output**: csv table of extracted entities/relations

In [21]:
# Address of the text mining server; the environment variable overrides the default.
TEXT_MINING_URL = os.getenv("TEXT_MINING_URL", "http://dgx1.bbp.epfl.ch:8852")

# Query the /help endpoint and check that the answer comes from a mining server.
# The assertions carry a message, like the search-server check earlier in the
# notebook, and the JSON body is parsed only once.
response = requests.post(TEXT_MINING_URL + "/help")
assert response.ok, "The mining server is not accessible"
server_info = response.json()
assert server_info['name'] == 'MiningServer', "The mining server is not accessible"
print(f"This server is using the database: {server_info['database']}")

This server is using the database: cord19_v47


In [22]:
# Interactive mining UI, wired to the mining server, the schema defined above and
# the same `article_saver` that the search widget was given.
mining_widget = MiningWidget(
    mining_server_url=TEXT_MINING_URL,
    mining_schema=mining_schema,
    article_saver=article_saver,
)
# Bare expression: renders the widget as the cell output.
mining_widget

MiningWidget(children=(Tab(children=(VBox(children=(Button(description='⚒️  Mine Selected Articles!', layout=L…

In [23]:
# Get DataFrame of extractions
table_extractions = mining_widget.get_extracted_table()

# Fail early, with a hint, when mining has not produced a table yet. The check is
# explicit so that errors raised by the pandas calls below are not misreported as
# "table not found".
if table_extractions is None or 'entity_type' not in table_extractions.columns:
    raise ValueError(
        "Could not find the extraction table, make sure you have launched the mining procedure in the widget above"
    )

# Drop duplicates in DataFrame
# Rows that differ only in 'entity_type' count as duplicates; the first one is kept.
columns_duplicates = [column for column in table_extractions.columns if column != 'entity_type']
table_extractions = table_extractions.drop_duplicates(subset=columns_duplicates, keep='first', ignore_index=True)
# Rows without an entity cannot be curated or linked.
table_extractions = table_extractions.dropna(subset=["entity"])

## Curate the table with extracted entities

- **input**: csv table of extracted entities/relations
- **output**: csv table with curated and ontology linked entities/relations

In [24]:
# Number of extraction rows left after de-duplication.
print(f'The table has {len(table_extractions)} rows.')

The table has 3526 rows.


In [25]:
%%time

print("Setting default term filters: the user can remove them later on in the UI if need be ...")
# Split on ';' and strip the pieces, so that entries written without a space after
# the separator ("ventilation;SARS") are still read as two separate terms.
default_term_filters = [
    term.strip()
    for term in 'Glucose; Covid-19; SARS-CoV-2; Diabetes; IL-1; ACE2; glycosylation; hyperglycemia; shock; fatigue; CVD; vasoconstriction; lactate; insulin; SP-D; HbA1c; LDH; glycolysis; GLUT; macrophage; lymphocytes; ventilation;SARS; ARDS; Cytokine Storm; pneumonia; multi-organs failure; thrombosis; inflammation; IL-6; CRP; D-Dimer; Ferritin; Lung Disease; Hypertension; Aging; COPD; angiotensin 2 (or angiotensin II or AngII); Obesity; ICU (intensive care unit); ventilation; ketogenic diet'.split(";")
    if term.strip()
]
filtered_table_extractions = table_extractions.copy()

# Lower-case the entity column once rather than once per term.
entities = filtered_table_extractions["entity"]
entities_lower = entities.str.lower()

# A term matches an entity only on exact, case-insensitive equality, so composite
# entries such as "ICU (intensive care unit)" match only entities spelled that way.
# Each match is recorded as the tuple of spellings found in the table. The options
# keep the order of the term list and skip repeated terms ("ventilation" is listed
# twice), so the UI shows the same list on every run.
default_found_term_filters = set()
term_filter_options = []
for term_filter in default_term_filters:
    entities_to_keep = entities[entities_lower == term_filter.lower()].unique()
    if len(entities_to_keep) == 0:
        continue
    spellings = tuple(entities_to_keep)
    if spellings not in default_found_term_filters:
        default_found_term_filters.add(spellings)
        # The first spelling found stands for the term in the UI.
        term_filter_options.append(spellings[0])
print("Done.")

# Build the table shown in the curation app, plus the per-factor counts reused later.
print("Prepating curatation data...")
curation_input_table, factor_counts = generate_curation_table(filtered_table_extractions)
print("Done.")

# Default mapping passed to `link_ontology` by the curation app's linking callback.
print("Loading default ontology type mapping...")
with open(
        '/gpfs/bbp.cscs.ch/project/proj116/bbg/ontology-linking/ncit_to_mltypes_mapping.json',
        "rb") as mapping_file:
    default_type_mapping = json.load(mapping_file)

print("Done.")

Setting default term filters: the user can remove them later on in the UI if need be ...
Done.
Prepating curatation data...
Cleaning up the entities...
Aggregating occurrences of entities....
Done.
Loading default ontology type mapping...
Done.
CPU times: user 364 ms, sys: 11.1 ms, total: 375 ms
Wall time: 358 ms


Run the curation app. In case of the error 'Address already in use', try specifying another port (for example, in the range 8072-8099)

In [26]:
# Pre-select in the curation UI the default terms that were found among the extracted entities.
curation_app.set_default_terms_to_include(term_filter_options)
# The app receives a copy, presumably so that edits made in the UI leave
# `curation_input_table` untouched.
curation_app.set_table(curation_input_table.copy())
# Linking is delegated to `link_ontology`, using the linking table and type mapping loaded earlier.
curation_app.set_ontology_linking_callback(lambda x: link_ontology(linking, default_type_mapping, x))

# Start the app; change the port here if 8070 is already in use.
curation_app.run(port=8070)

Merging the occurrence data with the ontology linking...


In [27]:
# Read the curation result back from the app.
curated_table_extractions = curation_app.get_curated_table()
# Settings needed to regenerate the graphs; 100 is used when the app reports no
# (or a zero) node limit.
curation_meta_data = {
    "factor_counts": factor_counts,
    "nodes_to_keep": curation_app.get_terms_to_include(),
    "n_most_frequent": curation_app.n_most_frequent or 100,
}


In [28]:
# Convert every cell of the occurrence columns into a set.
for occurrence_column in ("paper", "paragraph", "section"):
    curated_table_extractions[occurrence_column] = (
        curated_table_extractions[occurrence_column].apply(lambda occurrences: set(occurrences))
    )

## Create a co-mention graph from curated entities

- **input**: csv table with curated and ontology linked entities/relations
- **output**: graph objects with co-occurrence network and its spanning tree

In [29]:
# Node types for the graphs: the curated table's `entity_type` column, renamed to `type`.
type_data = curated_table_extractions[["entity_type"]].rename(columns={"entity_type": "type"})
# Generate the co-mention graphs and trees for the "paper" and "paragraph" factors.
# NOTE(review): `keep` receives `factor_counts` (the same object as the second
# positional argument) while `curation_meta_data["nodes_to_keep"]` is never used —
# this looks like a copy-paste slip; confirm against `generate_comention_analysis`.
# NOTE(review): `cores=10` is hard-coded — confirm it suits the machine running the notebook.
graphs, trees = generate_comention_analysis(
    curated_table_extractions,  curation_meta_data["factor_counts"],
    n_most_frequent=curation_meta_data["n_most_frequent"], type_data=type_data, 
    factors=["paper", "paragraph"], keep=curation_meta_data["factor_counts"], cores=10)
print("Done.")

-------------------------------
Factor: paper
-------------------------------
Fitering data.....
Selected 223 most frequent terms
Examining 24753 pairs of terms for co-occurrence...
Generated 16789 edges                    
Created a co-occurrence graph:
	number of nodes:  223
	number of edges:  16789
Saving the edges...
Creating a graph object...

Computing degree centrality statistics....
Top n nodes by frequency:
	covid-19 (603)
	diabetes mellitus (600)
	coronavirus (590)
	glucose (526)
	glyburide (521)
	sars coronavirus (511)
	angiotensin-converting enzyme 2 (502)
	virus (502)
	sars-cov-2 (496)
	pneumonia (493)

Computing PageRank centrality statistics....
Top n nodes by frequency:
	covid-19 (0.01)
	diabetes mellitus (0.01)
	coronavirus (0.01)
	glucose (0.01)
	glyburide (0.01)
	sars coronavirus (0.01)
	virus (0.01)
	angiotensin-converting enzyme 2 (0.01)
	sars-cov-2 (0.01)
	pneumonia (0.01)

Computing betweenness centrality statistics....
Detecting communities...
Best network parti

In [30]:
# With `loaded_graphs` reset, the later visualization cell skips the graphs that
# came from the topic widget and shows only the ones generated above.
loaded_graphs = None

# One registry entry per factor, each with its graph, its tree and the default node limit.
GRAPH_OBJECTS = {
    label: {
        "graph": graphs[factor],
        "tree": trees[factor],
        "default_top_n": 100,
    }
    for factor, label in (
        ("paper", "Topic-centered network (paper-based)"),
        ("paragraph", "Topic-centered network (paragraph-based)"),
    )
}

## Visualize the co-mention graph

In [None]:
# Placeholders, so the `if paper_graph and paragraph_graph` check further down
# works even when the (slow) loading cell below is skipped.
paper_graph = paragraph_graph = None

In [32]:
%%time

def _add_paper_frequency(tree):
    """Store on every node of `tree` the length of its "paper" attribute as "paper_frequency"."""
    nx.set_node_attributes(
        tree,
        {node: len(attributes["paper"]) for node, attributes in tree.nodes(data=True)},
        "paper_frequency")


prefix = "/gpfs/bbp.cscs.ch/project/proj116/network_analytics/data/graphs/cord_47/full_3000"

print("Loading pre-generated graphs with 3'000 entities...")
print("\t - paper-based network")
paper_graph = load_network(
    "{}_paper_edge_list.pkl".format(prefix),
    "{}_paper_node_list.pkl".format(prefix))
paper_spanning_tree = load_network(
    "{}_paper_tree_edge_list.pkl".format(prefix),
    "{}_paper_tree_node_list.pkl".format(prefix))
_add_paper_frequency(paper_spanning_tree)

print("\t - paragraph-based network")
paragraph_graph = load_network(
    "{}_paragraph_edge_list.pkl".format(prefix),
    "{}_paragraph_node_list.pkl".format(prefix))
# NOTE(review): unlike the paper tree, this reads "_paragraph_node_list.pkl" rather
# than "_paragraph_tree_node_list.pkl" — confirm whether that is intentional.
paragraph_spanning_tree = load_network(
    "{}_paragraph_tree_edge_list.pkl".format(prefix),
    "{}_paragraph_node_list.pkl".format(prefix))
_add_paper_frequency(paragraph_spanning_tree)
print("Done.")

print("Loading pre-computed node positions...")
with open("/gpfs/bbp.cscs.ch/project/proj116/network_analytics/data/positions/paper_3000.json", "r") as positions_file:
    paper_positions = json.load(positions_file)

with open("/gpfs/bbp.cscs.ch/project/proj116/network_analytics/data/positions/paragraph_3000.json", "r") as positions_file:
    paragraph_positions = json.load(positions_file)
print("Done.")


print("Building a cytoscape representation of loaded graphs...")
paper_3000_cyto = build_cytoscape_data(paper_spanning_tree, positions=paper_positions)
paragraph_3000_cyto = build_cytoscape_data(paragraph_spanning_tree, positions=paragraph_positions)
print("Done.")

Loading pre-generated graphs with 3'000 entities...
	 - paper-based network


KeyboardInterrupt: 

In [34]:
if loaded_graphs is not None:
    for g, data in loaded_graphs.items():
        GRAPH_OBJECTS[g] = {
            "graph": data["graph"],
            "tree": data["tree"] if "tree" in data else None,
            "default_top_n": 100
        }

# Add precomputed graphs
if paper_graph and paragraph_graph:
    GRAPH_OBJECTS.update({
        "Naive pre-computed network (paper-based, 3000)": {
            "graph": paper_graph,
            "tree": paper_spanning_tree,
            "positions": paper_positions
        },
        "Naive pre-computed network (paragraph-based, 3000)": {
            "graph": paragraph_graph,
            "tree": paragraph_spanning_tree,
            "positions": paragraph_positions,
        }
    })
    
for k, v in GRAPH_OBJECTS.items():
    tree = v["tree"] if "tree" in v else None
    positions = v["positions"] if "positions" in v else None  
    default_top_n = v["default_top_n"] if "default_top_n" in v else None
    full_graph_view = v["full_graph_view"] if "full_graph_view" in v else False
    visualization_app.set_graph(
        k, v["graph"], tree_object=tree, positions=positions,
        default_top_n=default_top_n, full_graph_view=full_graph_view)

if visualization_configs is None:
    visualization_app.set_current_graph("Topic-centered network (paper-based)")
    
visualization_app.set_list_papers_callback(lambda x: list_papers(bbs_mysql_engine, x))
definitions = linking[["concept", "definition"]].groupby("concept").aggregate(lambda x: list(x)[0]).to_dict()["definition"]
visualization_app.set_entity_definitons(definitions)
visualization_app._db_error_message = "Failed to retreive papers (check if the variable 'bbs_mysql_engine' was initialized or check the DB connection)"

if visualization_configs is not None:
    visualization_app._current_graph = visualization_configs["current_graph"]
    visualization_app._configure_layout(visualization_configs)

By default, the app displays only the most frequent nodes (the top 100 for the topic-centered networks, as set by `default_top_n` above); you can then choose to show all the nodes in the network.

In [35]:
# Start the visualization app; change the port here if 8079 is already in use.
visualization_app.run(port=8079)

# Validate the knowledge graph
The content of the knowledge graph is validated. In this version, syntactic validation (e.g. checking that the identifiers are correct) is performed while the knowledge graph is being built: if the knowledge graph is built successfully, the validation passes. In case of a warning (e.g. because of an unexpected character such as `+` in an extracted entity), the user can go back to the curation step and further curate the extracted entities.

# Correct knowledge graph
Correction involves going back to the extraction and/or curation steps.

# Access the knowledge graph
The user can search, visualize, and export the knowledge graph.

# Version the knowledge graph
The user can save a knowledge graph with a version.

In [45]:
# Collect from the visualization app what is saved below: the two topic-centered
# networks, the app's current configuration and its edit history.
exported_graphs = visualization_app.export_graphs(
    ["Topic-centered network (paper-based)", "Topic-centered network (paragraph-based)"], 
)
visualization_configs = visualization_app.get_configs()
edit_history = visualization_app.get_edit_history()

In [46]:
# Widget for saving the session: it is given the extraction tables, the curation
# metadata, the exported graphs and the visualization state, together with the
# forge, the token and the topic resource id obtained earlier.
# NOTE(review): `temp_prefix` is an absolute GPFS path — confirm it is writable
# in the environment where the notebook runs.
saver_widget = DataSaverWidget(
    forge, TOKEN, topic_resource_id,
    table_extractions,
    curated_table_extractions,
    curation_meta_data,
    exported_graphs,
    visualization_configs,
    edit_history,
    temp_prefix="/gpfs/bbp.cscs.ch/project/proj116/")

saver_widget.display()

VBox(children=(Text(value='', description='Name:', placeholder='Add a name for your dataset'), Textarea(value=…