<!---
BBSearch is a text mining toolbox focused on scientific use cases.

Copyright (C) 2020  Blue Brain Project, EPFL.

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
-->

# Goal of the notebook
End-to-end pipeline for searching articles of interest, extracting entities of interest, and building, accessing and deploying a knowledge graph and a co-mention graph.

In [None]:
import getpass
import json
import os
import pathlib
import requests
import time
import sys

import ipywidgets
import networkx as nx
import pandas as pd

import sqlalchemy
from sqlalchemy.sql import select
from sqlalchemy.sql import and_, or_, not_

import jwt

from bluesearch.widgets import ArticleSaver, MiningSchema, MiningWidget, SearchWidget

In [None]:
# Turn off pandas' chained-assignment warning for the whole notebook.
pd.set_option("mode.chained_assignment", None)

In [None]:
from jupyter_dash.comms import _send_jupyter_config_comm_request, _jupyter_config
from jupyter_dash import JupyterDash

import dash_cytoscape as cyto

from tqdm.notebook import tqdm

In [None]:
# JupyterDash configs
# Calls a private helper of `jupyter_dash.comms` (imported above).
# NOTE(review): presumably this asks the notebook front end for its proxy
# configuration, which is read in the next cell - confirm against jupyter_dash.
_send_jupyter_config_comm_request()

In [None]:
# Wait three seconds before reading the proxy configuration.
# NOTE(review): presumably this gives the comm request sent in the previous
# cell time to be answered - confirm that a fixed delay is sufficient.
time.sleep(3)
JupyterDash.infer_jupyter_proxy_config()
# Make the extra dash-cytoscape layouts available to the apps used below.
cyto.load_extra_layouts()

In [None]:
from kganalytics.export import load_network
from kganalytics.utils import top_n

from cord19kg.utils import (generate_curation_table,
                            link_ontology,
                            generate_cooccurrence_analysis,
                            build_cytoscape_data,
                            resolve_taxonomy_to_types)
from cord19kg.apps.curation_app import curation_app
from cord19kg.apps.visualization_app import visualization_app
from cord19kg.apps.topic_widgets import (TopicWidget, DataSaverWidget)

In [None]:
from kgforge.core import KnowledgeGraphForge

In [None]:
print("Loading the ontology linking data...")

# The location of the pickled linking table comes from the environment.
ONTOLOGY_LINKING_DATA_FILE = os.getenv("ONTOLOGY_LINKING_DATA_FILE")
# Validate explicitly rather than with `assert`: assertions are stripped under
# `python -O`, carry no message, and would let an empty value through.
if not ONTOLOGY_LINKING_DATA_FILE:
    raise RuntimeError(
        "The environment variable ONTOLOGY_LINKING_DATA_FILE is not set"
    )
# NOTE(review): unpickling can execute arbitrary code; make sure this file
# comes from a trusted location.
linking = pd.read_pickle(ONTOLOGY_LINKING_DATA_FILE)
# The rest of the notebook refers to this column as "taxonomy".
linking = linking.rename(columns={"_subclassof_label": "taxonomy"})
print("Done.")
# Registry of graphs handed to the visualization app; filled in later cells.
GRAPH_OBJECTS = {}

# Set a Project

The user chooses / creates a project to host a KG.

* Use the [Nexus web application](https://bbp.epfl.ch/nexus/web) to get a token.
* Once a token is obtained then proceed to paste it below.

In [None]:
import getpass
# Prompt for the Nexus token without echoing it in the notebook output.
TOKEN = getpass.getpass()

In [None]:
# Configure a 'forge' to manage (create, access and deploy) the knowledge graph within a given Blue Brain Nexus Project.
FORGE_CONFIG_FILE = os.getenv("FORGE_CONFIG_FILE")
# Validate explicitly rather than with `assert`: assertions are stripped under
# `python -O`, carry no message, and would let an empty value through.
if not FORGE_CONFIG_FILE:
    raise RuntimeError("The environment variable FORGE_CONFIG_FILE is not set")
forge = KnowledgeGraphForge(FORGE_CONFIG_FILE, token=TOKEN, debug=True)
# The token is decoded without signature verification, only to read the
# user name stored in its 'preferred_username' claim.
# NOTE(review): the `verify` keyword belongs to the PyJWT 1.x API; PyJWT 2.x
# expects `options={"verify_signature": False}` instead - confirm which
# version is installed before changing this call.
agent_username = jwt.decode(TOKEN, verify=False)['preferred_username']

# Set topic
The user defines a topic.

In [None]:
# Build the topic widget on top of the forge session and token created above,
# then render it in the notebook.
widget = TopicWidget(forge, TOKEN)
widget.display()

In [None]:
# Unpack everything the topic widget returns into the notebook-level names
# used by the later cells. Several of them (for example `table_extractions`,
# `curated_table_extractions` and `loaded_graphs`) are reassigned further down
# when the pipeline is run from scratch.
(
    table_extractions,
    curated_table_extractions,
    curation_meta_data,
    loaded_graphs,
    visualization_configs,
    topic_resource_id
) = widget.get_all()

# Data Import
The user loads data from a data source (CORD-19). The loaded data forms the corpus. The user searches the CORPUS in Blue Brain Search.

Configure the search server

In [None]:
# Address of the search server, taken from the environment (None when unset).
SEARCH_ENGINE_URL = os.environ.get("SEARCH_ENGINE_URL")

In [None]:
# Check that SEARCH_ENGINE_URL points at a reachable search server and report
# the outcome; problems are printed to stderr instead of raising.
if not SEARCH_ENGINE_URL:
    print("The variable SEARCH_ENGINE_URL is not set", file=sys.stderr)
    print("Please fix this before proceeding", file=sys.stderr)
else:
    try:
        # The timeout keeps the cell from hanging forever on an unresponsive
        # host; a timeout is reported through the same handler below.
        response = requests.post(f"{SEARCH_ENGINE_URL}/help", timeout=30)
    except requests.exceptions.RequestException as exc:
        print(f"Could not connect to the search server on {SEARCH_ENGINE_URL}", file=sys.stderr)
        print(f"Error: {exc}", file=sys.stderr)
        print("Please fix this before proceeding", file=sys.stderr)
    else:
        # Parse the body once. A body that is not JSON means this is not a
        # search server, so it is reported as such rather than raising.
        try:
            server_info = response.json() if response.ok else None
        except ValueError:
            server_info = None
        if not isinstance(server_info, dict) or server_info.get("name") != "SearchServer":
            print(f"The server at {SEARCH_ENGINE_URL} is not a valid search server", file=sys.stderr)
        else:
            print(f"Connected to the search server on {SEARCH_ENGINE_URL}")
            print(f"This server is using the database: {server_info.get('database')}")

Configure the MySQL server

In [None]:
# Address of the MySQL server, taken from the environment (None when unset).
DB_URL = os.environ.get("DB_URL")

In [None]:
# Create the MySQL engine used by the rest of the notebook and probe it with a
# trivial query; problems are printed to stderr instead of raising.
if DB_URL:
    try:
        bbs_mysql_engine = sqlalchemy.create_engine(f"mysql+pymysql://guest:guest@{DB_URL}")
        result = bbs_mysql_engine.execute("select 1").fetchone()
    except sqlalchemy.exc.OperationalError as exc:
        print(
            f"Can't connect to the MySQL server on {DB_URL}, please fix this before proceeding.",
            f"Error: {exc}",
            "Please fix this before proceeding",
            sep="\n",
            file=sys.stderr,
        )
    else:
        print(f"Connected to the MySQL server on {DB_URL}")
else:
    print(
        "The variable DB_URL is not set",
        "Please fix this before proceeding",
        sep="\n",
        file=sys.stderr,
    )

Article saver

In [None]:
# The saver shares the MySQL engine created in the previous cell; this line
# fails with a NameError if that cell did not manage to create the engine.
article_saver = ArticleSaver(connection=bbs_mysql_engine)

Search widget

In [None]:
# Wire the search widget to the search server, the MySQL engine and the
# article saver configured above, showing three results per page.
search_widget = SearchWidget(
    bbs_search_url=SEARCH_ENGINE_URL,
    bbs_mysql_engine=bbs_mysql_engine,
    article_saver=article_saver,
    results_per_page=3)
# Leaving the widget as the last expression renders it in the cell output.
search_widget

Show saved articles and paragraphs

In [None]:
# Table of what was saved through the search widget; the next cell reads its
# 'Article ID', 'Article' and 'Paragraph' columns.
df_results = search_widget.saved_results()
# Last expression of the cell, so the table is displayed.
df_results

In [None]:
# Summarize the saved results: distinct articles returned, paragraphs saved
# on their own, and articles saved as a whole.
is_saved_article = df_results['Article'] == '✓'
is_saved_paragraph = df_results['Paragraph'] == '✓'
n_articles_found = len(df_results['Article ID'].unique())
n_paragraphs_saved = len(df_results[is_saved_paragraph & ~is_saved_article])
n_articles_saved = len(df_results[is_saved_article]['Article ID'].unique())
print(
    "For information: \n \n"
    f"      - The query showed {n_articles_found} different articles.\n"
    f"      - Saved {n_paragraphs_saved} paragraph(s)\n"
    f"      - Saved {n_articles_saved} article(s)"
)

# Set schemas
The user defines the KG schema.

In [None]:
# Entity types to mine, each paired with the keyword arguments passed to
# `add_entity` (an empty dict means the entity is added by name only).
mining_schema = MiningSchema()

schema_entities = (
    ("CELL_COMPARTMENT", {}),
    ("CELL_TYPE", {}),
    ("CHEMICAL", {"ontology_source": "NCIT"}),
    ("CONDITION", {}),
    ("DISEASE", {"ontology_source": "NCIT"}),
    ("DRUG", {}),
    ("ORGAN", {"ontology_source": "NCIT"}),
    ("ORGANISM", {"ontology_source": "NCIT"}),
    ("PATHWAY", {"ontology_source": "Reactome"}),
    ("PROTEIN", {"ontology_source": "NCIT"}),
)
for entity_name, entity_options in schema_entities:
    mining_schema.add_entity(entity_name, **entity_options)

# Last expression of the cell, so the resulting schema table is displayed.
mining_schema.df

# Create a knowledge graph according to schemas
The user extracts data from the text of a set of papers using selected Named Entity Recognizers and Relation Extractors from Blue Brain Search.
The user can preview the extracted data.
The user curates extracted data.
The user links the extracted entities and relations to ontologies.
The user saves data into Knowledge Graph.

- **input**: raw text
- **output**: csv table of extracted entities/relations

Configure the mining server

In [None]:
# Address of the text mining server, taken from the environment (None when unset).
TEXT_MINING_URL = os.environ.get("TEXT_MINING_URL")

In [None]:
# Check that TEXT_MINING_URL points at a reachable mining server and report
# the outcome; problems are printed to stderr instead of raising.
if not TEXT_MINING_URL:
    print("The variable TEXT_MINING_URL is not set", file=sys.stderr)
    print("Please fix this before proceeding", file=sys.stderr)
else:
    try:
        # The timeout keeps the cell from hanging forever on an unresponsive
        # host; a timeout is reported through the same handler below.
        response = requests.post(f"{TEXT_MINING_URL}/help", timeout=30)
    except requests.exceptions.RequestException as exc:
        print(f"Could not connect to the server on {TEXT_MINING_URL}", file=sys.stderr)
        print(f"Error: {exc}", file=sys.stderr)
        print("Please fix this before proceeding", file=sys.stderr)
    else:
        # Parse the body once. A body that is not JSON means this is not a
        # mining server, so it is reported as such rather than raising.
        try:
            server_info = response.json() if response.ok else None
        except ValueError:
            server_info = None
        if not isinstance(server_info, dict) or server_info.get("name") != "MiningServer":
            print(f"The server at {TEXT_MINING_URL} is not a valid mining server", file=sys.stderr)
        else:
            print(f"Connected to the mining server on {TEXT_MINING_URL}")
            print(f"This server is using the database: {server_info.get('database')}")

In [None]:
# Wire the mining widget to the mining server, the schema defined above and
# the article saver that holds the user's selection.
mining_widget = MiningWidget(
    mining_server_url=TEXT_MINING_URL,
    mining_schema=mining_schema,
    article_saver=article_saver,
)
# Leaving the widget as the last expression renders it in the cell output.
mining_widget

In [None]:
# Get DataFrame of extractions
table_extractions = mining_widget.get_extracted_table()

# A table without the 'entity_type' column means the mining step has not
# produced anything yet. This is checked up front so that a genuine pandas
# error in the clean-up below is not reported as a missing table.
if "entity_type" not in table_extractions.columns:
    raise ValueError(
        "Could not find the extraction table, make sure you have launched the mining procedure in the widget above"
    )

# Drop duplicates in DataFrame: rows that agree on every column except
# 'entity_type' are considered the same extraction and only the first is kept.
columns_duplicates = table_extractions.columns.tolist()
columns_duplicates.remove('entity_type')
table_extractions = table_extractions.drop_duplicates(subset=columns_duplicates, keep='first', ignore_index=True)
# Rows without an entity cannot be curated.
table_extractions = table_extractions.dropna(subset=["entity"])

## Curate the table with extracted entities

- **input**: csv table of extracted entities/relations
- **output**: csv table with curated and ontology linked entities/relations

In [None]:
# Report how many extraction rows remain after the clean-up above.
print(f'The table has {table_extractions.shape[0]} rows.')

In [None]:
%%time

print("Setting default term filters: the user can remove them later on in the UI if need be ...")
default_term_filters = 'Glucose; Covid-19; SARS-CoV-2; Diabetes; IL-1; ACE2; glycosylation; hyperglycemia; shock; fatigue; CVD; vasoconstriction; lactate; insulin; SP-D; HbA1c; LDH; glycolysis; GLUT; macrophage; lymphocytes; ventilation;SARS; ARDS; Cytokine Storm; pneumonia; multi-organs failure; thrombosis; inflammation; IL-6; CRP; D-Dimer; Ferritin; Lung Disease; Hypertension; Aging; COPD; angiotensin 2 (or angiotensin II or AngII); Obesity; ICU (intensive care unit); ventilation; ketogenic diet'.split("; ")
filtered_table_extractions = table_extractions.copy()
filtered_table_extractions = filtered_table_extractions.rename(columns={"paper_id": "occurrence"})

default_found_term_filters = set() 
for term_filter in default_term_filters:
    entities_to_keep = filtered_table_extractions[
        filtered_table_extractions["entity"].apply(lambda x: x.lower() == term_filter.lower())]["entity"].unique()
    if entities_to_keep is not None and len(entities_to_keep) > 0:
        default_found_term_filters.add(tuple(entities_to_keep))
term_filter_options = [term_filter[0] for term_filter in default_found_term_filters]
print("Done.")

print("Prepating curatation data...")
curation_input_table, factor_counts = generate_curation_table(filtered_table_extractions)
print("Done.")

print("Loading default ontology type mapping...")
TYPE_MAPPING_FILE = os.getenv("TYPE_MAPPING_FILE") 
assert (TYPE_MAPPING_FILE is not None) 
with open(TYPE_MAPPING_FILE, "rb") as f:
    default_type_mapping = json.load(f)

print("Done.")

Run the curation app. If you get the error 'Address already in use', specify another port (for example, one in the range 8072-8099).

In [None]:
# Hand the curation app its inputs: the default terms found in the previous
# cell, a copy of the curation table (so the original stays untouched), and a
# callback that links entities using the ontology data and type mapping
# loaded earlier.
curation_app.set_default_terms_to_include(term_filter_options)
curation_app.set_table(curation_input_table.copy())
curation_app.set_ontology_linking_callback(lambda x: link_ontology(linking, default_type_mapping, x))

# Start the app on port 8070.
curation_app.run(port=8070)

In [None]:
# Collect the outcome of the curation session: the curated table plus the
# settings needed to rebuild the graphs (100 is used when the app reports no
# value for `n_most_frequent`).
curated_table_extractions = curation_app.get_curated_table()
curation_meta_data = dict(
    factor_counts=factor_counts,
    nodes_to_keep=curation_app.get_terms_to_include(),
    n_most_frequent=curation_app.n_most_frequent or 100,
)


In [None]:
# Turn the occurrence collections of every factor column into sets.
for factor_column in ("paper", "paragraph", "section"):
    curated_table_extractions[factor_column] = curated_table_extractions[factor_column].apply(set)

## Create a co-mention graph from curated entities

- **input**: csv table with curated and ontology linked entities/relations
- **output**: graph objects with co-occurrence network and its spanning tree

In [None]:
# Entity types, with the column renamed to "type" for the analysis call.
type_data = curated_table_extractions[["entity_type"]].rename(columns={"entity_type": "type"})
# Build one co-occurrence graph and one tree per factor ("paper" and
# "paragraph"); the results are indexed by factor name in the next cell.
# NOTE(review): `keep` receives the factor counts, while
# curation_meta_data["nodes_to_keep"] is never used anywhere in this notebook -
# confirm against generate_cooccurrence_analysis which one is intended.
graphs, trees = generate_cooccurrence_analysis(
    curated_table_extractions,  curation_meta_data["factor_counts"],
    n_most_frequent=curation_meta_data["n_most_frequent"], type_data=type_data, 
    factors=["paper", "paragraph"], keep=curation_meta_data["factor_counts"], cores=10)
print("Done.")

In [None]:
# Freshly generated graphs replace anything loaded with the topic.
loaded_graphs = None
# One registry entry per factor, keyed by the name shown in the visualization app.
GRAPH_OBJECTS = {
    f"Topic-centered network ({factor}-based)": {
        "graph": graphs[factor],
        "tree": trees[factor],
        "default_top_n": 100,
    }
    for factor in ("paper", "paragraph")
}

## Visualize the co-mention graph

In [None]:
# Graphs loaded together with a topic are added to the registry first.
if loaded_graphs is not None:
    GRAPH_OBJECTS.update({
        name: {
            "graph": stored["graph"],
            "tree": stored["tree"] if "tree" in stored else None,
            "default_top_n": 100,
        }
        for name, stored in loaded_graphs.items()
    })

# Register every graph with the visualization app; optional settings that an
# entry does not define fall back to None (False for `full_graph_view`).
for graph_name, graph_spec in GRAPH_OBJECTS.items():
    visualization_app.set_graph(
        graph_name,
        graph_spec["graph"],
        tree_object=graph_spec.get("tree"),
        positions=graph_spec.get("positions"),
        default_top_n=graph_spec.get("default_top_n"),
        full_graph_view=graph_spec.get("full_graph_view", False),
    )

# Without stored configs, start on the paper-based network.
if visualization_configs is None:
    visualization_app.set_current_graph("Topic-centered network (paper-based)")

# Set paper meta-data look up callback
def list_papers(mysql_engine, papers, limit=200):
    """Look up the meta-data of the given articles.

    Parameters
    ----------
    mysql_engine
        SQLAlchemy engine connected to the database holding the ``articles``
        table.
    papers
        Sliceable sequence of article ids; only the first ``limit`` are
        looked up.
    limit : int
        Maximum number of article ids included in the query.

    Returns
    -------
    list
        One row per article found, with the columns title, authors,
        abstract, doi, url, journal, pmcid, pubmed_id and publish_time.
        The list is empty when there is nothing to look up.
    """
    selected_ids = list(papers[:limit])
    # Nothing to look up: return early rather than building a query with an
    # empty filter, which would not restrict the selection to any article.
    if not selected_ids:
        return []

    # NOTE(review): the whole schema is reflected on every call, and the
    # `bind=`/`reflect=` arguments belong to the legacy SQLAlchemy API -
    # confirm against the installed version.
    META_DATA = sqlalchemy.MetaData(bind=mysql_engine, reflect=True)
    articles = META_DATA.tables["articles"]
    s = select([
        articles.c.title,
        articles.c.authors,
        articles.c.abstract,
        articles.c.doi,
        articles.c.url,
        articles.c.journal,
        articles.c.pmcid,
        articles.c.pubmed_id,
        articles.c.publish_time
    ]).where(articles.c.article_id.in_(selected_ids))
    return list(mysql_engine.execute(s))
# Papers are looked up through the notebook's MySQL engine.
visualization_app.set_list_papers_callback(lambda x: list_papers(bbs_mysql_engine, x))

# Set definitions look up callback
# One definition per concept: the first one listed for it in the linking data.
definitions = (
    linking[["concept", "definition"]]
    .groupby("concept")
    .aggregate(lambda x: list(x)[0])
    .to_dict()["definition"]
)
visualization_app.set_entity_definitons(definitions)
# Message assigned to the app's private `_db_error_message` attribute.
visualization_app._db_error_message = "Failed to retreive papers (check if the variable 'bbs_mysql_engine' was initialized or check the DB connection)"

# Set aggregated entities look up callback
def get_aggregated_entities(entity, n):
    """Return the ``n`` most frequent raw entities aggregated under ``entity``.

    Frequencies come from the ``paper_frequency`` column of the curation
    input table when that table exists in the session. Otherwise they are
    counted from the raw extraction table, if one is available.

    Parameters
    ----------
    entity : str
        Entity (index label of ``curated_table_extractions``) to look up.
    n : int
        Maximum number of entities to return.

    Returns
    -------
    dict
        Mapping from entity to its frequency; empty when nothing is known
        about ``entity``.
    """
    known_entity = entity in curated_table_extractions.index
    # Look the entity up only after checking that it is indexed, so that an
    # unknown entity yields an empty result instead of a KeyError.
    if known_entity and "aggregated_entities" in curated_table_extractions.columns:
        aggregated = curated_table_extractions.loc[entity]["aggregated_entities"]
    else:
        aggregated = [entity]

    # The curation table is only defined when the curation cells were run in
    # this session.
    try:
        curation_table = curation_input_table
    except NameError:
        curation_table = None

    if curation_table is not None:
        if not known_entity:
            return {}
        frequency_table = curation_table.set_index("entity")
        freqs = frequency_table.loc[aggregated]["paper_frequency"].to_dict()
    elif table_extractions is not None:
        # Fallback: count the raw extraction rows per lower-cased entity.
        lowered = table_extractions["entity"].apply(lambda x: x.lower())
        matching = lowered[lowered.apply(lambda x: x in aggregated)]
        freqs = matching.value_counts().to_dict()
    else:
        return {}

    if len(freqs) == 0:
        return {}
    return {e: freqs[e] for e in top_n(freqs, n)}

# The app asks for the aggregated entities of one entity at a time; at most
# ten of them are returned.
visualization_app.set_aggregated_entities_callback(
    lambda x: get_aggregated_entities(x, 10))

# When configs were stored with the topic, restore the current graph and the
# layout from them (this goes through the app's private attributes).
if visualization_configs is not None:
    visualization_app._current_graph = visualization_configs["current_graph"]
    visualization_app._configure_layout(visualization_configs)

By default, the app displays only the 100 most frequent nodes (the `default_top_n` value set for each graph above); you can then choose to show all the nodes in the network.

In [None]:
# Start the visualization app on port 8079.
visualization_app.run(port=8079)

# Validate the knowledge graph
The content of the knowledge graph is validated. In this version, syntactic validation (e.g., whether the identifiers are correct) is performed when building the knowledge graph. If the knowledge graph is built successfully, the validation passes. In case of a warning (e.g., because of an unusual character such as `+` in an extracted entity), the user can go back to the curation step and further curate the extracted entities.

# Correct knowledge graph
Correction involves going back to the extraction and/or curation steps.

# Access the knowledge graph
The user can search, visualize, and export the knowledge graph.

# Version the knowledge graph
The user can save a knowledge graph with a version.

In [None]:
# Export the two topic-centered networks under the names they were registered
# with, together with the app's configs and edit history, so that they can be
# saved by the widget below.
exported_graphs = visualization_app.export_graphs(
    ["Topic-centered network (paper-based)", "Topic-centered network (paragraph-based)"], 
)
visualization_configs = visualization_app.get_configs()
edit_history = visualization_app.get_edit_history()

In [None]:
# Prefix handed to the saver widget as `temp_prefix`.
TEMP_FILE_PATH = os.getenv("TEMP_FILE_PATH")
# Validate explicitly rather than with `assert`: assertions are stripped under
# `python -O`, carry no message, and would let an empty value through.
if not TEMP_FILE_PATH:
    raise RuntimeError("The environment variable TEMP_FILE_PATH is not set")

# The saver widget receives everything produced by the pipeline: the raw and
# curated tables, the curation settings, the exported graphs, and the
# visualization configs and edit history.
saver_widget = DataSaverWidget(
    forge, TOKEN, topic_resource_id,
    table_extractions,
    curated_table_extractions,
    curation_meta_data,
    exported_graphs,
    visualization_configs,
    edit_history,
    temp_prefix=TEMP_FILE_PATH)

saver_widget.display()