# Goal of the notebook
End-to-end pipeline for searching articles of interest, extracting entities of interest, and building, accessing and deploying a knowledge graph and a co-mention graph.

In [None]:
import getpass
import os
import pathlib

import pandas as pd
import requests
import sqlalchemy
import ipywidgets

from bbsearch.widgets import ArticleSaver, SearchWidget, MiningWidget, SchemaRequest

# Set a Project

The user chooses / creates a project to host a KG.

* Use the [Nexus web application](https://bbp.epfl.ch/nexus/web) to get a token.
* Once a token is obtained then proceed to paste it below.

In [None]:
import getpass
TOKEN = getpass.getpass()

In [None]:
from kgforge.core import KnowledgeGraphForge

In [None]:
# Configure a 'forge' to manage (create, access and deploy) the knowledge graph within a given Blue Brain Nexus Project.
# The path of the forge configuration file is read from the FORGE_CONFIG_FILE
# environment variable; the cell stops here if the variable is not set.
FORGE_CONFIG_FILE = os.getenv("FORGE_CONFIG_FILE") 
assert (FORGE_CONFIG_FILE is not None) 
# TOKEN is the value pasted into the getpass prompt of an earlier cell.
forge = KnowledgeGraphForge(FORGE_CONFIG_FILE,token=TOKEN, debug=True)

# Set topic
The user defines a topic.

In [None]:
import jwt
from IPython.display import display, HTML

In [None]:
# Notebook-level state shared by the widget callbacks defined below:
# the currently selected/created topic and the currently downloaded dataset.
topic_resource=None
kg_resource=None
# The token is decoded without signature verification, only to read the
# 'preferred_username' claim used in the SPARQL queries below.
# NOTE(review): the `verify=False` keyword is, to my knowledge, deprecated in
# PyJWT 2.x in favour of `options={"verify_signature": False}` - confirm against
# the installed PyJWT version before upgrading.
agent_username = jwt.decode(TOKEN,  verify=False)['preferred_username']

def save_topic(b):
    """Create a new topic from the 'Create topic' tab and register it with the forge.

    Click handler of the 'Create' button (`w9`). The form values are read
    positionally from the second tab of `widget`. On success the registered
    resource is stored in the global `topic_resource` and the form is cleared.

    The topic name is validated *before* anything is sent to the forge, so a
    topic without a name is never registered.

    Args:
        b: the clicked button (unused).
    """
    global topic_resource
    output.clear_output()
    output2.clear_output()
    output3.clear_output()

    # Children of the 'Create topic' tab: name, field, description, keywords,
    # a label, then the four question boxes (indices 5 to 8).
    form = widget.children[1].children
    topic_name = form[0].value
    if topic_name.strip() == "":
        with output2:
            print("Please provide a topic name")
        return

    topic_to_save = {
        'id': str(topic_name).replace(' ', '_'),
        'type': 'Topic',
        'name': topic_name,
        'field': form[1].value,
        'description': form[2].value,
        'keywords': form[3].value,
        'question': [form[i].value for i in range(5, 9)]
    }
    topic_resource = forge.from_json(topic_to_save)
    forge.register(topic_resource)
    with output2:
        print("Topic saved!")
        # Reset the form so that the next topic starts from empty fields.
        for field in (w1, w2, w3, w4, w5, w6, w7, w8):
            field.value = ""

def get_topics(b):
    """List the topics created by the current user and show them in the topic dropdown.

    Click handler of the 'List all your topics' button (`s0`). Runs a SPARQL
    query (at most 100 results) for non-deprecated `Topic` resources created by
    `agent_username`, stores the result as a DataFrame in the global
    `topics_df`, and fills the dropdown `w0` with the sorted, distinct names.

    Args:
        b: the clicked button (unused).
    """
    output.clear_output()
    output2.clear_output()
    output3.clear_output()
    query = f"""
    SELECT ?id ?name ?description ?keywords ?field ?question ?createdAt
    WHERE {{
        ?id a Topic ;
            name ?name ;
            description ?description ;
            keywords ?keywords ;
            field ?field ;
            question ?question ;
            <https://bluebrain.github.io/nexus/vocabulary/deprecated> false ;
            <https://bluebrain.github.io/nexus/vocabulary/createdAt> ?createdAt ;
            <https://bluebrain.github.io/nexus/vocabulary/createdBy> <{forge._store.endpoint}/realms/bbp/users/{agent_username}> .
    }}
    """
    resources = forge.sparql(query, limit=100)
    if len(resources) >= 1:
        global topics_df
        topics_df = forge.as_dataframe(resources)
        output.clear_output()
        with output:
            # A topic with several questions yields several rows, hence the set().
            topics_list = list(set(topics_df.name))
            topics_list.sort()
            # The leading empty entry stands for "no topic selected".
            w0.options = [""] + topics_list
            w0.value = ""
            w0.placeholder = "Select topic"
            # topics_change fills the detail form whenever the selection changes.
            w0.observe(topics_change, names='value')
            display(w0)
            display(s12)
    else:
        with output:
            print("No topics found!")

def topics_change(change):
    """Refresh the topic detail form after the selection in the dropdown changed.

    Observer of `w0.value`. Clears the detail fields, retrieves the selected
    topic from the forge (stored in the global `topic_resource`) and copies its
    field, description, keywords and questions into the form widgets.

    Questions are paired with the available question boxes, so a topic that
    carries more questions than there are boxes no longer raises IndexError;
    the extra questions are simply not displayed.

    Args:
        change: traitlets change dictionary; only `change['new']` is read.
    """
    global topic_resource
    output3.clear_output()
    with output:
        # Keep only the first rendered output (the dropdown) and drop the rest.
        if len(output.outputs) >= 1:
            output.outputs = (output.outputs[0],)
        for field in (s5, s6, s7, s8, s9, s10, s11):
            field.value = ""
        if change['new'] != "":
            matching_ids = topics_df[topics_df.name == change['new']].id
            topic_resource = forge.retrieve(list(set(matching_ids))[0])
            s5.value = topic_resource.field
            s6.value = topic_resource.description
            s7.value = topic_resource.keywords
            question = topic_resource.question
            # A single question is returned as a plain string.
            if isinstance(question, str):
                question = [question]
            if isinstance(question, list):
                for question_box, text in zip(sq.children, question):
                    question_box.value = text
        display(s12)

def update_topic(b):
    """Write the edited detail form back to the selected topic and update it in the forge.

    Click handler of the 'Update topic' button (`s4`). Does nothing when no
    topic is selected in the dropdown `w0`.

    Args:
        b: the clicked button (unused).
    """
    output2.clear_output()
    if w0.value == "":
        return

    # The update is addressed by the expanded JSON-LD identifier of the topic.
    expanded_topic = forge.as_jsonld(topic_resource, form="expanded")
    topic_resource.id = expanded_topic['@id']
    topic_resource.field = s5.value
    topic_resource.description = s6.value
    topic_resource.keywords = s7.value
    topic_resource.question = [sq.children[position].value for position in range(4)]
    forge.update(topic_resource)
    with output:
        print("topic updated!")
        
def get_datasets(b):
    """List the datasets attached to the selected topic.

    Click handler of the 'Show datasets for selected topic' button (`s1`).
    Does nothing when no topic is selected in `w0`. Otherwise queries (at most
    100 results) the non-deprecated `Dataset` resources that are `about` the
    selected topic and were created by `agent_username`, stores them in the
    global `kg_resources` and offers their names in the radio buttons `s2`.

    Args:
        b: the clicked button (unused).
    """
    output3.clear_output()
    if w0.value != "":
        topic_resource_id = forge.as_jsonld(topic_resource, form="expanded")['@id']
        query = f"""
            SELECT ?id ?name ?description ?keywords ?field ?question ?createdAt
            WHERE {{
                ?id a Dataset ;
                    name ?name ;
                    about <{topic_resource_id}> ;
                    <https://bluebrain.github.io/nexus/vocabulary/deprecated> false ;
                    <https://bluebrain.github.io/nexus/vocabulary/createdAt> ?createdAt ;
                    <https://bluebrain.github.io/nexus/vocabulary/createdBy> <{forge._store.endpoint}/realms/bbp/users/{agent_username}> .
            }}
            """
        global kg_resources
        kg_resources = forge.sparql(query, limit=100, debug=True)
        if len(kg_resources) >= 1:
            with output3:
                display(s2)
                s2.options = [r.name for r in kg_resources]
                # s3 is the 'Reuse selected dataset' button (see download_dataset).
                display(s3)
        else:
            with output3:
                print("No datasets found!")
        
def download_dataset(b):
    """Download the selected dataset and load its curated table.

    Click handler of the 'Reuse selected dataset' button (`s3`). The dataset
    selected in `s2` is retrieved (stored in the global `kg_resource`), its
    distributions are downloaded into the current directory, and every
    distribution whose name contains "curated" is read with pandas into the
    global `table_extractions`.

    Unlike the previous version, the user is told when nothing could be
    loaded (no matching dataset, or no curated distribution); that message
    used to sit in a branch that could never run.

    Args:
        b: the clicked button (unused).
    """
    global kg_resource
    global table_extractions

    matching_ids = [r.id for r in kg_resources if r.name == s2.value]
    if not matching_ids:
        with output3:
            print("No dataset has been downloaded")
        return

    kg_resource = forge.retrieve(matching_ids[0])
    forge.download(kg_resource, "distribution.contentUrl", ".", overwrite=True)

    distributions = kg_resource.distribution
    # NOTE(review): presumably a dataset with a single distribution exposes it
    # as one object rather than a list - normalised here so both shapes work.
    if not isinstance(distributions, (list, tuple)):
        distributions = [distributions]

    curated = [r for r in distributions if "curated" in r.name]
    if not curated:
        with output3:
            print("No dataset has been downloaded")
        return

    for r in curated:
        table_extractions = pd.read_csv(f"./{r.name}")
        message = f"Dataset '{r.name}' with {len(table_extractions)} entities ready to be reused. Its content has been assigned to the variable 'table_extractions'. Please continue with the interactive UI section to visualise this dataset."
        with output3:
            print(message)

# Widgets of the 'Select topic' tab.
# s0/s1/s3/s4 are action buttons; their handlers are attached further down
# with on_click (get_topics, get_datasets, download_dataset, update_topic).
s0 = ipywidgets.Button(
    description= '🔬 List all your topics',
    button_style='',
    layout=ipywidgets.Layout(width='300px', height='30px'),
    disabled=False)
s1 = ipywidgets.Button(
    description= "📃 Show datasets for selected topic",
    button_style='',
    layout=ipywidgets.Layout(width='300px', height='30px'),
    disabled=False)
# Dataset chooser; its options are filled by get_datasets.
s2 = ipywidgets.RadioButtons(
    description='Select:',
    disabled=False)
s3 = ipywidgets.Button(
    description= '📈 Reuse selected dataset',
    button_style='',
    layout=ipywidgets.Layout(width='300px', height='30px'),
    disabled=False)
s4 = ipywidgets.Button(
    description= '✏️ Update topic',
    button_style='',
    layout=ipywidgets.Layout(width='300px', height='30px'),
    disabled=False)
# Editable details of the selected topic (filled by topics_change,
# read back by update_topic).
s5 = ipywidgets.Text(
    description='Field:',
    disabled=False)
s6 = ipywidgets.Textarea(
    description='Description:',
    disabled=False)
s7 = ipywidgets.Textarea(
    description='Keywords:',
    disabled=False)
# s8..s11: the four question boxes of the selected topic.
s8 = ipywidgets.Text(
    disabled=False)
s9 = ipywidgets.Text(
    disabled=False)
s10 = ipywidgets.Text(
    disabled=False)
s11 = ipywidgets.Text(
    disabled=False)

# The question boxes are accessed positionally through sq.children.
sq = ipywidgets.VBox(children=[s8, s9, s10, s11])

# Complete detail form displayed below the topic dropdown.
s12 = ipywidgets.VBox(children=[s5, s6, s7, ipywidgets.Label('Questions:'), sq, s4])

# Topic dropdown of the 'Select topic' tab; its options are filled by get_topics.
w0 = ipywidgets.Dropdown(
        description='Select topic:',
        disabled=False)
# Widgets of the 'Create topic' tab. save_topic reads them by position inside
# the tab, so their order in the tab layout below must not change.
w1 = ipywidgets.Text(
    placeholder='e.g. COVID-19',
    description='Topic name:',
    disabled=False)
w2 = ipywidgets.Text(
    placeholder='e.g. Neuroscience',
    description='Field:',
    disabled=False)
w3 = ipywidgets.Textarea(
    placeholder='Add a description of your topic',
    description='Description:',
    disabled=False)
w4 = ipywidgets.Textarea(
    placeholder='e.g. Coronavirus; COVID-19; SARS; risk factor; glycosylation; sugar; carbohydrates',
    description='Keywords:',
    disabled=False)
# w5..w8: up to four research questions describing the topic.
w5 = ipywidgets.Text(
    placeholder='Add a question about your research topic',
    disabled=False)
w6 = ipywidgets.Text(
    placeholder='Add a question about your research topic',
    disabled=False)
w7 = ipywidgets.Text(
    placeholder='Add a question about your research topic',
    disabled=False)
w8 = ipywidgets.Text(
    placeholder='Add a question about your research topic',
    disabled=False)
# Triggers save_topic.
w9 = ipywidgets.Button(
    description='Create',
    button_style='',
    tooltip='Create new topic',
    disabled=False)

# Output areas written to by the callbacks:
#   output  - topic dropdown and detail form ('Select topic' tab, left)
#   output2 - feedback of the 'Create topic' tab
#   output3 - dataset list / download messages ('Select topic' tab, right)
output = ipywidgets.Output()
output2 = ipywidgets.Output()
output3 = ipywidgets.Output()

buttons = ipywidgets.HBox(children=[s0, s1])
outputs = ipywidgets.HBox(children=[output, output3])
tab1 = ipywidgets.VBox(children=[buttons, outputs])
# save_topic indexes the children of tab2 positionally (0-3 and 5-8).
tab2 = ipywidgets.VBox(children=[w1, w2, w3, w4, ipywidgets.Label('Please express your research topic in a few questions:'), w5, w6, w7, w8, w9, output2])
widget = ipywidgets.Tab(children=[tab1, tab2])
widget.set_title(0, 'Select topic')
widget.set_title(1, 'Create topic')

# Attach the click handlers defined above to their buttons.
w9.on_click(save_topic)
s0.on_click(get_topics)
s1.on_click(get_datasets)
s3.on_click(download_dataset)
s4.on_click(update_topic)

display(widget)

# Data Import
The user loads data from a data source (CORD-19).
The loaded data forms the corpus.
The user searches the CORPUS in Blue Brain Search.

Search server URL

In [None]:
# URL of the search server; can be overridden with the SEARCH_ENGINE_URL
# environment variable.
SEARCH_ENGINE_URL = os.getenv("SEARCH_ENGINE_URL", "http://dgx1.bbp.epfl.ch:8850")
# Cannot fail as written: os.getenv is given a default value above.
assert SEARCH_ENGINE_URL is not None

# Probe the server's /help endpoint and check that it identifies itself as a
# 'SearchServer' before going any further.
# NOTE(review): no timeout is passed, so this call can block if the host is unreachable.
response = requests.post("{}/help".format(SEARCH_ENGINE_URL))
assert response.ok and response.json()['name'] == 'SearchServer', "The server is not accessible"
print(f"This server is using the database: {response.json()['database']}")

MySQL URL and engine

In [None]:
MYSQL_DB_URI = os.getenv("MYSQL_DB_URI", "dgx1.bbp.epfl.ch:8853")
bbs_mysql_engine = sqlalchemy.create_engine(f'mysql+pymysql://guest:guest@{MYSQL_DB_URI}/cord19_v47')

Article saver

In [None]:
article_saver = ArticleSaver(connection=bbs_mysql_engine)

Search widget

In [None]:
search_widget = SearchWidget(
    bbs_search_url=SEARCH_ENGINE_URL,
    bbs_mysql_engine=bbs_mysql_engine,
    article_saver=article_saver,
    results_per_page=3)
search_widget

Status of the Article Saver

In [None]:
article_saver.summary_table()

# Set schemas
The user defines the KG schema.

In [None]:
schema_request = SchemaRequest()

In [None]:
# Columns of the schema table. Only 'entity_type' and 'ontology_source' are
# filled below; the remaining columns are created empty by the DataFrame
# constructor.
columns = ['entity_type', 'property', 'property_type', 'property_value_type', 'ontology_source']

# (entity type, ontology source) pairs; None means no ontology source is set
# for that entity type.
etypes_sources = [('CELL_COMPARTMENT', None),
                  ('CELL_TYPE', None),
                  ('CHEMICAL', 'NCIT'), 
                  ('CONDITION', None),
                  ('DISEASE', 'NCIT'),
                  ('DRUG', None),
                  ('ORGAN', 'NCIT'),
                  ('ORGANISM', 'NCIT'),
                  ('PATHWAY', 'Reactome'),
                  ('PROTEIN', 'NCIT')
                 ]
schema_request_data = [{'entity_type': etype, 'ontology_source': source} 
                       for etype, source in etypes_sources]

# The schema is handed to the mining widget through `schema_request`.
schema_request.schema = pd.DataFrame(schema_request_data, columns=columns)
display(schema_request.schema)

# Create a knowledge graph according to schemas
The user extracts data from the text of a set of papers using selected Named Entity Recognizers and Relation Extractors from Blue Brain Search.
The user can preview the extracted data.
The user curates extracted data.
The user links the extracted entities and relations to ontologies.
The user saves data into Knowledge Graph.

- **input**: raw text
- **output**: csv table of extracted entities/relations

In [None]:
DEFAULT_TEXT = """Autophagy maintains tumour growth through circulating
arginine. Autophagy captures intracellular components and delivers them to
lysosomes, where they are degraded and recycled to sustain metabolism and to
enable survival during starvation. Acute, whole-body deletion of the essential 
autophagy gene Atg7 in adult mice causes a systemic metabolic defect that 
manifests as starvation intolerance and gradual loss of white adipose tissue, 
liver glycogen and muscle mass.  Cancer cells also benefit from autophagy. 
Deletion of essential autophagy genes impairs the metabolism, proliferation, 
survival and malignancy of spontaneous tumours in models of autochthonous 
cancer. Acute, systemic deletion of Atg7 or acute, systemic expression of a 
dominant-negative ATG4b in mice induces greater regression of KRAS-driven 
cancers than does tumour-specific autophagy deletion, which suggests that host 
autophagy promotes tumour growth.
""".replace('\n', ' ').replace('  ', ' ')

In [None]:
# URL of the text mining server; can be overridden with the TEXT_MINING_URL
# environment variable.
TEXT_MINING_URL = os.getenv("TEXT_MINING_URL", "http://dgx1.bbp.epfl.ch:8852")
# Probe the /help endpoint and check that the server identifies itself as a
# 'MiningServer'.
# NOTE(review): no timeout is passed, so this call can block if the host is unreachable.
response = requests.post(TEXT_MINING_URL + "/help")
assert response.ok and response.json()['name'] == 'MiningServer'
print(f"This server is using the database: {response.json()['database']}")

In [None]:
mining_widget = MiningWidget(
    mining_server_url=TEXT_MINING_URL,
    schema_request=schema_request,
    article_saver=article_saver,
    default_text=DEFAULT_TEXT)
mining_widget

- **input**: csv table of extracted entities/relations
- **output**: knowledge graph

In [None]:
# Get DataFrame of extractions
table_extractions = mining_widget.get_extracted_table()

# Drop duplicates in DataFrame
# Rows are compared on every column except 'entity_type', so the same mention
# reported under two entity types is kept only once (first occurrence wins).
columns_duplicates = table_extractions.columns.tolist()
columns_duplicates.remove('entity_type')
table_extractions = table_extractions.drop_duplicates(subset=columns_duplicates, keep='first', ignore_index=True)
# Rows without an entity text are discarded.
table_extractions = table_extractions.dropna(subset=["entity"])

In [None]:
print(f'The table has {table_extractions.shape[0]} rows.')

In [None]:
import jupyter_server_proxy
import jupyter_dash
import dash
import dash_daq as daq
from dash.dependencies import Input, Output, State
import dash_core_components as dcc
import dash_html_components as html
import dash_table
from jupyter_dash import JupyterDash
import dash_bootstrap_components as dbc
import plotly.express as px
import operator

In [None]:
from jupyter_dash.comms import _send_jupyter_config_comm_request
_send_jupyter_config_comm_request()


In [None]:
JupyterDash.infer_jupyter_proxy_config()

In [None]:
%%time

print("Setting default term filters: the user can remove them later on in the UI if need be ...")

# Work on a copy and reduce every paper_id to the part before the first ':'.
filtered_table_extractions = table_extractions.copy()
filtered_table_extractions["paper_id"] = filtered_table_extractions["paper_id"].transform(lambda x:  str(x).split(":")[0])

# Terms are separated by ';' with or without a following space. Splitting on
# ';' and stripping also handles "ventilation;SARS", which the previous
# split("; ") kept as one (never matching) term.
default_term_filters = [
    term.strip()
    for term in 'Glucose; Covid-19; SARS-CoV-2; Diabetes; IL-1; ACE2; glycosylation; hyperglycemia; shock; fatigue; CVD; vasoconstriction; lactate; insulin; SP-D; HbA1c; LDH; glycolysis; GLUT; macrophage; lymphocytes; ventilation;SARS; ARDS; Cytokine Storm; pneumonia; multi-organs failure; thrombosis; inflammation; IL-6; CRP; D-Dimer; Ferritin; Lung Disease; Hypertension; Aging; COPD; angiotensin 2 (or angiotensin II or AngII); Obesity; ICU (intensive care unit); ventilation; ketogenic diet'.split(";")
    if term.strip()
]

# Lower-case the entity column once instead of once per term.
lowercase_entities = filtered_table_extractions["entity"].str.lower()

# Each element is the tuple of spellings found in the table for one default term.
default_found_term_filters = set()
# First spelling of every found term, in the order of the default list, so the
# initial value of the "Keep" dropdown is the same on every run.
term_filter_options = []
for term_filter in default_term_filters:
    result_df = filtered_table_extractions.loc[lowercase_entities.eq(term_filter.lower()), "entity"].unique()
    if len(result_df) > 0:
        found_spellings = tuple(result_df)
        if found_spellings not in default_found_term_filters:
            default_found_term_filters.add(found_spellings)
            term_filter_options.append(found_spellings[0])

print("Done.")

print("Computing entity frequencies ...")



def _frequency(group_by, retrieve_key, df, distinct_papers=True, debug=False):
    
    if debug:
        display(df.head(100))
    if distinct_papers:
        colunm_stats = df[[group_by, retrieve_key]].groupby(group_by)[retrieve_key].unique()
    else:
        colunm_stats = df[[group_by, retrieve_key]].groupby(group_by)[retrieve_key].count()
    if debug:
        display(colunm_stats)
    
    return colunm_stats
        
# For every entity: the array of distinct papers that mention it.
entity_stats = _frequency(group_by="entity",retrieve_key="paper_id",df=filtered_table_extractions,distinct_papers=True)

# Minimum number of distinct papers an entity must appear in to be kept.
entity_frequency = 1

# Keep the rows whose entity reaches the minimum frequency. A boolean mask is
# used instead of rebuilding the table from itertuples(), which added a
# spurious 'Index' column, renamed columns that are not valid identifiers and
# produced a column-less table when no row qualified.
paper_counts = entity_stats.map(len)
frequent_enough = filtered_table_extractions["entity"].map(paper_counts) >= int(entity_frequency)
filtered_table_extractions = filtered_table_extractions[frequent_enough].reset_index(drop=True)
# Independent copy that the curation app edits.
curated_table_extractions = filtered_table_extractions.copy()

print("Done.")

In [None]:

from dash.exceptions import PreventUpdate

app = JupyterDash('Extracted Entities Curation App')

server = app.server

from operator import ge, gt, lt, le, eq, ne


# Spellings accepted for each filter operator; the first spelling of each row
# (stripped) is the canonical name returned by split_filter_part.
operators = [['ge ', '>='],
             ['le ', '<='],
             ['lt ', '<'],
             ['gt ', '>'],
             ['ne ', '!='],
             ['eq ', '='],
             ['contains '],
             ['datestartswith ']]

# Options of the entity-frequency comparison dropdown.
dropdown_freq_filter_list = [{"label":">","value":"gt"},
                             {"label":">=","value":"ge"},
                             {"label":"<","value":"lt"},
                             {"label":"<=","value":"le"},
                             {"label":"=","value":"eq"},
                             {"label":"!=","value":"ne"}]


def _parse_filter_value(value_part):
    """Convert the text after the operator into a filter value.

    Quoted text is unquoted (escaped quote characters are restored), numeric
    text becomes a float, anything else is returned as a stripped string.
    An empty value yields an empty string instead of raising IndexError.
    """
    value_part = value_part.strip()
    if not value_part:
        return value_part
    v0 = value_part[0]
    if (v0 == value_part[-1] and v0 in ("'", '"', '`')):
        return value_part[1: -1].replace('\\' + v0, v0)
    try:
        return float(value_part)
    except ValueError:
        return value_part


def split_filter_part(filter_part):
    """Split one `{column} operator value` filter expression.

    The operator is first looked for directly after the closing brace of the
    column name, so operator-like text inside the value (for instance "le " in
    "apple pie") is no longer mistaken for the operator. Expressions that do
    not match that shape fall back to the original "first operator found
    anywhere" scan.

    Args:
        filter_part: a single filter expression.

    Returns:
        A tuple (column name, canonical operator name, value), or
        [None, None, None] when no known operator is found.
    """
    # Preferred parse: operator anchored right after "{column}".
    name_end = filter_part.find('}')
    if filter_part.lstrip().startswith('{') and name_end != -1:
        name = filter_part[filter_part.find('{') + 1: name_end]
        remainder = filter_part[name_end + 1:].lstrip()
        for operator_type in operators:
            for token in operator_type:
                if remainder.startswith(token):
                    value = _parse_filter_value(remainder[len(token):])
                    return name, operator_type[0].strip(), value

    # Fallback: original behaviour for expressions without the usual shape.
    for operator_type in operators:
        for token in operator_type:
            if token in filter_part:
                name_part, value_part = filter_part.split(token, 1)
                name = name_part[name_part.find('{') + 1: name_part.rfind('}')]
                return name, operator_type[0].strip(), _parse_filter_value(value_part)

    return [None] * 3

# Define UI layout

# Upload control: a button wrapped in dcc.Upload. The uploaded file is
# received by the update_output callback through 'datatable-upload'.
button_group = dbc.ButtonGroup(
    [
        dcc.Upload(
                id='datatable-upload',
                children=html.Div([
                    dbc.Button("Load a CSV File", color="primary", className="mr-1",id="load_file"),
                    dbc.Tooltip(
                        "Load extracted entities in CSV format",
                        target="load_file",
                        placement="bottom",
                    )
                ]),
            className="mr-1"
        )
    ],
     className="mr-1"
)

# NOTE: this rebinds the name `buttons`, which the topic widget cell above
# used for an ipywidgets.HBox.
buttons = dbc.FormGroup(
            [
                 button_group
            ]
        )

# Entity frequency filter: comparison operator plus threshold. Both values are
# inputs of the update_output callback.
dropdown = dbc.FormGroup(
    [
        dbc.InputGroupAddon(
            # Label typo fixed ("Entity requency").
            dbc.Button("Entity frequency", color="primary"),
            addon_type="prepend",
            className="mr-1"
        ),
        dcc.Dropdown(
            id='dropdown-freq-filter',
            value="ge",
            clearable=False,
            options = dropdown_freq_filter_list,
            
            className="mr-1"
        ),
        daq.NumericInput(
            id="entityfreqslider",
            # The threshold cannot go below the frequency used for the initial pre-filter.
            min=entity_frequency,  
            max=1000,
            value=entity_frequency,
           className="mr-1"
        )
    ],
    className="mr-1"
)


# "Keep" dropdown: entities listed here are kept regardless of the frequency
# filter. Its options are supplied by the update_filter callback.
# An earlier dbc.FormGroup definition with the same component id was removed:
# it was overwritten by this assignment before ever being used.
term_filters = dbc.InputGroup(
    [
        dbc.InputGroupAddon(
            "Keep",
            addon_type="prepend",
        ),
         dcc.Dropdown(
            id="term_filters",
            multi=True,
             value=term_filter_options,
             style={
                 "width":"80%"
             },
             placeholder="Search for entities to keep",
             
        )
        
    ],
    className="mb-1"
)



# Reset button. NOTE: the name `reset` is rebound further down by the callback
# function of the same name; `form_table` is built before that happens, so the
# layout still receives this form group.
reset = dbc.FormGroup(
    [
        dbc.Button("Reset", color="primary", className="mr-1",id='table-reset'),
        dbc.Tooltip(
            "Reset table and graph to original extracted entities and default filters",
            target="table-reset",
            placement="bottom",
        )
    ]
)
                        

# Toolbar shown above the table: upload, frequency filter, reset, "Keep" terms.
form_table = dbc.Form([buttons, dropdown,reset,term_filters],inline=True)

# Page layout: toolbar row, entity table row, scatter plot row.
app.layout = html.Div(
    [
       dbc.Row(
            dbc.Col(
                form_table
            )
        ),
        dbc.Row(
            dbc.Col(
                # The table's data, columns and page count are all produced by
                # the update_output callback.
                dash_table.DataTable(
                    id='datatable-upload-container',
                    style_cell={
                        'whiteSpace': 'normal'
                    },

                    # Initial striping of odd rows; the update_styles callback
                    # writes to this same property.
                    style_data_conditional=[
                        {
                            'if': {'row_index': 'odd'},
                            'backgroundColor': 'rgb(248, 248, 248)'
                        }
                    ],
                    style_header={
                        'backgroundColor': 'rgb(230, 230, 230)',
                        'fontWeight': 'bold'
                    },

                    css=[
                        {
                            'selector': 'dash-fixed-content',
                            'rule': 'height: 100%;'
                        }
                    ],
                    # Sorting, filtering and paging are set to "custom": the
                    # update_output callback applies them in Python.
                    sort_action="custom", #native
                    sort_mode="multi",
                    column_selectable="multi",
                    filter_action="custom",
                    filter_query='',
                    selected_columns=[],
                    page_action="custom", #native
                    export_format='csv',
                    export_headers='display',
                    merge_duplicate_headers=True,
                    selected_rows=[],
                    page_current=0,
                    page_size=10,
                    sort_by=[]
                )
            )
        ),
        dbc.Row(
            
            # Filled by the display_graph callback.
            dbc.Col(dcc.Graph(id='datatable-upload-Scatter'))
           
        )
    ]
)

# Define callbacks

def parse_contents(contents, filename):
    """Decode an uploaded CSV file into a DataFrame.

    Args:
        contents: upload payload of the form "<content type>,<base64 data>".
        filename: name of the uploaded file; it must contain "csv".

    Returns:
        The parsed pandas DataFrame.

    Raises:
        ValueError: if the file name is missing or does not denote a CSV file.
            The function used to return None in that case, which made the
            caller fail later with an unrelated AttributeError.
    """
    # Imported here because no cell of the notebook imports these two modules.
    import base64
    import io

    if not filename or 'csv' not in filename:
        raise ValueError(f"Unsupported upload {filename!r}: only CSV files can be loaded")

    # Split on the first comma only: everything after it is the payload.
    content_type, content_string = contents.split(',', 1)
    decoded = base64.b64decode(content_string)
    return pd.read_csv(io.StringIO(decoded.decode('utf-8')))

@app.callback(
    Output('datatable-upload-container', 'style_data_conditional'),
    [Input('datatable-upload-container', 'selected_columns')]
)
def update_styles(selected_columns):
    """Return one highlight rule per selected table column.

    Args:
        selected_columns: ids of the columns currently selected in the table.

    Returns:
        A list of conditional style rules giving each selected column a
        light-blue background.
    """
    highlight_rules = []
    for column_id in selected_columns:
        highlight_rules.append({
            'if': {'column_id': column_id},
            'background_color': '#D2F3FF'
        })
    return highlight_rules



@app.callback(
    Output("term_filters", "options"),
    [Input("term_filters", "search_value")],
    [State("term_filters", "value"),
    State('datatable-upload-container', 'data')],
)
def update_filter(search_value, values,data):
    """Provide the options of the "Keep" dropdown while the user types.

    The options are the already selected values followed by every distinct
    entity of `non_deleted_table_extractions` that contains the search text.

    Changes compared with the previous version:
      * the search text is matched literally, so characters such as "(" or
        "+" no longer raise a regular-expression error;
      * a missing search text (None) is no longer searched as the word "None";
      * rows without an entity are ignored instead of breaking the mask;
      * each option is emitted once.

    Args:
        search_value: text typed in the dropdown (may be None or empty).
        values: currently selected entities (may be None).
        data: current table rows (unused).

    Returns:
        A list of {"label", "value"} option dictionaries.

    Raises:
        PreventUpdate: when there is neither a search text nor a selection.
    """
    if not search_value and values is None:
        raise PreventUpdate

    options = []
    already_listed = set()

    def _add_option(entity):
        # Keep the first occurrence only; duplicate options confuse the dropdown.
        if entity not in already_listed:
            already_listed.add(entity)
            options.append({"label": entity, "value": entity})

    # Selected values must stay among the options, otherwise they disappear.
    for value in values or []:
        _add_option(value)

    if search_value is not None:
        entities = non_deleted_table_extractions["entity"]
        matching = entities[entities.str.contains(str(search_value), regex=False, na=False)]
        for entity in matching.unique():
            _add_option(entity)

    return options


@app.callback([Output('entityfreqslider', 'value'),
               Output('dropdown-freq-filter', 'value')],
              [ Input('table-reset', 'n_clicks')],
             [State('entityfreqslider', 'value'),
              State('dropdown-freq-filter', 'value')])
def reset(reset, entityfreq,freqoperator):
    """Restore the default frequency filter and the original extraction tables.

    Runs when the 'Reset' button is clicked and also when the callback fires
    without any trigger. Both working tables are pointed back at
    `filtered_table_extractions`.

    Args:
        reset: click count of the reset button (unused).
        entityfreq: current frequency threshold (unused).
        freqoperator: current comparison operator (unused).

    Returns:
        [entity_frequency, "ge"], the default threshold and operator; None for
        any other trigger.
    """
    global curated_table_extractions
    global non_deleted_table_extractions

    fired = dash.callback_context.triggered
    source_id = fired[0]['prop_id'].split('.')[0] if fired else 'No clicks yet'
    if source_id not in ("table-reset", "No clicks yet"):
        return None

    curated_table_extractions = filtered_table_extractions
    non_deleted_table_extractions = filtered_table_extractions
    return [entity_frequency, "ge"]
    
import traceback    

def get_freq(row, operator, filter_value, term_filters):
    """Tell whether a table row passes the entity frequency filter.

    A row passes when the number of distinct papers recorded for its entity in
    `entity_stats` compares successfully with `filter_value`, or when its
    lower-cased entity is one of the always-kept `term_filters`.

    The operator name arrives from the browser, so it is looked up in a fixed
    set of comparison functions instead of being passed to `eval`.

    Args:
        row: table row exposing an 'entity' item.
        operator: one of 'ge', 'gt', 'lt', 'le', 'eq', 'ne'.
        filter_value: frequency threshold (converted with int()).
        term_filters: lower-cased entities to keep unconditionally.

    Returns:
        True when the row must be kept, False otherwise.

    Raises:
        ValueError: if `operator` is not one of the supported names.
    """
    # Local alias: the `operator` parameter hides the standard library module.
    import operator as operator_module

    if operator not in ('ge', 'gt', 'lt', 'le', 'eq', 'ne'):
        raise ValueError(f"Unsupported comparison operator: {operator!r}")
    compare = getattr(operator_module, operator)
    return compare(len(entity_stats[row['entity']]), int(filter_value)) or str(row['entity']).lower() in term_filters
        
non_deleted_table_extractions = filtered_table_extractions        
@app.callback([
               Output('datatable-upload-container', 'data'),
               Output('datatable-upload-container', 'columns'),
               Output('datatable-upload-container', 'editable'),
               Output('datatable-upload-container', 'row_deletable'),
               Output('datatable-upload-container', 'page_count')],
              [Input('datatable-upload-container', 'page_size'),
               Input('datatable-upload-container', 'page_current'),
               Input('datatable-upload-container','data_timestamp'),
               Input('datatable-upload', 'contents'),
               Input('entityfreqslider', 'value'),
               Input('dropdown-freq-filter', 'value'),
              Input('datatable-upload-container', 'sort_by'),
              Input('datatable-upload-container', 'filter_query')],
              [State("datatable-upload-container", "data"),
               State("datatable-upload-container", "columns"),
              State('datatable-upload', 'filename'),
              State('datatable-upload-container', 'derived_viewport_data'),
                State("term_filters", "value")
              ])

def update_output(page_size, page_current,ts,upload,entityfreq,
                  freqoperator,sort_by,filter_query,data,
                  columns, filename,derived_viewport_data, 
                  term_filters):
    """Compute the page of rows shown in the entity table.

    Steps, in order:
      1. update the global `curated_table_extractions` (new upload, or rows
         deleted by the user in the table);
      2. apply the entity frequency filter when one of its controls changed;
      3. apply the column filters of `filter_query`;
      4. sort according to `sort_by`;
      5. cut out the requested page.

    Fixes compared with the previous version:
      * `page_count` is rounded up, so the last, partially filled page is
        reachable;
      * an uploaded file is parsed only when the upload triggered the callback;
        before, it was parsed again on every callback, which undid later row
        deletions;
      * on error the traceback is printed and the table is left unchanged
        (PreventUpdate) instead of returning None to a multi-output callback;
      * the reset branch stores a DataFrame rather than a list of records.

    Returns:
        [rows of the page, column definitions, editable flag,
        row_deletable flag, page count].

    Raises:
        PreventUpdate: when the table could not be computed.
    """
    global curated_table_extractions
    global non_deleted_table_extractions
    try:
        ctx = dash.callback_context
        if not ctx.triggered:
            button_id = 'No clicks yet'
        else:
            button_id = ctx.triggered[0]['prop_id'].split('.')[0]

        # Entities listed in the "Keep" dropdown, compared case-insensitively.
        if term_filters is not None:
            term_filters = [str(term_filter_value).lower() for term_filter_value in term_filters]
        else:
            term_filters = []

        if upload is not None and button_id == "datatable-upload":
            curated_table_extractions = parse_contents(upload, filename).copy()

        elif button_id == "table-reset":
            curated_table_extractions = filtered_table_extractions.copy()

        elif derived_viewport_data and data is not None:
            # Rows that were visible but are missing from `data` were deleted by
            # the user; all occurrences of their entity are removed, unless the
            # entity is protected by the "Keep" list.
            removed = [row for row in derived_viewport_data
                       if row not in data and str(row["entity"]).lower() not in term_filters]
            for row in removed:
                removed_entity = str(row["entity"]).lower()
                curated_table_extractions = curated_table_extractions[
                    curated_table_extractions.entity.str.lower() != removed_entity]
                non_deleted_table_extractions = non_deleted_table_extractions[
                    non_deleted_table_extractions.entity.str.lower() != removed_entity]

        result = curated_table_extractions
        columns = [{"name": i, "id": i, "clearable": True, "selectable": True,
                    "renamable": True, "hideable": True, "deletable": False}
                   for i in curated_table_extractions.columns]

        # Frequency filter: recomputed from the non-deleted rows only when one
        # of its two controls triggered the callback.
        if button_id in ("entityfreqslider", "dropdown-freq-filter") and 'paper_id' in curated_table_extractions:
            keep_row = non_deleted_table_extractions.apply(
                lambda row: get_freq(row, freqoperator, entityfreq, term_filters), axis=1)
            curated_table_extractions = non_deleted_table_extractions[keep_row]
            result = curated_table_extractions

        # Filter by properties
        dff = result
        if filter_query:
            filtering_expressions = filter_query.split(' && ')
            for filter_part in filtering_expressions:
                col_name, filter_operator, filter_value = split_filter_part(filter_part)

                if filter_operator in ('eq', 'ne', 'lt', 'le', 'gt', 'ge'):
                    dff = dff.loc[getattr(dff[col_name], filter_operator)(filter_value)]
                elif filter_operator == 'contains':
                    dff = dff.loc[dff[col_name].str.contains(filter_value)]
                elif filter_operator == 'datestartswith':
                    dff = dff.loc[dff[col_name].str.startswith(filter_value)]

        # Sorting by properties
        if sort_by and len(sort_by):
            result_sorted = dff.sort_values(
                [col['column_id'] for col in sort_by],
                ascending=[
                    col['direction'] == 'asc'
                    for col in sort_by
                ],
                inplace=False
            )
        else:
            result_sorted = dff

        result_paginated = result_sorted.iloc[
            page_current*page_size:(page_current + 1)*page_size
        ]

        # Round up: a partially filled last page is still a page.
        page_count = (len(result_sorted) + page_size - 1) // page_size

        return result_paginated.to_dict('records'), columns, True, True, page_count
    except Exception as e:
        traceback.print_exc()
        # Leave the table as it is rather than handing None to Dash.
        raise PreventUpdate from e





@app.callback([Output('datatable-upload-Scatter', 'figure')],
              [Input('datatable-upload-container', 'data_timestamp'),
               Input('datatable-upload-container', 'data')],)
def display_graph(dts, rows):
    """Plot, per entity, the number of distinct papers that mention it.

    The figure is computed from the global `curated_table_extractions`. An
    empty figure is returned when the table is empty or lacks one of the
    'paper_id', 'entity_type' and 'entity' columns; previously a table without
    'paper_id' ended in an unbound local variable.

    The paper identifiers are normalised on a private copy, so the shared
    table is no longer modified as a side effect of drawing the graph.

    Args:
        dts: timestamp of the last table change (unused, acts as a trigger).
        rows: rows currently shown in the table (unused, acts as a trigger).

    Returns:
        A one-element list holding the figure.
    """
    empty_scatter = {
        'data': [{
            'x': [],
            'y': []
        }]
    }
    df = curated_table_extractions
    needed_columns = ["paper_id", "entity_type", "entity"]

    if df.empty or len(df.columns) < 1:
        return [empty_scatter]
    if any(column not in df for column in needed_columns):
        return [empty_scatter]

    plot_df = df[needed_columns].copy()
    # Keep only the part of the identifier before the first ':'.
    plot_df["paper_id"] = plot_df["paper_id"].transform(lambda x:  str(x).split(":")[0])
    df_grouped = plot_df.groupby(["entity","entity_type"]).paper_id.nunique().reset_index()
    df_grouped = df_grouped.rename(columns={"paper_id": "Frequency"})
    scatter = px.scatter(df_grouped, x=df_grouped.entity, y=df_grouped.Frequency, color="entity_type")
    return [scatter]


app.width = "100%"
app.height = "100%"
app.run_server(mode="jupyterlab",port=8071)

In [None]:
from typing import Iterator, Dict
import pandas as pd
import numpy as np
from pygments import highlight
from pygments.lexers import JsonLdLexer, TurtleLexer
from pygments.formatters import TerminalFormatter, TerminalTrueColorFormatter
import json
import uuid
from tqdm.notebook import tqdm

def pretty_print(a_json):
    """Print `a_json` as indented JSON highlighted with the JSON-LD lexer for a terminal."""
    serialized = json.dumps(a_json, indent=2)
    colored = highlight(serialized, JsonLdLexer(), TerminalFormatter())
    print(colored)

In [None]:
%%time
from kgforge.core import Resource
from kgforge.specializations.mappings import DictionaryMapping
import uuid


# Paths of the two mapping files are taken from environment variables; the
# cell stops if either of them is not set.
ANNOTATION_MAPPING_FILE = os.getenv("ANNOTATION_MAPPING_FILE") 
assert (ANNOTATION_MAPPING_FILE is not None) 

PROPERTY_MAPPING_FILE = os.getenv("PROPERTY_MAPPING_FILE") 
assert (PROPERTY_MAPPING_FILE is not None) 

annotation_maping = DictionaryMapping.load(ANNOTATION_MAPPING_FILE)
property_maping = DictionaryMapping.load(PROPERTY_MAPPING_FILE)

# One dictionary per curated row, each given a fresh random identifier.
ressources_json = curated_table_extractions.to_dict('records')
ressources_json = [dict(resource_json, **{"id":str(uuid.uuid4())}) for resource_json in tqdm(ressources_json)]

print("Preparing "+str(len(curated_table_extractions))+" selected entities for ontology linking ...")
# The same records are mapped twice: once with the annotation mapping and once
# with the property mapping. The two results are combined by index in the loop
# that follows.
annotations = forge.map(ressources_json,[annotation_maping],na='')
ressources_prop_mapped = forge.map(ressources_json,[property_maping],na='')
print("Done ")

import math

for i,r in tqdm(enumerate(ressources_json)):
    if 'property' in r and not math.isnan(r['property']):
        annotations[i].target.selector.value.__setattr__(r['property'], ressources_prop_mapped[i])
        annotations[i].body.__setattr__(r['property'], ressources_prop_mapped[i])
        
print(f'{len(annotations)} annotations created.')


In [None]:
import pickle

import faiss
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

from joblib import parallel_backend
class Candidate:
    """A possible match for a mention.

    The five constructor arguments are stored unchanged as instance attributes.
    """

    def __init__(self, distance, alias, uid, concept, definition):
        self.distance, self.alias, self.uid = distance, alias, uid
        self.concept, self.definition = concept, definition

    def __repr__(self):
        # Attributes are rendered in the order they were assigned on the instance.
        rendered = [f"{name}={value!r}" for name, value in vars(self).items()]
        return "Candidate(" + ", ".join(rendered) + ")"

class EntityLinker:
    """Link textual mentions to ontology concepts by nearest-neighbour search over TF-IDF vectors.

    With ``bulk=True`` the index is a faiss ``IndexFlatL2``; otherwise it is a
    scikit-learn ``NearestNeighbors`` instance.
    """

    def __init__(self, bulk):
        self.bulk = bulk
        self.ontology = None  # uid -> (concept, definition); set by train() / from_pretrained()
        self.aliases = None  # list of (alias, uid) pairs, one per indexed vector
        self.model = None  # fitted TfidfVectorizer
        self.index = None  # faiss index (bulk) or NearestNeighbors

    def link(self, mentions, threshold=0.8):
        """Return one entry per mention: the chosen Candidate, or None when nothing is close enough.

        The three nearest aliases of every mention are considered.
        """
        selections = self.candidates(mentions, 3)
        return [self.disambiguate(cs, m, None, threshold) for m, cs in selections]

    def disambiguate(self, candidates, mention, context, threshold):
        """Pick one candidate, or None.

        An exact match (distance 0) always wins; among several exact matches the
        one with the shortest concept label is taken. Otherwise the nearest
        candidate is returned if its distance does not exceed ``threshold``.
        ``mention`` and ``context`` are currently unused.
        """
        # TODO Disambiguation requires the component to be part of the NLP pipeline.
        if not candidates:
            return None
        zeros = [x for x in candidates if x.distance == 0]
        if zeros:
            # min() returns the first minimal element, like sorted(...)[0] did.
            return min(zeros, key=lambda x: len(x.concept))
        chosen = min(candidates, key=lambda x: x.distance)
        return chosen if chosen.distance <= threshold else None

    def candidates(self, mentions, limit):
        """Return ``[(mention_label, [Candidate, ...]), ...]`` in the order of ``mentions``.

        Every mention is converted with ``str``; identical labels are embedded and
        searched only once. An empty input yields an empty list.
        """
        def to_candidate(distance, position):
            alias, uid = self.aliases[int(position)]
            return Candidate(distance, alias, uid, *self.ontology[uid])

        labels = [str(mention) for mention in mentions]
        if not labels:
            return []
        # De-duplicate while keeping a deterministic order.
        unique_labels = list(dict.fromkeys(labels))
        embeddings = self.model.transform(unique_labels)

        if self.bulk:
            distances, indexes = self.index.search(embeddings.toarray(), limit)
        else:
            with parallel_backend('threading', n_jobs=10):
                distances, indexes = self.index.kneighbors(embeddings, limit)
        # Pair each distance with its alias position: shape (n_labels, limit, 2).
        results = np.stack((distances, indexes), axis=2)
        by_label = {
            label: [to_candidate(d, i) for d, i in rows]
            for label, rows in zip(unique_labels, results)
        }
        return [(label, by_label[label]) for label in labels]

    def train(self, ontology, model_params, index_params):
        """Fit the TF-IDF model on all labels and aliases of ``ontology`` and build the index.

        ``ontology`` maps a uid to a sequence read as ``(concept, aliases, definition)``.
        Aliases whose TF-IDF vector is all zeros are dropped from the index.
        ``index_params`` is only used when ``bulk`` is false.
        """
        self.ontology = {k: (v[0], v[2]) for k, v in ontology.items()}
        self.model = TfidfVectorizer(**model_params)
        aliases = [(x, k) for k, v in ontology.items() for x in [v[0], *v[1]]]
        embeddings = self.model.fit_transform(x for x, _ in aliases)
        flags = np.array(embeddings.sum(axis=1) != 0).reshape(-1)
        filtered_embeddings = embeddings[flags]
        # Keep self.aliases aligned with the rows that are actually indexed.
        self.aliases = [t for t, f in zip(aliases, flags) if f]
        if self.bulk:
            self.index = faiss.IndexFlatL2(filtered_embeddings.shape[1])
            self.index.add(filtered_embeddings.toarray())
        else:
            self.index = NearestNeighbors(**index_params)
            self.index.fit(filtered_embeddings)
        self._stats()

    def save_pretrained(self, dirpath):
        """Persist this linker under ``dirpath``.

        ``{dirpath}/model`` receives the pickled ontology, aliases and model (plus
        the index when not in bulk mode); in bulk mode the faiss index is written
        to ``{dirpath}/index``.
        """
        # Serialise this instance's state (not a module-level `linker` variable).
        with open(f'{dirpath}/model', 'wb') as f:
            pickle.dump(self.ontology, f)
            pickle.dump(self.aliases, f)
            pickle.dump(self.model, f)
            if not self.bulk:
                pickle.dump(self.index, f)
        if self.bulk:
            faiss.write_index(self.index, f'{dirpath}/index')

    @staticmethod
    def from_pretrained(dirpath, bulk):
        """Rebuild a linker from the files written by ``save_pretrained``.

        ``bulk`` must match the mode the linker was saved with.
        """
        linker = EntityLinker(bulk)
        # NOTE: pickle executes arbitrary code on load; only open model files from a trusted source.
        with open(f'{dirpath}/model', 'rb') as f:
            linker.ontology = pickle.load(f)
            linker.aliases = pickle.load(f)
            linker.model = pickle.load(f)
            if not bulk:
                linker.index = pickle.load(f)
        if bulk:
            # Counterpart of faiss.write_index() in save_pretrained().
            linker.index = faiss.read_index(f'{dirpath}/index')
        linker._stats()
        return linker

    def _stats(self):
        """Print how many concepts and aliases the linker knows."""
        ccount = len(self.ontology)
        tcount = len(self.aliases)
        print(f'INFO   EntityLinker   Links to {ccount} concepts ({tcount} aliases).')

In [None]:
%%time
# The directory of the pre-trained linker is read from the environment and is mandatory.
ONTOLOGY_LINKING_MODEL_PATH = os.getenv("ONTOLOGY_LINKING_MODEL_PATH")
assert (ONTOLOGY_LINKING_MODEL_PATH is not None)
# bulk=False: the index is restored from the pickled model file rather than from a faiss index file.
linker = EntityLinker.from_pretrained(ONTOLOGY_LINKING_MODEL_PATH, bulk=False)

In [None]:
%%time
from typing import Iterable, Dict, Iterator
from copy import deepcopy

def enrich_annotations(annotations: Iterable[Dict], linker: EntityLinker) -> Iterator[Dict]:
    """Yield a copy of every annotation whose body points at the linked ontology term.

    The label found at ``target.selector.value.label`` of each annotation is
    passed to ``linker.link``. When a candidate is returned, the copy's body
    carries the candidate's uid, concept label and definition; otherwise the
    body keeps its own label under an identifier in the covid19-kg entity
    namespace. The input annotations are not modified.
    """
    # Materialise once: the input is read twice (mentions, then the pairing below),
    # which would silently yield nothing for a one-shot iterator.
    annotations = list(annotations)

    def _enrich(ann, can):
        new = deepcopy(ann)
        body_type = forge.as_json(ann)["body"]["@type"]
        if can:
            new.body = {
                '@id': can.uid,
                '@type': body_type,
                'label': can.concept,
                'definition': can.definition
            }
        else:
            new.body = {
                # NOTE(review): id[0] is the first element of the id (its first
                # character if the id is a plain string) — confirm this is intended.
                '@id': 'https://bbp.epfl.ch/nexus/v1/resources/covid19-kg/data/entity/'+new.body.id[0],
                '@type': body_type,
                'label': new.body.label
            }
        return new

    mentions = [x.target.selector.value.label for x in annotations]
    # Report the number of mentions actually linked rather than a notebook global.
    print("Linking "+str(len(mentions))+" extracted entities to ontology terms ...")
    linked_mentions = linker.link(mentions)
    return (_enrich(ann, can) for ann, can in tqdm(zip(annotations, linked_mentions), total=len(annotations)))

enriched_annotations = list(enrich_annotations(annotations, linker))
print("Done")

# Validate the knowledge graph
The content of the knowledge graph is validated. In this version, syntactic validation (e.g. are the identifiers correct?) is performed while the knowledge graph is being built: if the graph is built successfully, validation passes. In case of a warning (e.g. because of an unusual character such as `+` in an extracted entity), the user can go back to the curation step and further curate the extracted entities.

In [None]:
%%time
# Build knowledge graph from enriched annotations
import json
from typing import Iterable, Dict
from rdflib import Graph

def load_knowledge_graph(jsonlds: Iterable[Dict]) -> Graph:
    """Parse every item, converted to expanded JSON-LD by ``forge``, into a single rdflib ``Graph``."""
    graph = Graph()
    for item in tqdm(jsonlds):
        expanded = forge.as_jsonld(item, form="expanded")
        payload = json.dumps(expanded)
        graph.parse(data=payload, format='json-ld')
    return graph

import rdflib

print("Generating the knowledge graph ...")
knowledge_graph = load_knowledge_graph(enriched_annotations)
print("Done.")
print(f'The knowledge graph has {len(knowledge_graph)} triples.')

# content_graph receives every triple whose subject is the body of an annotation.
content_graph = Graph()
for o in knowledge_graph.objects(None, rdflib.term.URIRef("http://www.w3.org/ns/anno.jsonld/hasBody")):
    body_subject = rdflib.term.URIRef(o)
    for ss, pp, oo in knowledge_graph.triples((body_subject, None, None)):
        content_graph.add((ss, pp, oo))

# Correct knowledge graph
Correction involves going back to the extraction and/or curation steps.

# Access the knowledge graph
The user can search, visualize, and export the knowledge graph.

In [None]:
from rdflib.extras.external_graph_libs import rdflib_to_networkx_digraph
import networkx as nx
from rdflib.namespace import RDF, RDFS, SKOS

In [None]:
%%time
print("Building co-mention graph")
# Keep only the annotation body, the exact selected text and the source of each annotation.
reshaped_enriched_annotations = forge.reshape(enriched_annotations, keep=["body","target.selector.exact","target.source"])

# Tabular form of the reshaped annotations; later statements address columns such as
# "body.@id" and "target.source".
enriched_annotations_df = forge.as_dataframe(reshaped_enriched_annotations)

def _build_co_mention(group_by, retrieve_key,df):
    entity_co_mention = df[[group_by,retrieve_key]].groupby(group_by)
    group_keys = list(entity_co_mention.groups.keys())
    all_co_mentions = [entity_co_mention.get_group(group_key)[retrieve_key].dropna().unique() for group_key in group_keys]
    return entity_co_mention, group_keys,all_co_mentions
        
# Imports come first so this cell no longer relies on an earlier cell having imported rdflib.
import rdflib
from rdflib import Graph, Namespace
from rdflib.paths import Path

enriched_entity_stats = _frequency(group_by="body.@id",retrieve_key="target.source",df=enriched_annotations_df,distinct_papers=True)
relation_stats = _frequency(group_by="property",retrieve_key="paper_id",df=curated_table_extractions,distinct_papers=True)

# Per source (paper): the distinct entity ids mentioned in it.
entity_co_mention, paper_ids, all_co_mentions = _build_co_mention(group_by="target.source",retrieve_key= "body.@id",df=enriched_annotations_df)

comention_graph = rdflib.ConjunctiveGraph()
COMENTION_PREDICATE = rdflib.term.URIRef("https://bbp.epfl.ch/nexus/v1/resources/covid19-kg/vocab/comention")

# entity id -> [(paper id, set of all entity ids mentioned in that paper), ...]
comentioned_dict = {}

# all_co_mentions is aligned with paper_ids, so the groups need not be recomputed.
for paper_id, co_mentions in zip(paper_ids, all_co_mentions):
    comentioned_entities = set(co_mentions)
    for comentioned_entity in comentioned_entities:
        comentioned_dict.setdefault(comentioned_entity, []).append((paper_id, comentioned_entities))

for ss, occurrences in comentioned_dict.items():
    subject = rdflib.term.URIRef(ss)
    for aPaper, co_mentioned_entities in occurrences:
        for co_mentioned in co_mentioned_entities:
            if ss == co_mentioned:
                continue
            other = rdflib.term.URIRef(co_mentioned)
            # The paper is the quad's context. A pair is stored in one direction only:
            # skip it when the reverse statement already exists for this paper.
            if (other, COMENTION_PREDICATE, subject, aPaper) not in comention_graph:
                comention_graph.add((subject, COMENTION_PREDICATE, other, aPaper))

print("Done ...")

In [None]:
%%time
from rdflib.extras.external_graph_libs import rdflib_to_networkx_digraph
import base64
import io
import dash
from dash.dependencies import Input, Output, State
import dash_core_components as dcc
import dash_html_components as html
import dash_table
import pandas as pd
import dash_cytoscape as cyto

# Load dash_cytoscape's extra layouts before any Cytoscape component is built.
cyto.load_extra_layouts()
def build_cytoscape_elements(comention_graph, content_graph, graph_type="comention"):
    """Convert an rdflib graph into a list of dash-cytoscape elements.

    ``graph_type="comention"`` renders ``comention_graph``; any other value
    renders ``content_graph``. Node labels, types and definitions are always
    looked up in ``content_graph``.

    Nodes are kept when their identifier starts with "http" (schema identifiers
    excluded) and they occur in at least one paper according to the module-level
    ``enriched_entity_stats``; their radius is 5 times the number of papers.
    Edges named "label", "definition" or "type" are skipped; the others are kept
    when backed by at least one paper, with a width of 2 times that number. For
    the co-mention graph the papers are the contexts of the matching quads (both
    directions); otherwise they are read from the module-level ``relation_stats``.

    Returns ``(elements, G)`` where ``G`` is the intermediate networkx digraph.
    """
    comention_predicate = rdflib.term.URIRef('https://bbp.epfl.ch/nexus/v1/resources/covid19-kg/vocab/comention')
    elements = []

    source_graph = comention_graph if graph_type == "comention" else content_graph
    G = rdflib_to_networkx_digraph(source_graph)

    def add_node(node_id, node_type, label=None, radius=30, frequency=1, definition="", papers=None):
        # Without an explicit label, fall back to the last path/fragment segment of the id.
        if label is not None:
            display_label = label.lower()
        else:
            display_label = str(node_id).lower().split("/")[-1].split("#")[-1]
        elements.append({
            "data": {
                "id": str(node_id).lower(),
                "frequency": frequency,
                "definition": definition,
                "papers": [] if papers is None else papers,
                "type": node_type
            },
            "style": {
                "label": display_label,
                "width": radius,
                "height": radius
            }
        })

    def add_edge(edge_id, from_id, to_id, label=None, thickness=2, frequency=1, papers=None):
        if thickness == 0:
            thickness = 2
        elements.append({
            "data": {
                "id": str(edge_id),
                # Node ids are lower-cased in add_node, so the endpoints must be too.
                "source": str(from_id).lower(),
                "target": str(to_id).lower(),
                "frequency": frequency,
                "papers": [] if papers is None else papers
            },
            "style": {
                "label": label if label else '',
                "width": thickness
            }
        })

    for node, node_attrs in G.nodes(data=True):
        node_iri = str(node)
        if not node_iri.startswith("http") or node_iri.startswith('https://bbp.epfl.ch/nexus/v1/resources/covid19-kg/schemas/'):
            continue
        node_label = content_graph.label(node, node_iri.split("/")[-1].split("#")[-1])
        node_type = content_graph.value(node, RDF.type, default="", any=True).split("/")[-1].split("#")[-1]
        node_definition = content_graph.value(node, SKOS.definition, default="", any=True)

        node_papers = enriched_entity_stats[node_iri]
        frequency = len(node_papers)
        if frequency >= 1:
            add_node(
                node_iri,
                node_type,
                label=node_label,
                radius=5 * frequency,
                frequency=frequency,
                definition=node_definition,
                papers=node_papers
            )

    for source, target, edge_attrs in G.edges(data=True):
        if not 'value' in edge_attrs and not 'width' in edge_attrs and 'weight' in edge_attrs:
            edge_attrs['value'] = edge_attrs['weight']
        if 'triples' in edge_attrs:
            # The predicate of the first triple behind this edge names the edge.
            edge_attrs['title'] = edge_attrs['triples'][0][1]
        edge_id = str(source).lower().replace(" ","_")+"_"+str(target).lower()
        edge_label = str(edge_attrs['title']).split("/")[-1].split("#")[-1]

        if edge_label in ("label", "definition", "type"):
            continue

        if graph_type == "comention":
            source_ref = rdflib.term.URIRef(source)
            target_ref = rdflib.term.URIRef(target)
            edge_papers = set()
            # A pair is stored in one direction only, so look the quads up both ways.
            for subject, obj in ((source_ref, target_ref), (target_ref, source_ref)):
                for q in comention_graph.quads((subject, comention_predicate, obj, None)):
                    edge_papers.add(q[3].identifier)
        else:
            # Was `relation_stats[edge_labelge]`, a NameError on every "kg" call.
            edge_papers = relation_stats[edge_label]

        paper_count = len(edge_papers)
        if paper_count >= 1:
            add_edge(
                edge_id,
                str(source),
                str(target),
                label=None if graph_type == "comention" else edge_label,
                thickness=2 * paper_count,
                frequency=paper_count,
                papers=list(edge_papers)
            )

    return elements, G

print("Generating Knowledge Graph and Co-mention visualisation data ...")
# G is rebound by the second call, so it ends up holding the digraph of the "kg" view.
comention_graph_cyto_elements, G = build_cytoscape_elements(comention_graph, content_graph, graph_type="comention")
knowledge_graph_cyto_elements, G = build_cytoscape_elements(comention_graph, content_graph, graph_type="kg")

# Element lookup by id for each of the two views.
comention_graph_cyto_elements_dict = {elt['data']['id']:elt for elt in comention_graph_cyto_elements}
knowledge_graph_cyto_elements_dict= {elt['data']['id']:elt for elt in knowledge_graph_cyto_elements}
print("Done.")

In [None]:
# ################################# Graph LAYOUT Definition ################################

# Values offered by the "Node Shape" dropdown.
node_shape_option_list = [
    'ellipse', 'triangle', 'rectangle', 'diamond', 'pentagon',
    'hexagon', 'heptagon', 'octagon', 'star', 'polygon',
]

# Image formats for the graph download.
dropdown_download_option_list = ['jpg', 'png', 'svg']

# Values offered by the "Layout" dropdown.
graph_layout_option_list = [
    'random', 'grid', 'circle', 'concentric', 'breadthfirst', 'cose',
    'cose-bilkent', 'dagre', 'cola', 'klay', 'spread', 'euler',
]

# The two views the user can switch between.
graph_type_option_list = ['Knowledge Graph', 'Co-mention Graph']


# Action row: reset, remove the current selection, and a download menu with one entry per image format.
button_group = dbc.InputGroup(
    [
        dbc.Button("Reset", color="primary", className="mr-1", id='bt-reset'),
        dbc.Button("Remove Selected Node", color="primary", className="mr-1", id='remove-button'),
        dbc.DropdownMenu(
            [
                dbc.DropdownMenuItem("png", id="png-menu"),
                dbc.DropdownMenuItem(divider=True),
                dbc.DropdownMenuItem("jpg", id="jpg-menu"),
                dbc.DropdownMenuItem(divider=True),
                dbc.DropdownMenuItem("svg", id="svg-menu"),
            ],
            label="Download",
            id='dropdown-download',
            color="primary",
            group=True,
            className="mr-1",
        ),
    ]
)

# The same button row wrapped in a form group.
buttons = dbc.FormGroup([button_group], className="mr-1")

# Radio buttons choosing which graph is displayed; the co-mention graph is the default.
radios_input = dbc.FormGroup(
    [
        dbc.Label("Display", html_for="showgraph", width=3),
        dbc.Col(
            dbc.RadioItems(
                id="showgraph",
                value='Co-mention Graph',
                options=[
                    {'label': graph_type.capitalize(), 'value': graph_type}
                    for graph_type in graph_type_option_list
                ],
            ),
            width=9,
        ),
    ],
    row=True,
)







# Search box variant with a prepended "Search" addon.
# Note: it uses the same component id ("searchdropdown") as `search` below.
input_group = dbc.InputGroup(
    [
        dbc.InputGroupAddon("Search", addon_type="prepend"),
        dcc.Dropdown(id="searchdropdown", multi=True, style={"width": "80%"}),
    ],
    className="mb-3",
)

# Search box variant laid out as a labelled form row.
search = dbc.FormGroup(
    [
        dbc.Label("Search", html_for="searchdropdown", width=3),
        dbc.Col(dcc.Dropdown(id="searchdropdown", multi=True), width=9),
    ],
    row=True,
)




# Node-frequency filter: a comparison operator (default "ge") plus a numeric threshold (1..10000).
node_slider = dbc.InputGroup(
    [
        dbc.InputGroupAddon(
            dbc.Button("Node frequency", color="primary"),
            addon_type="prepend",
            className="mr-1",
        ),
        dcc.Dropdown(
            id='node-freq-filter',
            value="ge",
            clearable=False,
            options=dropdown_freq_filter_list,
            className="mr-1",
        ),
        daq.NumericInput(
            id="nodefreqslider",
            min=1,
            max=10000,
            value=1,
            className="mr-1",
        ),
    ],
    className="mb-3",
)


# Empty container with id "modal".
item_details = dbc.FormGroup([html.Div(id="modal")])

# Details card, pre-filled with a sample entity; its body has the id "item-card-body".
item_details_card = dbc.Card(
    dbc.CardBody(
        [
            html.H5("therapeutic insulin", className="card-title"),
            html.H6("PROTEIN", className="card-subtitle"),
            html.P(
                "A synthetic or animal-derived form of insulin used in the treatment of diabetes mellitus. Therapeutic insulin is formulated to be short-, intermediate- and long-acting in order to individualize an insulin regimen according to individual differences in glucose and insulin metabolism. Therapeutic insulin may be derived from porcine, bovine or recombinant sources. Endogenous human insulin, a pancreatic hormone composed of two polypeptide chains, is important for the normal metabolism of carbohydrates, proteins and fats and has anabolic effects on many types of tissues.",
                className="card-text",
            ),
            dbc.Button("See more", color="primary", id="see-more-card"),
        ],
        id="item-card-body",
    )
)

# Content of the "Details" tab, top to bottom.
form = dbc.Form([button_group, radios_input, search, node_slider, item_details_card])



# Layout selector; 'circle' is the default.
graph_layout = dbc.FormGroup(
    [
        # The label now targets this group's own dropdown (it pointed at "searchdropdown").
        dbc.Label("Layout", html_for="dropdown-layout", width=3),
        dbc.Col(
            dcc.Dropdown(
                id='dropdown-layout',
                options=[{'label': val.capitalize(), 'value': val} for val in graph_layout_option_list],
                value='circle',
                clearable=False,
            ),
            width=9,
        ),
    ],
    row=True,
)

# Node shape selector; 'ellipse' is the default.
node_shape = dbc.FormGroup(
    [
        dbc.Label("Node Shape", html_for="dropdown-node-shape", width=3),
        dbc.Col(
            dcc.Dropdown(
                id='dropdown-node-shape',
                value='ellipse',
                clearable=False,
                options=[{'label': val.capitalize(), 'value': val} for val in node_shape_option_list],
            ),
            width=9,
        ),
    ],
    row=True,
)

# Edge colour picker.
link_color_picker = dbc.FormGroup(
    [
        dbc.Col(
            daq.ColorPicker(
                id='input-follower-color',
                value=dict(hex='#a0b3dc'),
                label="Edge Color",
            )
        ),
    ],
    row=True,
)

# Content of the "Graph Layout and Shape" tab.
conf_form = dbc.Form([graph_layout, node_shape, link_color_picker])


In [None]:
import json
import os

import dash
from dash.dependencies import Input, Output, State
import dash_core_components as dcc
import dash_html_components as html
import dash_bootstrap_components as dbc



import dash_cytoscape as cyto

from dash.exceptions import PreventUpdate

def load_json(st):
    """Load and return JSON from an HTTP(S) URL or from a local file.

    Args:
        st: A URL starting with ``http://`` or ``https://``, or a file path.

    Returns:
        The decoded JSON document.

    Raises:
        requests.HTTPError: If the URL answers with an error status.
    """
    location = str(st)
    # Match on the scheme prefix: a substring test would also send local paths
    # such as "data/http_cache.json" to the network.
    if location.startswith(("http://", "https://")):
        response = requests.get(location)
        # Fail loudly instead of parsing an error page as data.
        response.raise_for_status()
        return response.json()
    with open(location, 'rb') as f:
        return json.load(f)
    
# Load extra layouts
cyto.load_extra_layouts()
# Dash application (named "allvis") hosting the graph explorer.
app_tab =  JupyterDash("allvis")

app_tab.add_bootstrap_links = True
# NOTE(review): the theme is assigned as a plain attribute after construction, and as a
# single value rather than a list — confirm Dash actually picks this up (the constructor
# also accepts an `external_stylesheets` list).
app_tab.external_stylesheets=dbc.themes.CYBORG

# Underlying server object of the app.
server = app_tab.server


# Inline style of the container that holds the Cytoscape component.
CONTENT_STYLE = {
    "width": "70%",
    "top": "0px",
    "left": "0px",
    "bottom": "0px",
    "position": "fixed",
}

# Background colour per entity type.
colors = {
    "CHEMICAL": "green",
    "PROTEIN": "#469d8c",
    "DISEASE": "#dceef1",
    "CELL_TYPE": "#f1d2b5",
}

# Cytoscape stylesheet; the rule order is kept as it is part of the stylesheet.
cystoscape_STYLE_stylesheet = [
    {
        "selector": 'cytoscape',
        "style": {"width": "100%", "height": "100%"},
    },
    {
        "selector": 'node[type = "CHEMICAL"]',
        "style": {"background-color": colors["CHEMICAL"]},
    },
    {
        "selector": 'node',
        'style': {
            "font-size": 100,
            "font-weight": "bold",
            "text-valign": "center",
            "text-halign": "center",
            "text-outline-color": "#000000",
            "text-outline-width": "2px",
            "color": "black",
            "overlay-padding": "6px",
            "z-index": "10",
        },
    },
    {
        "selector": 'edge',
        "style": {
            'curve-style': 'bezier',
            'line-color': '#D5DAE6',
        },
    },
    # One background rule for each remaining entity type.
    *(
        {
            "selector": f'node[type = "{entity_type}"]',
            "style": {"background-color": colors[entity_type]},
        }
        for entity_type in ("PROTEIN", "DISEASE", "CELL_TYPE")
    ),
]
  


# Page layout: the graph in an 8-wide column, the tabbed control panel in a 4-wide column.
app_tab.layout = html.Div(
    [
        dcc.Store(id='memory', data={"removed": []}),
        dbc.Row(
            [
                # Graph canvas; starts with the co-mention graph.
                dbc.Col(
                    html.Div(
                        style=CONTENT_STYLE,
                        children=[
                            cyto.Cytoscape(
                                id='cytoscape',
                                elements=comention_graph_cyto_elements,
                                stylesheet=cystoscape_STYLE_stylesheet,
                                style={"width": "100%", "height": "100%"},
                            )
                        ],
                    ),
                    width=8,
                ),
                # Control panel with two tabs.
                dbc.Col(
                    html.Div(
                        children=[
                            dbc.Tabs(
                                id='tabs',
                                children=[
                                    dbc.Tab(
                                        label='Details',
                                        label_style={"color": "#00AEF9", "border-radius": "4px"},
                                        children=[dbc.Card(dbc.CardBody([form]))],
                                    ),
                                    dbc.Tab(
                                        label='Graph Layout and Shape',
                                        label_style={"color": "#00AEF9"},
                                        children=[dbc.Card(dbc.CardBody([conf_form]))],
                                    ),
                                ],
                            ),
                        ]
                    ),
                    width=4,
                ),
            ]
        ),
    ]
)
    


# ############################## CALLBACKS ####################################


@app_tab.callback(
    Output("modal-body-scroll", "is_open"),
    [
        Input("open-body-scroll", "n_clicks"),
        Input("close-body-scroll", "n_clicks"),
    ],
    [State("modal-body-scroll", "is_open")],
)
def toggle_modal(n1, n2, is_open):
    """Flip the modal's open state once either button has a truthy click count; otherwise keep it."""
    clicked = n1 or n2
    return (not is_open) if clicked else is_open


@app_tab.callback(
    Output("searchdropdown", "options"),
    [Input("searchdropdown", "search_value")],
    [State("searchdropdown", "value"),
    State('cytoscape', 'elements')],
)
def update_multi_options(search_value, value,elements):
    """Return the dropdown options matching the text being typed.

    An element with a label is offered when the typed text contains its label or
    is contained in it, or when the element is already selected in the dropdown.
    Nothing is updated while the search text is empty.
    """
    if not search_value:
        raise PreventUpdate

    already_selected = value or []
    return [
        {"label": element['style']['label'], "value": element['data']['id']}
        for element in elements
        if 'label' in element['style']
        and (
            search_value in element['style']['label']
            or element['style']['label'] in search_value
            or element['data']['id'] in already_selected
        )
    ]



@app_tab.callback(Output('nodefreqslider', 'value'),
              [Input('bt-reset', 'n_clicks')],[State('nodefreqslider', 'value')])
def display_freq_node(resetbt, nodefreqslider):
    """Set the node-frequency input back to 1 when the reset button is clicked.

    Any other invocation leaves the input untouched.
    """
    ctx = dash.callback_context
    triggered_id = ctx.triggered[0]['prop_id'].split('.')[0] if ctx.triggered else None

    if triggered_id == 'bt-reset':
        return 1
    # Falling through used to return None, which overwrote the input's value with null.
    raise PreventUpdate

@app_tab.callback(
    [
        Output('cytoscape', 'generateImage')
    ],
    [
        Input('jpg-menu', 'n_clicks'),
        Input('svg-menu', 'n_clicks'),
        Input('png-menu', 'n_clicks')
    ]
)
def download_image(jpg_menu,svg_menu,png_menu):
    """Ask the Cytoscape component to download the graph in the format of the clicked menu item.

    Nothing is requested when the callback was not triggered by one of the three
    menu items (previously a request with ``type: None`` was emitted).
    """
    ctx = dash.callback_context
    if not ctx.triggered:
        raise PreventUpdate
    button_id = ctx.triggered[0]['prop_id'].split('.')[0]

    image_type_by_menu = {"png-menu": "png", "jpg-menu": "jpg", "svg-menu": "svg"}
    ftype = image_type_by_menu.get(button_id)
    if ftype is None:
        raise PreventUpdate
    return [{
        'type': ftype,
        'action': "download"
    }]

# Ids of the elements removed with the "remove" button; read and updated by reset_layout().
removed = set()
from sqlalchemy.sql import select
from sqlalchemy.sql import and_, or_, not_

def list_papers(papers):
    """Fetch the metadata rows of the given articles.

    Args:
        papers: Iterable of ``article_id`` values.

    Returns:
        A list of rows with the columns title, authors, abstract, doi, url,
        journal, pmcid, pubmed_id and publish_time, read from the "articles"
        table through the module-level ``engine``. Empty when ``papers`` is empty.
    """
    papers = list(papers)
    # An empty OR clause would add no filter at all, so answer directly.
    if not papers:
        return []
    META_DATA = sqlalchemy.MetaData(bind=engine, reflect=True)
    articles = META_DATA.tables["articles"]
    wanted = or_(*[articles.c.article_id == paper_id for paper_id in papers])
    query = select([
        articles.c.title,
        articles.c.authors,
        articles.c.abstract,
        articles.c.doi,
        articles.c.url,
        articles.c.journal,
        articles.c.pmcid,
        articles.c.pubmed_id,
        articles.c.publish_time,
    ]).where(wanted)
    return list(engine.execute(query))

@app_tab.callback(
    [
        Output('cytoscape', 'zoom'),
        Output('cytoscape', 'elements')
    ],
    [
        Input('bt-reset', 'n_clicks'),
        Input('remove-button', 'n_clicks'),
        Input('showgraph', 'value'),
        Input('nodefreqslider', 'value'),
        Input('node-freq-filter', 'value'),
        Input("searchdropdown", "value")

     ],
     [
        State('cytoscape', 'elements'),
        State('cytoscape', 'selectedNodeData'),
        State('cytoscape', 'selectedEdgeData'),
        State('cytoscape', 'tapNodeData'),
        State('cytoscape', 'zoom'),
         State('nodefreqslider', 'value')

      ]
)

def reset_layout(resetbt, removebt, val, nodefreqslider, node_freq_operator,searchvalues,cytoelements, data, edge,tappednode,zoom,nodefreqsliderstate):
    """Recompute the displayed elements and zoom after any control changes.

    Steps, in order:
    1. switching the "Display" radio loads the elements of the chosen graph;
    2. elements chosen in the search dropdown are flagged as selected;
    3. a frequency threshold of 1 restores the full chosen graph, sets the zoom
       to 1 and forgets previous removals;
    4. the remove button drops the currently selected nodes and edges;
    5. a change of the frequency threshold filters the nodes of the full chosen
       graph with the chosen comparison operator.

    Returns:
        ``(zoom, elements)`` for the Cytoscape component.
    """
    global removed
    elements = cytoelements
    elements_dict = {elt['data']['id']: elt for elt in cytoelements}
    ctx = dash.callback_context
    if not ctx.triggered:
        button_id = 'No clicks yet'
    else:
        button_id = ctx.triggered[0]['prop_id'].split('.')[0]

    if button_id == 'showgraph':
        if val == 'Knowledge Graph':
            elements = knowledge_graph_cyto_elements
            elements_dict = knowledge_graph_cyto_elements_dict
        if val == 'Co-mention Graph':
            elements = comention_graph_cyto_elements
            elements_dict = comention_graph_cyto_elements_dict

    if searchvalues is not None:
        for searchvalue in searchvalues:
            # A searched element may be absent from the current view (other graph,
            # or removed); skip it instead of failing the whole callback.
            search_node = elements_dict.get(searchvalue)
            if search_node is not None:
                search_node["selected"] = True

    if nodefreqslider == 1:
        if val == 'Knowledge Graph':
            elements = knowledge_graph_cyto_elements
            elements_dict = knowledge_graph_cyto_elements_dict
        if val == 'Co-mention Graph':
            elements = comention_graph_cyto_elements
            elements_dict = comention_graph_cyto_elements_dict
        zoom = 1
        removed = set()

    if button_id == 'remove-button':
        # Start from an empty set so a click with nothing selected is a no-op, and
        # collect node and edge ids together rather than letting one overwrite the other.
        ids_to_remove = set()
        if elements and data:
            ids_to_remove.update(ele_data['id'] for ele_data in data)
        if elements and edge:
            ids_to_remove.update(ele_data['id'] for ele_data in edge)

        elements = [ele for ele in elements if ele['data']['id'] not in ids_to_remove]

        removed.update(ids_to_remove)

    if elements and (nodefreqslider is not None and button_id == 'nodefreqslider'):

        if val == 'Knowledge Graph':
            elements = knowledge_graph_cyto_elements
        if val == 'Co-mention Graph':
            elements = comention_graph_cyto_elements

        # SECURITY NOTE(review): eval() runs on a value sent by the browser. It is kept
        # because the set of operator names offered by the dropdown is defined elsewhere;
        # consider replacing it with an explicit name -> function lookup.
        compare = eval(node_freq_operator)
        threshold = int(nodefreqslider)
        # Only nodes (no 'source' key) that were not removed by hand are filtered.
        ids_to_remove = [
            ele_data['data']['id']
            for ele_data in elements
            if 'source' not in ele_data["data"]
            and ele_data["data"]["id"] not in removed
            and 'frequency' in ele_data['data']
            and ele_data['data']['frequency'] is not None
            and not compare(int(ele_data['data']['frequency']), threshold)
        ]

        elements = [ele for ele in elements if ele['data']['id'] not in ids_to_remove]

    return zoom, elements




def _build_paper_card(paper):
    """Return a card summarising one paper row returned by ``list_papers``.

    The row is read by position: 0 title, 1 authors, 2 abstract, 4 url,
    5 journal, 8 publication time. Missing (falsy) values are shown as ''.
    """
    title = paper[0] or ''
    authors = paper[1] or ''
    abstract = paper[2] or ''
    journal = paper[5] or ''
    url = paper[4] or ''
    publish_time = str(paper[8]) if paper[8] else ''

    # Keep the cards compact: show at most the first 500 characters of the abstract.
    if len(abstract) > 500:
        abstract = abstract[:500] + '...'

    return dbc.Card(
        dbc.CardBody(
            [
                html.H4(title, className="card-title"),
                html.H5(authors, className="card-subtitle"),
                html.H6(journal+"( "+publish_time+" )", className="card-subtitle"),
                html.P(abstract, className="card-text"),
                dbc.Button("View the paper", href=url,target="_blank",color="primary"),
            ]
        )
    )


@app_tab.callback([Output('item-card-body', 'children')],
                  [Input('cytoscape', 'tapNode'),Input('cytoscape', 'tapEdge')],
                  [State('cytoscape', 'selectedNodeData'),State('cytoscape', 'selectedEdgeData')])
def display_tap_node(datanode, dataedge,statedatanode,statedataedge):
    """Build the content of the 'item-card-body' card for the tapped node or edge.

    A node is described when both ``datanode`` and ``statedatanode`` are
    non-empty, an edge when both ``dataedge`` and ``statedataedge`` are.
    For a node the label, type and (optional) definition are shown. For both
    kinds, the element's 'papers' are resolved with ``list_papers`` and, when
    any are returned, rendered as cards inside a scrollable modal placed after
    a button with id 'open-body-scroll'.

    Parameters
    ----------
    datanode : dict or None
        Value of the cytoscape 'tapNode' property.
    dataedge : dict or None
        Value of the cytoscape 'tapEdge' property.
    statedatanode : list or None
        Value of the cytoscape 'selectedNodeData' property.
    statedataedge : list or None
        Value of the cytoscape 'selectedEdgeData' property.

    Returns
    -------
    list
        A one-element list, matching the one-element output list declared in
        the callback decorator.
    """
    papers = []
    res = []
    modal_button = None
    label = ""

    if datanode and statedatanode:
        node_data = datanode['data']
        # Test for the key itself: a substring search in str(dict) also matches
        # values that merely contain the word and then fails on the lookup.
        definition = str(node_data['definition']) if 'definition' in node_data else ""
        label = str(datanode['style']['label'])
        _type = str(node_data['type'])
        frequency = str(node_data['frequency'])
        res.append([
            html.H5(label, className="card-title"),
            html.H6(_type, className="card-subtitle"),
            html.P(definition, className="card-text"),
        ])
        label = "'"+label+"' mentioned in "+frequency+" papers"
        modal_button = dbc.Button(label, id="open-body-scroll",color="primary")

        papers = node_data['papers']

    if dataedge and statedataedge:
        edge_data = dataedge['data']
        # NOTE(review): the endpoints are always looked up in the co-mention graph
        # dictionary, also when the knowledge graph is displayed - confirm that the
        # ids are present in it in that case.
        source_node = comention_graph_cyto_elements_dict[edge_data['source']]
        source_label = source_node['style']['label']
        target_node = comention_graph_cyto_elements_dict[edge_data['target']]
        target_label = target_node['style']['label']
        frequency = str(edge_data['frequency'])
        mention_label = ''' '%s' mentioned in %s papers with '%s' ''' % (source_label, frequency, target_label)
        # Prefer the edge's own label; fall back to the generated sentence when it is empty.
        edge_label = str(dataedge['style']['label'])
        label = mention_label if edge_label == "" else edge_label
        modal_button = dbc.Button(label, id="open-body-scroll",color="primary")

        papers = edge_data['papers']

    if papers:
        papers_in_kg = list_papers(papers)

        if papers_in_kg:
            cards = dbc.Row([_build_paper_card(paper) for paper in papers_in_kg])

            modal = html.Div(
                [
                    modal_button,
                    dbc.Modal(
                        [
                            dbc.ModalHeader(label),
                            dbc.ModalBody(cards),
                            dbc.ModalFooter(
                                dbc.Button(
                                    "Close", id="close-body-scroll", className="ml-auto"
                                )
                            ),
                        ],
                        id="modal-body-scroll",
                        scrollable=True,
                        size="lg"
                    ),
                ]
            )
            if res:
                # Node case: the modal joins the node description in the same children list.
                res[0].append(modal)
            else:
                res.append(modal)
    else:
        res = [html.H5("Select an item for details", className="card-title")]

    if not res:
        # An edge whose papers could not be resolved would otherwise yield an empty
        # list, which does not match the single declared output: show its label.
        res = [html.H5(label, className="card-title")]

    return res



@app_tab.callback(Output('cytoscape', 'layout'),
              [Input('dropdown-layout', 'value')])
def update_cytoscape_layout(layout):
    """Return the cytoscape 'layout' property for the name chosen in 'dropdown-layout'."""
    layout_settings = {'name': layout}
    layout_settings['showlegend'] = True
    return layout_settings



@app_tab.callback(Output('cytoscape', 'stylesheet'),
                  [Input('cytoscape', 'tapNode'),
                   Input('cytoscape', 'selectedNodeData'),
                   Input('input-follower-color', 'value'),
                   Input('dropdown-node-shape', 'value'),
                   Input('showgraph', 'value')],
                   [State('cytoscape', 'stylesheet')])
def generate_stylesheet(node, selectedNodes,follower_color, node_shape, graphtype, original_stylesheet):
    """Extend the cytoscape stylesheet to highlight the tapped and selected nodes.

    For every focus node (each entry of ``selectedNodes`` followed by the tapped
    ``node``) the generic node/edge rules and a rule targeting that node are
    appended; when the focus node carries 'edgesData', its edges and the nodes
    at their other end are highlighted with ``follower_color['hex']``.

    Parameters
    ----------
    node : dict or None
        Value of the cytoscape 'tapNode' property.
    selectedNodes : list or None
        Value of the cytoscape 'selectedNodeData' property.
    follower_color : dict
        Value of 'input-follower-color'; its 'hex' entry is used as colour.
    node_shape : str
        Value of 'dropdown-node-shape', applied to every node.
    graphtype : str or None
        Value of 'showgraph'; nothing is changed while it is empty.
    original_stylesheet : list or None
        Current stylesheet of the cytoscape component.

    Returns
    -------
    list or None
        ``original_stylesheet`` itself when there is no graph type or tapped
        node, otherwise a new list made of the current rules plus the
        highlighting rules.

    NOTE(review): the rules are appended to the current stylesheet, which is
    itself the result of earlier calls, so the stylesheet grows with every
    invocation - consider rebuilding it from a fixed base stylesheet.
    """
    if not graphtype or not node:
        return original_stylesheet

    # ``node`` is known to be set at this point, so it is always a focus node.
    focus_nodes = list(selectedNodes) if selectedNodes else []
    focus_nodes.append(node)

    # Work on a copy (and tolerate a missing stylesheet) instead of mutating the input.
    stylesheet = list(original_stylesheet or [])

    for focus_node in focus_nodes:
        # Focus nodes either nest their id under 'data' or carry it directly
        # (presumably tapNode objects vs. selectedNodeData entries). Resolving the
        # id once also avoids the precedence trap of ``a == b if c else d``, which
        # evaluates as ``(a == b) if c else d``.
        focus_id = focus_node['data']['id'] if "data" in focus_node else focus_node['id']

        stylesheet.extend([
            {
                "selector": 'node',
                'style': {
                    'shape': node_shape,
                    "font-size": 100
                }
            },
            {
                "selector": "node:selected",
                "style": {
                    "border-width": "50px",
                    "border-color": "#AAD8FF",
                    "border-opacity": "0.5"
                }
            },
            {
                "selector": 'edge',
                "style": {
                    'curve-style': 'bezier',
                    'line-color': '#D5DAE6'
                }
            },
            {
                "selector": 'node[id = "{}"]'.format(focus_id),
                "style": {
                    "border-width": "50px",
                    "border-color": "#AAD8FF",
                    "border-opacity": "0.5",
                    "text-opacity": 1,
                    "font-size": 100,
                    'z-index': 9999
                }
            },
        ])

        if "edgesData" not in focus_node:
            continue

        for edge in focus_node['edgesData']:
            if edge['source'] == focus_id:
                # Outgoing edge: highlight the target node and the edge itself.
                stylesheet.append({
                    "selector": 'node[id = "{}"]'.format(edge['target']),
                    "style": {
                        'opacity': 0.9
                    }
                })
                stylesheet.append({
                    "selector": 'edge[id= "{}"]'.format(edge['id']),
                    "style": {
                        "mid-target-arrow-color": follower_color['hex'],
                        "line-color": follower_color['hex'],
                        'opacity': 0.9,
                        'z-index': 5000
                    }
                })

            if edge['target'] == focus_id:
                # Incoming edge: highlight the source node and the edge itself.
                stylesheet.append({
                    "selector": 'node[id = "{}"]'.format(edge['source']),
                    "style": {
                        'opacity': 0.9,
                        'z-index': 9999
                    }
                })
                stylesheet.append({
                    "selector": 'edge[id= "{}"]'.format(edge['id']),
                    "style": {
                        "mid-target-arrow-color": follower_color['hex'],
                        "line-color": follower_color['hex'],
                        'opacity': 1,
                        'z-index': 5000
                    }
                })

    return stylesheet


# Configure and start the Dash application that hosts the cytoscape graph view.
# NOTE(review): 'suppress_callback_exceptions' presumably disables Dash's validation of
# callbacks that reference components missing from the initial layout - confirm
# against the Dash documentation.
app_tab.config['suppress_callback_exceptions']=True
# Display size assigned to the application (full width, 800 pixels high).
app_tab.width = "100%"
app_tab.height = "800px"
# Start the server in "jupyterlab" mode on port 8072 (the port is passed as a string).
app_tab.run_server(mode="jupyterlab", port="8072")

# Version the knowledge graph
The user can register the knowledge graph, together with the extracted entities, as a dataset and tag it with a version.

In [None]:
import time
# Timestamp (YYYYmmdd-HHMMSS) shared by the names of all files written in this cell.
timestr = time.strftime("%Y%m%d-%H%M%S")

# Temporarily save the knowledge graph locally, serialized as Turtle.
# NOTE(review): the file is opened in binary mode, so this assumes that serialize()
# returns bytes - confirm for the installed library version.
kg_ttl = knowledge_graph.serialize(format="turtle",auto_compact=True)
kg_ttl_filename = "./kg_%s.ttl" % (timestr)
with open(kg_ttl_filename, 'wb') as outfile:
        outfile.write(kg_ttl)

        
# Temporarily save the extracted entities as a local csv file
table_extractions_filename = "./table_extractions_%s.csv" % (timestr)
table_extractions.to_csv(table_extractions_filename)


# Temporarily save the curated list of extracted entities as a local csv file
curated_table_extractions_filename = "./curated_table_extractions_%s.csv" % (timestr)
curated_table_extractions.to_csv(curated_table_extractions_filename)

In [None]:
import jwt
from kgforge.core import Resource
from kgforge.specializations.resources import Dataset

# Read the claims of the user's token without verifying it.
# NOTE(review): the 'verify' keyword is presumably the PyJWT 1.x API; newer PyJWT
# releases expect options={"verify_signature": False} - confirm against the
# installed version.
agent = jwt.decode(TOKEN,  verify=False)

# Turn the claims into a forge resource, keeping only the listed fields, and
# describe it as a Person identified by the token's 'sub' claim.
agent = forge.reshape(forge.from_json(agent), keep=["name","email","sub","preferred_username"])
agent.id = agent.sub
agent.type = "Person"

# Dataset about the current topic, with the three locally saved files (knowledge
# graph, extracted entities, curated entities) attached as distributions and the
# agent recorded as contributor.
dataset = Dataset(forge,name="A dataset", about=topic_resource.name)
dataset.add_distribution(kg_ttl_filename, content_type="application/x-turtle")
dataset.add_distribution(table_extractions_filename, content_type="application/csv")
dataset.add_distribution(curated_table_extractions_filename, content_type="application/csv")
dataset.add_contribution(agent)
dataset.contribution.hadRole= "Scientists"

In [None]:
# Default tag offered in the 'Tag:' text field: "<preferred_username>_<timestamp>".
version = agent.preferred_username+"_"+timestr

def register_dataset(b):
    """Register the dataset with the name and description typed by the user.

    Click handler of the register button. Clears both message areas, copies
    the values of the name (``t1``) and description (``t2``) fields onto
    ``dataset``, registers it through ``forge`` and prints the outcome in
    ``output4``: a confirmation on success, otherwise the message of the
    last action.

    Parameters
    ----------
    b : ipywidgets.Button
        The clicked button (unused).
    """
    output4.clear_output()
    output5.clear_output()
    dataset.name = t1.value
    dataset.description = t2.value
    forge.register(dataset)

    last_action = dataset._last_action
    with output4:
        if last_action.succeeded:
            print("Dataset registered!")
        else:
            print(last_action.message)

def version_dataset(b):
    """Tag the dataset with the value of the tag field and report the outcome.

    Click handler of the tag button. Clears ``output5``, tags ``dataset``
    through ``forge`` with the content of the tag field (``t3``) and prints
    the result in ``output5``: the applied tag on success, otherwise the
    message of the last action, so that a failed tagging is not silent
    (same reporting as ``register_dataset``).

    Parameters
    ----------
    b : ipywidgets.Button
        The clicked button (unused).
    """
    output5.clear_output()
    # Local name distinct from the notebook-level ``version`` default.
    tag = t3.value
    forge.tag(dataset, tag)

    last_action = dataset._last_action
    with output5:
        if last_action.succeeded:
            print(f"Tagged with: {str(tag)}")
        else:
            print(last_action.message)
    
# Message areas: output4 reports on registration, output5 on tagging.
output4, output5 = ipywidgets.Output(), ipywidgets.Output()

# Button that registers the dataset, wired to its handler right away.
b1 = ipywidgets.Button(
    description= '💾  Register Dataset',
    disabled=False,
    button_style='',
    layout=ipywidgets.Layout(width='300px', height='30px'),
)
b1.on_click(register_dataset)

# Button that tags the dataset, wired to its handler right away.
b2 = ipywidgets.Button(
    description= '🔖 Tag Dataset',
    disabled=False,
    button_style='',
    layout=ipywidgets.Layout(width='300px', height='30px'),
)
b2.on_click(version_dataset)

# Input fields: dataset name, dataset description and tag (pre-filled with `version`).
t1 = ipywidgets.Text(
    description='Name:',
    placeholder='Add a name for your dataset',
    disabled=False,
)

t2 = ipywidgets.Textarea(
    description='Description:',
    placeholder='Add a description of your dataset',
    disabled=False,
)

t3 = ipywidgets.Text(
    description='Tag:',
    value=version,
    disabled=False,
)

# Stack everything vertically: registration form first, tagging form below it.
save_widget = ipywidgets.VBox(children=[t1, t2, b1, output4, t3, b2, output5])

display(save_widget)