# Goal of the notebook
(to be completed)

In [None]:
from collections import OrderedDict
import logging
import os
from pathlib import Path
import sqlite3

import IPython
import ipywidgets
import numpy as np
import pandas as pd
import requests
import scispacy
import spacy
import nltk

from bbsearch.article_saver import ArticleSaver
from bbsearch.remote_searcher import RemoteSearcher
from bbsearch.widget import Widget

# Set a Project
The user chooses or creates a project to host a knowledge graph (KG).

# Set topic
The user defines their topic.

# Data Import
The user loads data from a data source (CORD-19).
The loaded data forms the corpus.
The user searches the corpus in Blue Brain Search.

In [None]:
# Runtime configuration: both values are read from environment variables;
# only the data path has a built-in default.
SEARCH_ENGINE_URL = os.environ.get("SEARCH_ENGINE_URL")
BBS_DATA_PATH = Path(os.environ.get("BBS_DATA_PATH", '/raid/bbs_data/'))
CORD19_VERSION = 'v7'

# Locations derived from the data root.
cord_path = BBS_DATA_PATH.joinpath(f'cord19_{CORD19_VERSION}')
db_path = cord_path.joinpath('databases', 'cord19.db')
trained_models_path = BBS_DATA_PATH.joinpath('trained_models')

# Stop here if the database file is missing or the search engine URL is unset.
assert db_path.is_file()
assert SEARCH_ENGINE_URL is not None

In [None]:
# Probe the search server before building anything on top of it: the POST to
# /hello must succeed and return a truthy JSON body.
response = requests.post("{}/hello".format(SEARCH_ENGINE_URL))
assert response.ok and response.json(), "The server is not accessible"

# Client object for the remote search server.
searcher = RemoteSearcher(SEARCH_ENGINE_URL)

# Connection to the local SQLite database; sqlite3.connect is given the path as a string.
database = sqlite3.connect(str(db_path))

In [None]:
# Article saver bound to the database connection; it is shared with the search
# widget and later queried for the chosen texts.
article_saver = ArticleSaver(database=database)

In [None]:
# Build the search widget from the remote searcher, the database connection and
# the article saver, then render it in the notebook.
bbs_widget = Widget(searcher, database, article_saver=article_saver)
bbs_widget.display()

Status of the Article Saver

In [None]:
# Show the summary table produced by the article saver.
table = article_saver.summary_table()
display(table)

# Set schemas
The user defines the KG schema.

# Create a knowledge graph according to schemas
The user extracts data from the text of a set of papers using selected Named Entity Recognizers and Relation Extractors from Blue Brain Search.
The user can preview the extracted data.
The user curates extracted data.
The user links the extracted entities and relations to ontologies.
The user saves data into Knowledge Graph.

- **input**: raw text
- **output**: csv table of extracted entities/relations

In [None]:
# Sample text pre-filled in the "input text" widget.
# The literal is wrapped for readability; newlines are then turned into spaces
# and double spaces are collapsed into single ones (one non-overlapping pass).
DEFAULT_TEXT = """Autophagy maintains tumour growth through circulating
arginine. Autophagy captures intracellular components and delivers them to
lysosomes, where they are degraded and recycled to sustain metabolism and to
enable survival during starvation. Acute, whole-body deletion of the essential 
autophagy gene Atg7 in adult mice causes a systemic metabolic defect that 
manifests as starvation intolerance and gradual loss of white adipose tissue, 
liver glycogen and muscle mass.  Cancer cells also benefit from autophagy. 
Deletion of essential autophagy genes impairs the metabolism, proliferation, 
survival and malignancy of spontaneous tumours in models of autochthonous 
cancer. Acute, systemic deletion of Atg7 or acute, systemic expression of a 
dominant-negative ATG4b in mice induces greater regression of KRAS-driven 
cancers than does tumour-specific autophagy deletion, which suggests that host 
autophagy promotes tumour growth.
""".replace('\n', ' ').replace('  ', ' ')

In [None]:
def get_mining_pipeline(mining_server_url):
    """Build the text-mining callable used by the widgets.

    A remote mining server is preferred: when `mining_server_url` is not None
    and a POST to `<url>/help` returns an OK JSON response whose 'name' is
    'MiningServer', the returned callable forwards every request to that
    server. In every other case (no URL, server unreachable, unexpected
    answer) a local `TextMiningPipeline` is loaded instead.

    Parameters
    ----------
    mining_server_url : str or None
        Base URL of the mining server, or None to force the local pipeline.

    Returns
    -------
    callable
        `textmining_pipeline(text, article_id=None, return_prob=False, debug=False)`.
        The remote variant returns the CSV response parsed into a
        `pandas.DataFrame`, or None when the server did not answer with CSV.
        The local variant returns whatever `TextMiningPipeline` returns.
    """
    import io

    def _is_mining_server(url):
        """Return True when `url` answers the /help probe as a MiningServer."""
        try:
            response = requests.post(url + "/help")
            return response.ok and response.json()['name'] == 'MiningServer'
        except (requests.RequestException, ValueError, KeyError, TypeError):
            # Unreachable server, non-JSON body or unexpected JSON shape:
            # treat all of them as "no mining server" and fall back to local.
            return False

    if mining_server_url is not None and _is_mining_server(mining_server_url):
        print('MiningServer')

        def textmining_pipeline(text, article_id=None, return_prob=False, debug=False):
            request_json = {
                "text": text,
                "article_id": article_id,
                "return_prob": return_prob,
                "debug": debug,
            }
            response = requests.post(mining_server_url, json=request_json)
            # The header may be absent or carry parameters ("text/csv; charset=utf-8").
            content_type = response.headers.get("Content-Type", "").lower()
            if not content_type.startswith("text/csv"):
                print("Response content type is not text/csv.")
                return None
            with io.StringIO(response.text) as f:
                return pd.read_csv(f)

        return textmining_pipeline

    print('Local Implementation')

    from bbsearch.mining import ChemProt, TextMiningPipeline

    # Entities Extractors (EE)
    ee_model = spacy.load("en_ner_craft_md")

    # Relations Extractors (RE)
    PATH_CHEMPROT_TRAINED_MODEL = trained_models_path / 'scibert_chemprot.tar.gz'
    re_models = {('CHEBI', 'GGP'): [ChemProt(PATH_CHEMPROT_TRAINED_MODEL)]}

    # Full Pipeline
    text_mining_pipeline = TextMiningPipeline(ee_model, re_models)

    def textmining_pipeline(text, article_id=None, return_prob=False, debug=False):
        return text_mining_pipeline(text, article_id, return_prob=return_prob, debug=debug)

    return textmining_pipeline

# NOTE(review): hard-coded server address. Setting this to None makes
# get_mining_pipeline skip the server probe and load the local pipeline.
mining_server_url = 'http://dgx1.bbp.epfl.ch:8852'
text_mining = get_mining_pipeline(mining_server_url)

In [None]:
# Output of the mining step: table of extracted entities/relations.
# Starts as None; the widget callbacks assign it, and a later cell loads an
# example table when it is still None.
table_extractions = None

In [None]:
# Define Widgets
# Insertion order matters: the widgets are later stacked in a VBox in the
# order in which they were added to this mapping.
bbs_widgets = OrderedDict()

In [None]:
bbs_widgets['articles_button'] = ipywidgets.Button(
    description='Mine Selected Articles!',
    layout=ipywidgets.Layout(width='60%')
)


def article_button(b):
    """Mine every chosen text and show the combined extraction table.

    Click handler of the 'articles_button' widget. It clears the output area,
    asks the article saver to retrieve the texts, runs `text_mining` on each
    row of `article_saver.df_chosen_texts`, and stores the concatenated result
    in the global `table_extractions` before displaying it.

    Parameters
    ----------
    b
        The clicked button (unused).
    """
    global table_extractions
    bbs_widgets['out'].clear_output()
    with bbs_widgets['out']:
        article_saver.retrieve_text()
        columns = ['article_id', 'section_name', 'paragraph_id', 'text']
        extractions = []
        for article_id, section_name, paragraph_id, text in article_saver.df_chosen_texts[columns].values:
            text_identifier = f'{article_id}:"{section_name}":{paragraph_id}'
            extracted = text_mining(text, article_id=text_identifier, return_prob=True)
            if extracted is not None:
                extractions.append(extracted)
        # Concatenate once at the end: DataFrame.append was removed in pandas 2.0
        # and copied the whole table on every iteration. pd.concat refuses an
        # empty list, hence the explicit empty-table fallback.
        if extractions:
            table_extractions = pd.concat(extractions, ignore_index=True)
        else:
            table_extractions = pd.DataFrame()
        display(table_extractions)


bbs_widgets['articles_button'].on_click(article_button)

In [None]:
# "Input Text" Widget
bbs_widgets['input_text'] = ipywidgets.Textarea(
        value=DEFAULT_TEXT,
        layout=ipywidgets.Layout(width='75%', height='300px')
    )

# "Submit!" Button
bbs_widgets['submit_button'] = ipywidgets.Button(
    description='Mine This Text!',
    layout=ipywidgets.Layout(width='30%')
)
def cb(b):
    """Mine the content of the text area and show the extraction table.

    Click handler of the 'submit_button' widget. The result of `text_mining`
    replaces the global `table_extractions` and is displayed in the output
    area. The argument `b` (the clicked button) is unused.
    """
    global table_extractions
    bbs_widgets['out'].clear_output()
    with bbs_widgets['out']:
        text = bbs_widgets['input_text'].value
        table_extractions = text_mining(text, return_prob=True)
        display(table_extractions)
bbs_widgets['submit_button'].on_click(cb)

# "Output Area" Widget
# Registered after the callbacks are defined; they only look up
# bbs_widgets['out'] when a button is clicked, so this ordering works.
bbs_widgets['out'] = ipywidgets.Output(layout={'border': '0.5px solid black'})

# Finalize Widgets
# Stack all registered widgets vertically, in insertion order, and render them.
ordered_widgets = list(bbs_widgets.values())
main_widget = ipywidgets.VBox(ordered_widgets)
IPython.display.display(main_widget)

- **input**: csv table of extracted entities/relations
- **output**: knowledge graph

In [None]:
import pandas as pd

# Fall back to an example table when `table_extractions` is undefined or still None.
if globals().get('table_extractions') is None:
    # IPython shell escape: saves the example CSV as extractions_example.csv.
    ! wget -O extractions_example.csv 'https://drive.google.com/uc?export=download&id=11BBtkKsamru4kjUNev8lO_ulNMf7m3Ta'
    # The example file is parsed with ';' as the field separator.
    table_extractions = pd.read_csv('extractions_example.csv', sep=';')

In [None]:
# Report the number of rows in the extraction table.
print(f'The table has {table_extractions.shape[0]} rows.')

In [None]:
from typing import Iterator, Dict
import pandas as pd

def represent_as_annotations(df: pd.DataFrame) -> Iterator[Dict]:
    """Lazily turn each row of `df` into a JSON-LD annotation dictionary.

    Every row is read through `DataFrame.itertuples`, so `df` must expose the
    columns `paper_id`, `start`, `end` and `entity`. The row index becomes the
    last segment of the annotation '@id'.
    """
    def to_annotation(row):
        identifier = f'https://bbp.epfl.ch/covid19/{row.Index}'
        selector = {
            '@type': 'TextPositionSelector',
            'start': row.start,
            'end': row.end,
            'value': row.entity,
        }
        target = {'source': row.paper_id, 'selector': selector}
        return {
            '@context': 'http://www.w3.org/ns/anno.jsonld',
            '@id': identifier,
            '@type': 'Annotation',
            'target': target,
        }

    return (to_annotation(row) for row in df.itertuples())

In [None]:
# Materialize the lazily generated annotations into a list so that they can
# be traversed more than once.
annotations = list(represent_as_annotations(table_extractions))

In [None]:
import pickle

# Note: The file is around 26 MB.
# IPython shell escape: saves the pickled terms as entity_linking-terms.pkl.
! wget -O entity_linking-terms.pkl 'https://drive.google.com/uc?export=download&id=1DyA9WL1YpEBO37KkDSCY3f1LWFfscO7F'

# NOTE(security): pickle.load can execute arbitrary code while loading; only
# run this cell if the download source is trusted.
with open('entity_linking-terms.pkl', 'rb') as f:
    ontology = pickle.load(f)

# NOTE(review): presumably the ontologies the pickled terms come from; here the
# list is only used in the printed summary.
ontologies = ['ChEBI', 'SO']

In [None]:
# Print the number of loaded terms followed by one bullet per ontology name.
print(f'The entity linking considers {len(ontology)} terms from:', *ontologies, sep='\n - ')

In [None]:
import faiss
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

class Candidate:
    """An ontology term proposed for a mention, together with its score.

    Instances are created from an (identifier, term, definition) triple plus a
    score; all four values are stored as plain attributes.
    """

    def __init__(self, identifier, term, definition, score):
        # The assignment order is significant: __repr__ lists the attributes
        # in the order in which they were set.
        self.score = score
        self.identifier = identifier
        self.term = term
        self.definition = definition

    def __repr__(self):
        rendered = [f"{name}={value!r}" for name, value in vars(self).items()]
        return "Candidate(" + ", ".join(rendered) + ")"

class EntityLinker:
    """Link textual mentions to ontology terms.

    Candidate terms are retrieved by nearest-neighbour search (faiss, L2
    distance) over character 3-gram TF-IDF vectors of the ontology terms.
    One candidate is then selected by comparing the mention with the
    candidates' definitions through `nlp(...).similarity(...)`.

    `train` must be called before `candidates` or `link` is used with a
    non-empty list of mentions.
    """

    def __init__(self, ontology, nlp):
        """Store the ontology and the NLP callable; nothing is trained yet.

        Parameters
        ----------
        ontology : dict
            Maps an identifier to a sequence whose first two items are used
            as (term, definition).
        nlp : callable
            Called on a string; the result must provide `similarity(other)`.
        """
        self.terms = None
        self.model = None
        self.index = None
        self.ontology = [(k, v[0], v[1]) for k, v in ontology.items()]
        self.nlp = nlp

    def link(self, mentions):
        """Return, for each mention, the result of `disambiguate` on its candidates."""
        selections = self.candidates(mentions)
        return [self.disambiguate(mention, candidates)
                for mention, candidates in zip(mentions, selections)]

    def disambiguate(self, mention, candidates, threshold=0.6):
        """Pick the candidate whose definition is most similar to the mention.

        Candidates with a score of 0 take precedence and bypass the
        threshold. Otherwise the best candidate is kept only when its
        similarity is at least `threshold`.

        Returns
        -------
        tuple or None
            `(similarity, candidate)`, or None when no candidate qualifies or
            none of the considered candidates has a definition.
        """
        def best(cands):
            # FIXME Used as a proxy until this is done during the other NLP operations.
            with_definition = [x for x in cands if x.definition]
            if not with_definition:
                # Nothing to compare against: report "no link" instead of
                # failing on an empty ranking.
                return None
            doc = self.nlp(mention)
            similarities = [(doc.similarity(self.nlp(x.definition.replace(mention, ''))), x)
                            for x in with_definition]
            # max() keeps the first of equally similar candidates.
            return max(similarities, key=lambda pair: pair[0])

        zeros = [x for x in candidates if x.score == 0]
        if zeros:
            return best(zeros)
        chosen = best(candidates)
        if chosen is not None and chosen[0] >= threshold:
            return chosen
        return None

    def candidates(self, mentions, limit=3):
        """Return up to `limit` nearest `Candidate` objects for every mention.

        The candidate score is the distance reported by the faiss index.
        """
        if len(mentions) == 0:
            return []
        embeddings = self.model.transform(mentions)
        distances, indexes = self.index.search(embeddings.toarray(), limit)
        # faiss pads missing neighbours with the label -1 (fewer than `limit`
        # indexed terms); skip them rather than mapping them to the last term.
        return [[Candidate(*self.terms[i], d)
                 for i, d in zip(row_indexes, row_distances) if i >= 0]
                for row_indexes, row_distances in zip(indexes, distances)]

    def train(self):
        """Fit the TF-IDF model on the ontology terms and build the faiss index."""
        # The vocabulary is capped at 10% of the number of terms, but never
        # below 1: TfidfVectorizer rejects max_features=0.
        self.model = TfidfVectorizer(analyzer='char', ngram_range=(3, 3), max_df=0.95,
                                     max_features=max(1, int(len(self.ontology)*0.1)),
                                     dtype=np.float32, norm='l2')
        terms = [x for _, x, _ in self.ontology]
        embeddings = self.model.fit_transform(terms)
        # Terms whose vector is all zeros are left out of the index.
        flags = np.array(embeddings.sum(axis=1) != 0).reshape(-1)
        filtered_embeddings = embeddings[flags]
        self.terms = [term for term, flag in zip(self.ontology, flags) if flag]
        self.index = faiss.IndexFlatL2(filtered_embeddings.shape[1])
        self.index.add(filtered_embeddings.toarray())

In [None]:
# Reuse an already loaded spaCy model when a global `ee_model` exists;
# otherwise load the model here.
# NOTE(review): the only `ee_model` assignment in this notebook is a local
# variable inside get_mining_pipeline, so unless it is defined elsewhere the
# NameError branch is the one that runs.
try:
    nlp = ee_model
except NameError:
    import spacy
    nlp = spacy.load('en_ner_craft_md')

In [None]:
%%time
# Create the linker, then fit its TF-IDF model and faiss index on the ontology terms.
linker = EntityLinker(ontology, nlp)
linker.train()
# Note: Takes around 45 secs on a BBP issued MacBook Pro.

In [None]:
from typing import Iterable, Dict, Iterator
from copy import deepcopy

def enrich_annotations(annotations: Iterable[Dict], linker: "EntityLinker") -> Iterator[Dict]:
    """Attach the linked ontology term to each annotation.

    The mention of every annotation (`target.selector.value`) is passed to
    `linker.link`. When a link is found, a deep copy of the annotation gets a
    'body' holding the identifier and label of the linked term; otherwise the
    copy is returned unchanged. The input annotations are never modified.

    Parameters
    ----------
    annotations : iterable of dict
        Annotations with a 'target' -> 'selector' -> 'value' entry. Any
        iterable is accepted, including one-shot iterators.
    linker : EntityLinker
        Object whose `link(mentions)` returns, per mention, either a falsy
        value or a pair whose second item has `identifier` and `term`.

    Returns
    -------
    iterator of dict
        Lazily produced enriched copies, in input order.
    """
    # Materialize once: the annotations are traversed twice below, which
    # would silently yield nothing for a one-shot iterator.
    annotations = list(annotations)
    mentions = [ann['target']['selector']['value'] for ann in annotations]
    linked_mentions = linker.link(mentions)

    def _enriched(ann, can):
        new = deepcopy(ann)
        if can:
            new['body'] = {
                '@id': can[1].identifier,
                'label': can[1].term,
            }
        return new

    return (_enriched(ann, can) for ann, can in zip(annotations, linked_mentions))

In [None]:
%%time
# Link every annotation to an ontology term and materialize the result as a list.
enriched_annotations = list(enrich_annotations(annotations, linker))
# Note: Takes around 20 secs on a BBP issued MacBook Pro.

In [None]:
import json
from typing import Iterable, Dict
from rdflib import Graph

def load_knowledge_graph(jsonlds: Iterable[Dict]) -> Graph:
    """Parse every JSON-LD document into a single rdflib Graph and return it.

    Each dictionary is serialized to a JSON string and handed to
    `Graph.parse` with the 'json-ld' format, so all documents accumulate in
    the same graph.
    """
    graph = Graph()
    for document in jsonlds:
        serialized = json.dumps(document)
        graph.parse(data=serialized, format='json-ld')
    return graph

In [None]:
%%time
# Load all enriched annotations into one graph.
knowledge_graph = load_knowledge_graph(enriched_annotations)
# Note: Takes around 8 secs on a BBP issued MacBook Pro.

In [None]:
# len() of the graph is reported as its number of triples.
print(f'The knowledge graph has {len(knowledge_graph)} triples.')

# Validate the knowledge graph
The user reviews the content of the knowledge graph.

# Correct knowledge graph
The user corrects the knowledge graph if errors occur.

# Access the knowledge graph
The user can search, visualize, and export the knowledge graph.

# Version the knowledge graph
The user can save a knowledge graph with a version.