In [None]:
from abc import ABC, abstractmethod
from collections import OrderedDict
import itertools
import pathlib
import sys
sys.path.append('/raid/users/krepl/BlueBrainSearch/submodules/scibert')

from allennlp.predictors import Predictor
import ipywidgets
import IPython
import pandas as pd
import scispacy
from scibert.models.text_classifier import TextClassifier
import spacy

In [None]:
def preprocess_re(doc, sent, ent_1, ent_2, etype_symbols):
    """Annotate one entity pair inside a sentence for a relation extraction model.

    The sentence is rebuilt token by token. The two entities are wrapped in
    the symbols registered for their entity type, every other token is copied
    verbatim.

    Parameters
    ----------
    doc : spacy.tokens.Doc
        The entire document (input text). Token indices of `sent`, `ent_1`
        and `ent_2` are absolute positions inside this document.

    sent : spacy.tokens.Span
        The sentence of `doc` in which the relation is looked for.

    ent_1 : spacy.tokens.Span
        The first entity. Its type is read from the `label_` attribute.

    ent_2 : spacy.tokens.Span
        The second entity. Its type is read from the `label_` attribute.

    etype_symbols : dict
        Maps an entity type (e.g. "GGP", "CHEBI") to a tuple of size 2 holding
        the opening and the closing symbol the entity gets wrapped with.

    Returns
    -------
    result : str
        The annotated sentence.

    Raises
    ------
    ValueError
        If the type of `ent_1` or of `ent_2` has no entry in `etype_symbols`.
    """
    label_1, label_2 = ent_1.label_, ent_2.label_

    if label_1 not in etype_symbols or label_2 not in etype_symbols:
        raise ValueError('Please specify the special symbols for both of the entity types.')

    pieces = []
    position = sent.start
    while position < sent.end:
        # `ent_1` is checked first, so it wins if both entities start on the same token.
        for entity, label in ((ent_1, label_1), (ent_2, label_2)):
            if entity.start == position:
                opening, closing = etype_symbols[label][0], etype_symbols[label][1]
                pieces.append(' ' + opening + doc[entity.start:entity.end].text + closing)
                # Jump over all tokens belonging to the entity.
                position += entity.end - entity.start
                break
        else:
            token = doc[position]
            # Punctuation is glued to the previous piece, anything else is space separated.
            pieces.append(token.text if token.is_punct else ' ' + token.text)
            position += 1

    # Every non-punctuation piece carries a leading space; drop the very first one.
    return ''.join(pieces).strip()

In [None]:
# Helpers
def preprocess_re_OLD(doc, sent, etype_1, etype_2, etype_symbols):
    """Preprocess sentence for the SciBERT relation extraction model.

    Given one sentence, one extracts all possible pairs of etype_1 and etype_2
    entities and annotates the original sentence once per pair.

    NOTE(review): the `_OLD` suffix suggests this was superseded by
    `preprocess_re`, which annotates a single, explicitly given pair - confirm
    before removing.

    Parameters
    ----------
    doc : spacy.tokens.Doc
        The entire document (input text). Use for absolute referencing.

    sent : spacy.tokens.Span
        One sentence from the `doc` where we look for relations.

    etype_1 : str
        Name of the subject entity type - i.e. "GGP". See Spacy and SciSpacy pretrained models.

    etype_2 : str
        Name of the object entity type - i.e "CHEBI". See Spacy and SciSpacy pretrained models.

    etype_symbols: dict
        Keys represent different entity types ("GGP", "CHEBI") and the values are tuples of size 2.
        Each of these tuples represents the starting, ending symbol to wrap the recognized entity with.

    Returns
    -------
    result : list
        List of strings representing all annotated sentences created out of the original one. Might
        be empty. The sentences are ordered by the position of the `etype_1` entity first and of the
        `etype_2` entity second.

    Raises
    ------
    ValueError
        If one of the entity types has no entry in `etype_symbols` or if both
        entity types are identical.
    """
    if not (etype_1 in etype_symbols and etype_2 in etype_symbols):
        raise ValueError('Please specify the special symbols for both of the entity types.')

    if etype_1 == etype_2:
        raise ValueError('The two entity types need to be different')

    def _entities_of_type(etype):
        # The set removes duplicates. Set iteration order depends on hashing, so
        # sort by token position to return the sentences in a reproducible order.
        unique = {ent for ent in sent.ents if ent.label_ == etype}
        return sorted(unique, key=lambda ent: (ent.start, ent.end))

    entities_1 = _entities_of_type(etype_1)
    entities_2 = _entities_of_type(etype_2)

    result = []
    for first, second in itertools.product(entities_1, entities_2):
        # add sanity check no overlap (should be impossible)

        tokens = []
        i = sent.start

        while i < sent.end:
            new_token = ' '  # hack to keep the punctuation nice

            if first.start == i:
                start, end = first.start, first.end
                new_token += etype_symbols[etype_1][0] + doc[start:end].text + etype_symbols[etype_1][1]

            elif second.start == i:
                start, end = second.start, second.end
                new_token += etype_symbols[etype_2][0] + doc[start:end].text + etype_symbols[etype_2][1]

            else:
                start, end = i, i + 1
                # Punctuation is attached to the previous token without a space.
                new_token = doc[i].text if doc[i].is_punct else new_token + doc[i].text

            tokens.append(new_token)
            # Skip all tokens consumed by an entity (or the single plain token).
            i += end - start

        result.append(''.join(tokens).strip())

    return result

In [None]:
class REModel(ABC):
    """Base class defining what a relation extraction (RE) model has to offer.

    A concrete model needs to implement `predict` and `symbols`; the class
    cannot be instantiated otherwise. The interface is inspired by SciBERT.
    """

    @abstractmethod
    def predict(self, preprocessed_sentence):
        """Predict the relation expressed in an annotated sentence.

        Parameters
        ----------
        preprocessed_sentence : str
            Sentence in which the entities of interest are already wrapped
            in the annotation symbols of the model (see `symbols`).

        Returns
        -------
        relation : str
            Name of the predicted relation type.
        """

    @property
    @abstractmethod
    def symbols(self):
        """Annotation symbols per entity type.

        Returns
        -------
        symbols : dict
            Dictionary of the form
            {'ENTITY_TYPE': ('SYMBOL_LEFT', 'SYMBOL_RIGHT')}.
        """
    

class ChemProt(REModel):
    """Pretrained model extracting 13 relations between chemicals and proteins.

    Parameters
    ----------
    model_path : str or pathlib.Path
        Location of the model archive, handed over to
        `allennlp.predictors.Predictor.from_path`.

    Attributes
    ----------
    model_ : allennlp.predictors.Predictor
        The loaded 'text_classifier' predictor.

    labels : list of str
        Names of the 13 relation types. The i-th name is paired with the i-th
        class probability returned by the predictor.
        NOTE(review): presumably this order mirrors the label vocabulary of
        the trained model - confirm when swapping the archive.
    """

    def __init__(self, model_path):
        self.model_ = Predictor.from_path(model_path, predictor_name='text_classifier')
        self.labels = [
            'INHIBITOR',
            'SUBSTRATE',
            'INDIRECT-DOWNREGULATOR',
            'INDIRECT-UPREGULATOR',
            'ACTIVATOR',
            'ANTAGONIST',
            'PRODUCT-OF',
            'AGONIST',
            'DOWNREGULATOR',
            'UPREGULATOR',
            'AGONIST-ACTIVATOR',
            'SUBSTRATE_PRODUCT-OF',
            'AGONIST-INHIBITOR',
        ]

    @property
    def symbols(self):
        """Symbols used to mark proteins ('GGP') and chemicals ('CHEBI')."""
        ggp_symbols = ('[[ ', ' ]]')
        chebi_symbols = ('<< ', ' >>')
        return {'GGP': ggp_symbols, 'CHEBI': chebi_symbols}

    def predict(self, processed_sentence):
        """Return the name of the most probable relation.

        Parameters
        ----------
        processed_sentence : str
            Sentence with the entities wrapped in the symbols from `symbols`.

        Returns
        -------
        relation : str
            The entry of `labels` with the highest class probability.
        """
        model_output = self.model_.predict(sentence=processed_sentence)
        class_probs = pd.Series(model_output['class_probs'], index=self.labels)

        return class_probs.idxmax()
    
        
    

# BlueBrainSearch to BlueBrainGraph: POC

This notebook shows how, starting from raw text, we can apply the BlueBrainSearch tools and then the BlueBrainGraph tools to first generate a list of extracted objects of interest and then build a knowledge graph from it.

It is intended to be just a proof of concept of the pipeline.

## BlueBrainSearch

This first part of the pipeline starts with the raw text of a scientific paper as an input, and generates a CSV table out of it. The table contains all the extracted entities and relations that were identified in the text.
- **input**: raw text
- **output**: CSV table of extracted entities/relations

In [None]:
# Paths
# NOTE(review): absolute, machine specific location - adapt when running elsewhere.
chemprot_path = pathlib.Path('/raid/users/krepl/BlueBrainSearch/submodules/scibert/training_results/first_simple/model.tar.gz')

# Models
# Named entity recognition pipeline applied to the raw input text.
NER_PIPELINE = spacy.load("en_ner_craft_md")
# Relation extraction models, keyed by (subject entity type, object entity type).
RE_MODELS = {
    ('CHEBI', 'GGP'): ChemProt(chemprot_path),
}

# Table headers (column order of the resulting table)
HEADERS = [
    'entity',
    'entity_type',
    'property',
    'property_value',
    'property_type',
    'property_value_type',
    'ontology_source',
    'paper_id',
    'start_pos',
    'end_pos',
]

In [None]:

# Widgets of the BlueBrainSearch demo. The insertion order matters: the widgets
# are later laid out in the order they were added to this dictionary.
bbs_widgets = OrderedDict()

# Free text input, pre-filled with an example abstract (PMID:30429607).
# The text is written as two adjacent string literals joined by an explicit
# "\n": a single double-quoted literal must not span several source lines.
bbs_widgets['input_text'] = ipywidgets.Textarea(
    value=(
        "Autophagy maintains tumour growth through circulating arginine. Autophagy captures intracellular components and delivers them to lysosomes, where they are degraded and recycled to sustain metabolism and to enable survival during starvation1-5. Acute, whole-body deletion of the essential autophagy gene Atg7 in adult mice causes a systemic metabolic defect that manifests as starvation intolerance and gradual loss of white adipose tissue, liver glycogen and muscle mass1. Cancer cells also benefit from autophagy. Deletion of essential autophagy genes impairs the metabolism, proliferation, survival and malignancy of spontaneous tumours in models of autochthonous cancer6,7. Acute, systemic deletion of Atg7 or acute, systemic expression of a dominant-negative ATG4b in mice induces greater regression of KRAS-driven cancers than does tumour-specific autophagy deletion, which suggests that host autophagy promotes tumour growth1,8. Here we show that host-specific deletion of Atg7 impairs the growth of multiple allografted tumours, although not all tumour lines were sensitive to host autophagy status. Loss of autophagy in the host was associated with a reduction in circulating arginine, and the sensitive tumour cell lines were arginine auxotrophs owing to the lack of expression of the enzyme argininosuccinate synthase 1. Serum proteomic analysis identified the arginine-degrading enzyme arginase I (ARG1) in the circulation of Atg7-deficient hosts, and in vivo arginine metabolic tracing demonstrated that serum arginine was degraded to ornithine. ARG1 is predominantly expressed in the liver and can be released from hepatocytes into the circulation. Liver-specific deletion of Atg7 produced circulating ARG1, and reduced both serum arginine and tumour growth. \n"
        "Deletion of Atg5 in the host similarly regulated [corrected] circulating arginine and suppressed tumorigenesis, which demonstrates that this phenotype is specific to autophagy function rather than to deletion of Atg7. Dietary supplementation of Atg7-deficient hosts with arginine partially restored levels of circulating arginine and tumour growth. Thus, defective autophagy in the host leads to the release of ARG1 from the liver and the degradation of circulating arginine, which is essential for tumour growth; this identifies a metabolic vulnerability of cancer. (PMID:30429607)"
    ),
    layout=ipywidgets.Layout(width='80%', height='400px')
)

# Button triggering the extraction.
bbs_widgets['submit_button'] = ipywidgets.Button(
    description='Extract Entities & Properties!',
    layout=ipywidgets.Layout(width='30%')
)

# Output area in which the resulting table gets rendered.
bbs_widgets['out'] = ipywidgets.Output(layout={'border': '0.5px solid black'})

In [None]:
def extract_entities_and_relations(b):
    """Extract entities and relations from the input text and display them.

    The text of the `input_text` widget is run through `NER_PIPELINE`. For
    every sentence one row per detected entity is created, plus one row per
    ordered entity pair whose (subject type, object type) has a model in
    `RE_MODELS`. The rows are shown as a table with the columns `HEADERS`
    inside the `out` widget, whose previous content is cleared first.

    Parameters
    ----------
    b : object
        Argument handed over by the button's `on_click` mechanism. Not used.
    """
    output_area = bbs_widgets['out']
    output_area.clear_output()

    with output_area:
        doc = NER_PIPELINE(bbs_widgets['input_text'].value)

        rows = []
        for sent in doc.sents:
            sentence_entities = list(sent.ents)

            for subject in sentence_entities:
                # Row describing the entity itself.
                entity_row = {
                    'entity': subject.text,
                    'entity_type': subject.label_,
                    'start_pos': subject.start_char,
                    'end_pos': subject.end_char,
                }
                rows.append(entity_row)

                # Rows describing the relations of the entity (as subject).
                for obj in sentence_entities:
                    type_pair = (subject.label_, obj.label_)
                    if type_pair not in RE_MODELS:
                        continue

                    re_model = RE_MODELS[type_pair]
                    annotated_sent = preprocess_re(doc, sent, subject, obj, re_model.symbols)
                    relation = re_model.predict(annotated_sent)

                    # Same entity fields as above, extended by the relation fields.
                    rows.append(dict(entity_row,
                                     property_type='relation',
                                     property=relation,
                                     property_value=obj.text,
                                     property_value_type=obj.label_))

        # Columns without a value in a row (e.g. 'paper_id') are left empty.
        table = pd.DataFrame(rows, columns=HEADERS)
        display(table)

# Run the extraction whenever the submit button is clicked.
bbs_widgets['submit_button'].on_click(extract_entities_and_relations)

# Stack the widgets vertically, in the order they were added, and show them.
ordered_widgets = [*bbs_widgets.values()]
main_widget = ipywidgets.VBox(children=ordered_widgets)
IPython.display.display(main_widget)

## BlueBrainGraph

This second part of the pipeline starts with the extracted entities and relations in a CSV table, and generates a knowledge graph out of it.

- **input**: CSV table of extracted entities/relations
- **output**: knowledge graph