# Experiment with text generation for BMO

In [2]:
import getpass
import json
import random
import rdflib
import re
import requests
import wikipedia
import urllib

from bs4 import BeautifulSoup
from collections import Counter, defaultdict

import numpy as np
import networkx as nx

import spacy

from sklearn.model_selection import train_test_split

from bmo_tools.ontologies import subontology_from_term

from bluegraph import PandasPGFrame
from bluegraph.backends.networkx import pgframe_to_networkx

from kgforge.core.forge import KnowledgeGraphForge
from kgforge.core.resource import Resource
from kgforge.specializations.resources import Dataset

In [4]:
# Prompt for the access token without echoing it; it is passed as `token`
# to KnowledgeGraphForge when the forge session is created.
TOKEN = getpass.getpass()

········


In [8]:
# Base URL handed to KnowledgeGraphForge as `endpoint`.
ENDPOINT = "https://bbp.epfl.ch/nexus/v1"

In [9]:
# Create the forge session from the hosted prod-forge-nexus.yml configuration,
# using the endpoint and token defined above and the "bbp/kg-to-text" bucket.
forge = KnowledgeGraphForge(
    "https://raw.githubusercontent.com/BlueBrain/nexus-forge/master/examples/notebooks/use-cases/prod-forge-nexus.yml",
    endpoint=ENDPOINT,
    token=TOKEN,
    bucket="bbp/kg-to-text")

In [10]:
def is_nan(el):
    """Return True if `el` is NaN according to numpy, False otherwise.

    Any value that `np.isnan` cannot handle (strings, lists of strings,
    None, ...) is treated as "not NaN" rather than raising.
    """
    try:
        return bool(np.isnan(el))
    except Exception:
        # np.isnan rejects non-numeric input, and multi-element results
        # have no single truth value; both cases mean "not NaN" here.
        return False

def search_wiki(query):
    """Search English Wikipedia (opensearch API) and return the top hit.

    Returns a `(title, url)` tuple for the first of at most five results,
    or None when the search yields no titles.
    """
    response = requests.get(
        "https://en.wikipedia.org/w/api.php",
        params={
            "action": "opensearch",
            "namespace": "0",
            "limit": "5",
            "format": "json",
            "search": query,
        },
    )
    # The response is a 4-element list; only the titles (2nd element)
    # and the URLs (4th element) are used.
    _, titles, _, uris = response.json()
    if not titles:
        return None
    return titles[0], uris[0]
    
def query_dbpedia(wiki_url):
    """Fetch the DBpedia JSON document for the page behind `wiki_url`.

    The last path segment of `wiki_url` is used as the DBpedia resource name.
    Returns `(data, incoming_links)`: `data` is the entry of the resource
    itself, `incoming_links` lists every other top-level key of the document.
    Raises KeyError if the document has no entry for the resource.
    """
    suffix = wiki_url.split("/")[-1]
    resource_uri = f"http://dbpedia.org/resource/{suffix}"

    document = requests.get(f"http://dbpedia.org/data/{suffix}.json").json()
    incoming_links = [key for key in document if key != resource_uri]

    return document[resource_uri], incoming_links


def search_dbpedia(query):
    """Query the DBpedia lookup service and return simplified records.

    Each record is a dict with the keys "url" (first resource URI),
    "redirecredLabels" (set of redirect labels), "label" (list of labels)
    and "type" (the raw "type" field or None). The `<B>`/`</B>` highlight
    tags are stripped from all labels.
    """
    def _strip_bold(values):
        # Remove the <B>...</B> markers found in returned labels.
        return [v.replace("<B>", "").replace("</B>", "") for v in values]

    response = requests.get(
        "https://lookup.dbpedia.org/api/search",
        params={"format": "json", "query": query},
    )
    return [
        {
            "url": doc['resource'][0],
            # Key spelling is kept as-is: it is part of the returned schema.
            "redirecredLabels": set(_strip_bold(doc.get('redirectlabel', []))),
            "label": _strip_bold(doc.get('label', [])),
            "type": doc.get("type", None),
        }
        for doc in response.json()["docs"]
    ]


def dbpedia_uri_to_wiki(uri):
    """Convert a DBpedia resource URI to the English Wikipedia page URL.

    Only the last path segment of `uri` is used.
    """
    suffix = uri.split("/")[-1]
    return f"https://en.wikipedia.org/wiki/{suffix}"


def wikipedia_uri_to_db(uri):
    """Convert a Wikipedia page URL to the matching DBpedia resource URI.

    Only the last path segment of `uri` is used.
    """
    # Use the `uri` argument; the previous body read an undefined `wiki_url`.
    suffix = uri.split("/")[-1]
    return f"http://dbpedia.org/resource/{suffix}"


def extract_wiki_abstract(wiki_url):
    """Return the English DBpedia abstract for the page behind `wiki_url`.

    The last path segment of `wiki_url` is used as the DBpedia resource name.
    Returns None when the resource has no English abstract.
    """
    suffix = wiki_url.split("/")[-1]

    response = requests.get(f"http://dbpedia.org/data/{suffix}.json")
    resource = response.json()[f"http://dbpedia.org/resource/{suffix}"]

    abstracts = resource.get("http://dbpedia.org/ontology/abstract", [])
    english = [entry["value"] for entry in abstracts if entry["lang"] == "en"]
    return english[0] if english else None


def extract_full_wiki_text(wiki_url):
    """Adapted from https://levelup.gitconnected.com/two-simple-ways-to-scrape-text-from-wikipedia-in-python-9ce07426579b.

    Downloads the page and returns the text of all `<p>` elements joined by
    single spaces, with bracketed fragments and newlines removed.
    """
    page = requests.get(wiki_url)
    soup = BeautifulSoup(page.text, 'lxml')

    # Join the plain text of every paragraph element.
    joined = ' '.join(str(p.text) for p in soup.find_all('p'))

    # Drop bracketed fragments such as footnote markers ("[1]").
    without_refs = re.sub(r"\[.*?\]+", '', joined)

    # Remove newlines, then cut the final 11 characters.
    # NOTE(review): the reason for trimming exactly 11 characters is not
    # visible here (inherited from the adapted snippet) - confirm.
    return without_refs.replace('\n', '')[:-11]


def annotate_text(nlp, text):
    """Annotate `text` with the DBpedia Spotlight web service.

    `nlp` is an object exposing `make_doc(text)`; the string form of the
    resulting document is sent as the `text` query parameter.

    Returns the decoded JSON response, or None when the response body is
    not valid JSON.
    """
    # Import the submodule explicitly: a bare `import urllib` does not
    # guarantee that `urllib.parse` is loaded.
    from urllib.parse import quote, urlencode

    url = "https://api.dbpedia-spotlight.org/en/annotate"
    # quote_via=quote encodes spaces as %20 instead of '+'.
    query = urlencode({"text": nlp.make_doc(text)}, quote_via=quote)
    result = requests.get(url, params=query, headers={"accept": "application/json"})
    try:
        return result.json()
    except ValueError:
        # JSON decoding errors are ValueError subclasses; anything else
        # (e.g. KeyboardInterrupt) must propagate instead of being swallowed.
        return None


def text_to_batches(text, max_batch_length):
    """Yield consecutive chunks of `text`, each at most `max_batch_length` long.

    Chunks are cut at the last space that fits in the current window, and
    that space is dropped. When a window contains no space at all, the chunk
    is cut hard at `max_batch_length` characters so that no text is lost or
    repeated. A text that already fits is yielded unchanged as one chunk.
    Chunks may be empty (e.g. around consecutive or trailing spaces).

    Raises:
        ValueError: if `max_batch_length` is smaller than 1.
    """
    if max_batch_length < 1:
        raise ValueError("max_batch_length must be a positive integer")

    text_length = len(text)
    if text_length <= max_batch_length:
        yield text
        return

    start = 0
    while start + max_batch_length <= text_length:
        # Look one character past the window so that a space directly
        # after a full-length chunk still counts as a cut point.
        end = text.rfind(" ", start, start + max_batch_length + 1)
        if end == -1:
            # No space available: hard split, nothing to skip.
            end = start + max_batch_length
            yield text[start:end]
            start = end
        else:
            yield text[start:end]
            start = end + 1  # skip the space used as separator
    yield text[start:]
        

def wiki_url_from_wikidata(wikidata_id, lang):
    """Resolve a Wikidata entity id to its Wikipedia page URL for `lang`.

    Queries the Wikidata `wbgetentities` API for the `<lang>wiki` sitelink.

    Returns the page URL on `<lang>.wikipedia.org`, or None when the entity
    is unknown or has no sitelink for that language.
    """
    site = f"{lang}wiki"
    result = requests.get(
        "https://www.wikidata.org/w/api.php",
        params={
            "action": "wbgetentities",
            "format": "json",
            "props": "sitelinks",
            "ids": wikidata_id,
            "sitefilter": site
        })
    # Every level may be absent (error payload, unknown id, no sitelinks),
    # so fall back to empty mappings instead of raising.
    entities = result.json().get("entities") or {}
    entity = entities.get(wikidata_id) or {}
    record = (entity.get("sitelinks") or {}).get(site)
    if not record:
        return None
    title = record["title"].replace(' ', '_')
    # Build the host from `lang`; it used to be hard-coded to "en".
    return f"https://{lang}.wikipedia.org/wiki/{title}"


def get_id_by_label(graph, label):
    """Return the first subject of `graph` whose rdfs:label equals `label`.

    The label is tried, in order, as an English-tagged literal, as an
    xsd:string literal and as a plain literal. Returns None when no subject
    matches any of the three forms.
    """
    candidates = (
        rdflib.Literal(label, lang="en"),
        rdflib.Literal(label, datatype=rdflib.XSD.string),
        rdflib.Literal(label),
    )
    for literal in candidates:
        uri = next(iter(graph.subjects(rdflib.RDFS.label, literal)), None)
        if uri is not None:
            return uri
    # The previous code returned the loop variable, which is unbound
    # when nothing matches.
    return None


def add_annotation(record, label, text, local_start, local_end):
    """Append `text` to `record["text"]` and register a span for `label`.

    `local_start`/`local_end` are offsets of the label inside `text`; they
    are shifted by the length of the text already in the record plus the
    single space inserted before `text`. The span's "label" is the
    identifier looked up for `label` in the module-level `rdf_graph`.
    Mutates `record` in place.
    """
    # Offset of `text` inside the record once the separating space is added.
    offset = len(record["text"]) + 1
    record["text"] += " " + text

    record["spans"].append({
        "start": local_start + offset,
        "end": local_end + offset,
        "text": label,
        "label": str(get_id_by_label(rdf_graph, label)),
    })

# 1. Generate simple sentences from BMO

Here we generate simple sentences from triples of the ontology:
- definition
- synonyms (including alternative and preferred labels)
- examples
- subClassOf and other outgoing relationships

In [11]:
# Load the BMO ontology (Turtle file) into a PandasPGFrame.
# NOTE(review): `remove_prop_uris=True` presumably shortens property URIs to
# plain names such as "definition" - confirm against the bluegraph docs.
frame = PandasPGFrame.from_ontology(
        filepath="../../ontologies/bbp/bmo.ttl",
        format="turtle", remove_prop_uris=True)

In [12]:
# Parse the same Turtle file with rdflib. `rdf_graph` is the module-level
# graph that add_annotation uses to look up identifiers by label.
rdf_graph = rdflib.Graph()
rdf_graph.parse("../../ontologies/bbp/bmo.ttl", format="turtle")

<Graph identifier=N90fdd4275711483ea73b4a87e92cf682 (<class 'rdflib.graph.Graph'>)>

In [13]:
# URI prefix of the BMO ontology terms.
BMO_URI = "https://bbp.epfl.ch/ontologies/core/bmo/"

In [14]:
# Convert the property-graph frame into a NetworkX graph.
graph = pgframe_to_networkx(frame)

In [15]:
# Natural-language phrases used in place of edge types when relationship
# sentences are generated; edge types not listed here are used verbatim.
COMMON_EDGE_ALIASES = {
    "IS_SUBCLASS_OF": "is a",
    "isPartOf": "is a part of",
    "hasParameter": "has parameter",
    "modelOf": "is a model of",
    "hasFeature": "has feature",
    "hasType": "has type"
}

In [16]:
# Rebuild the NetworkX graph from the frame (same call as in cell In [14]).
graph = pgframe_to_networkx(frame)

In [17]:
# Collects one {"text": ..., "spans": [...]} record per ontology node.
synthetic_text = []

In [18]:
# Build one annotated record per ontology node. Every generated sentence is
# appended to the record's text, and the position of the node label inside
# that sentence is stored as a span (see add_annotation).
for n in graph.nodes():
    print("Node: ", n)
    node_data = graph.nodes[n]
    label = node_data["label"]
    definition = node_data["definition"]

    record = {"text": "", "spans": []}

    # [D] definition sentence; the label starts at offset 0.
    if not is_nan(definition):
        text = f"{label} is {definition}."
        print("\t [D]", text)
        add_annotation(record, label, text, 0, len(label))

    # Gather alternative names from all three properties; each one may hold
    # a single value or a list of values.
    synonyms = []
    for prop in ("synonym", "altLabel", "prefLabel"):
        value = node_data[prop]
        if is_nan(value):
            continue
        if isinstance(value, list):
            synonyms += value
        else:
            synonyms.append(value)

    # [S] two phrasings per alternative name; the label starts at offset 0.
    for synonym in synonyms:
        text = f"{label} is also known as {synonym}."
        print("\t [S]", text)
        add_annotation(record, label, text, 0, len(label))

        text = f"{label} is referred to as {synonym}."
        print("\t [S]", text)
        add_annotation(record, label, text, 0, len(label))

    # [E] three phrasings built from the example(s).
    example = node_data["example"]
    if not is_nan(example):
        if isinstance(example, list):
            example = ", ".join(example)

        # "Examples of " is 12 characters long.
        text = f"Examples of {label} include {example}."
        print("\t [E]", text)
        add_annotation(record, label, text, 12, 12 + len(label))

        # " is an example of " is 18 characters long.
        text = f"{example} is an example of {label}."
        print("\t [E]", text)
        local_start = len(example) + 18
        add_annotation(record, label, text, local_start, local_start + len(label))

        # "Different examples of " is 22 characters long.
        text = f"Different examples of {label} such as {example}."
        print("\t [E]", text)
        local_start = 22
        add_annotation(record, label, text, local_start, local_start + len(label))

    # [R] one sentence per type of each outgoing edge, using the readable
    # alias when one is defined.
    for _, neighbor in graph.out_edges(n):
        for r_type in graph.edges[n, neighbor]["@type"]:
            r_type = COMMON_EDGE_ALIASES.get(r_type, r_type)
            text = f"{label} {r_type} {neighbor}."
            print("\t [R]", text)
            add_annotation(record, label, text, 0, len(label))

    synthetic_text.append(record)

    print()

Node:  EModel Hyperparameter
	 [D] EModel Hyperparameter is A parameter used to represent a specific value used in the settings of the EModel building pipeline..
	 [R] EModel Hyperparameter is a Hyperparameter.

Node:  Hyperparameter
	 [R] Hyperparameter is a Entity.

Node:  EModel Parameter Optimization Workflow
	 [R] EModel Parameter Optimization Workflow is a Parameter Optimization Workflow.
	 [R] EModel Parameter Optimization Workflow is a part of EModel Building Workflow.
	 [R] EModel Parameter Optimization Workflow generates Ion Channel Density.
	 [R] EModel Parameter Optimization Workflow generates Ion Dynamics Feature.
	 [R] EModel Parameter Optimization Workflow has parameter EModel Hyperparameter.
	 [R] EModel Parameter Optimization Workflow hasTarget Neuron Electrophysiological Feature.
	 [R] EModel Parameter Optimization Workflow uses Ion Channel Density Constraint.
	 [R] EModel Parameter Optimization Workflow uses Model Ionic Mechanism.
	 [R] EModel Parameter Optimization 

	 [D] Brain Atlas Release is The Atlas Release is an entity which describes the release of a brain atlas. An atlas release can be described by the release date and the template volume, parcellation volume, parcellation ontology and spatial reference system used..
	 [R] Brain Atlas Release is a Entity.

Node:  Oligodendrocyte
	 [S] Oligodendrocyte is also know as Oligodendrocyte.
	 [S] Oligodendrocyte is referred to as Oligodendrocyte.
	 [R] Oligodendrocyte is a Glia.

Node:  Facilitating Synapse Type
	 [R] Facilitating Synapse Type is a Synapse Plasticity Type.

Node:  Persistent Na+ Current
	 [D] Persistent Na+ Current is Persistent sodium current..
	 [R] Persistent Na+ Current is a Ionic Mechanism.
	 [R] Persistent Na+ Current ion Na.

Node:  Pyramidal Neuron
	 [D] Pyramidal Neuron is A neuron with a pyramidal shaped soma and two distinct dendritic trees. The basal dendrites emerge from the base and the apical dendrites from the apex of the pyramidal cell body..
	 [S] Pyramidal Neuro

In [19]:
# Persist the generated records as JSON so they can be attached to a dataset.
with open("../data/synthetic_texts.json", "w") as f:
    json.dump(synthetic_text, f)

In [25]:
# Wrap the JSON file in a Dataset resource and register it through the forge.
dataset = Dataset(
    forge,
    name="Synthetic Sentences from BMO",
    description="This dataset contains a set of annotated synthetic texts generated from the BMO.")
dataset.add_distribution(
    "../data/synthetic_texts.json",
    content_type="application/json")
forge.register(dataset)

<action> _register_one
<succeeded> True


# 2. Use Wikipedia to extract some more text

In [200]:
# Hand-picked Wikipedia pages for selected terms. These terms are excluded
# from the automatic search and take precedence when the mappings are merged.
MANUAL_MAP = {
    "EModel": "https://en.wikipedia.org/wiki/Biological_neuron_model",
    "GABA": "https://en.wikipedia.org/wiki/GABA_receptor",
    "Whole Brain": "https://en.wikipedia.org/wiki/Brain",
    "K": "https://en.wikipedia.org/wiki/Potassium",
    "Na": "https://en.wikipedia.org/wiki/Sodium",
    "Ca": "https://en.wikipedia.org/wiki/Calcium"
}

In [123]:
# Terms skipped by the automatic Wikipedia search.
# NOTE(review): presumably because the top search hit is unrelated - confirm.
TERMS_TO_IGNORE = {
    "Kv11",
    "Interstitial Space",
    "Brain Area",
    "IBEA",
    "NGV Unit",
    "MO-CMA",
    "SO-CMA",
    "Model Cell",
    "Quantitative Value",
    "Vector3D",
    "Trace",
    "Subject",
    "Brain Layer",
    "Model Glia",
    "Lamp+",
    "SST+",
    "VIP+",
    "cAD",
    "PV+",
    "MEModel",
    "Neuron Part"
}

In [124]:
# Map each remaining graph node to the URL of its top Wikipedia search hit.
WIKI_SEARCH_MAPPING = {}
for node in graph.nodes():
    # Ignored terms and manually mapped terms are not searched.
    if node in TERMS_TO_IGNORE or node in MANUAL_MAP:
        continue
    res = search_wiki(node.lower())
    if res is None:
        continue
    _, page_url = res
    WIKI_SEARCH_MAPPING[node] = page_url

In [125]:
# Merge both mappings; entries from MANUAL_MAP win on duplicate keys.
ALL_MAPPED_ENTITIES = {**WIKI_SEARCH_MAPPING, **MANUAL_MAP}

Create a reverse mapping of wiki pages to BMO terms

In [126]:
# Reverse lookup: Wikipedia URL -> BMO term URI. The URI is built from the
# BMO prefix and the term name with its spaces removed.
wiki_to_bmo = {
    wiki_page: BMO_URI + term.replace(' ', '')
    for term, wiki_page in ALL_MAPPED_ENTITIES.items()
}

## 2.2. Get Wiki texts for mapped articles

In [218]:
# Download the text of every mapped Wikipedia page, in mapping order.
corpus = [
    extract_full_wiki_text(page_url)
    for page_url in ALL_MAPPED_ENTITIES.values()
]

In [219]:
# Show one randomly chosen text. randrange excludes the upper bound, so the
# index is always valid; randint(0, len(corpus)) could return len(corpus).
print("Random extracted text:\n")
index = random.randrange(len(corpus))
print(corpus[index])

Random extracted text:

A hormone (from the Greek participle ὁρμῶν, "setting in motion") is any member of a class of signaling molecules in multicellular organisms, that are transported by intricate biological processes to distant organs to regulate physiology and behavior. Hormones are required for the correct development of animals, plants and fungi. The lax definition of a hormone (as a signalling molecule that acts distant from its site of production) means that many different classes of molecule can be defined as hormones. Among the substances that can be considered hormones, are eicosanoids (e.g. prostaglandins and thromboxanes), steroids (e.g. oestrogen and brassinosteroid), amino acid derivatives (e.g. epinephrine and auxin), protein / peptides (e.g. insulin and CLE peptides) and gases (e.g ethylene and nitric oxide). Hormones are used to communicate between organs and tissues. In vertebrates, hormones are responsible for the regulation of many physiological processes and behav

Extract 'related' articles

In [None]:
# Collect the mapped Wikipedia pages plus every page they link to
# (DBpedia wikiPageWikiLink values).
unique_articles = set()
for page_url in wiki_to_bmo:
    unique_articles.add(page_url)
    # The incoming links returned by query_dbpedia are deliberately not used.
    data, _ = query_dbpedia(page_url)
    print(page_url)
    linked = data.get("http://dbpedia.org/ontology/wikiPageWikiLink", [])
    unique_articles.update(entry["value"] for entry in linked)

In [221]:
# Report how many related articles could be fetched in addition.
print("If we want to extract related articles, could retrieve articles: ", len(unique_articles))

If we want to extract related acticles, could retrieve articles:  5845


In [222]:
# for el in unique_articles:
#     url_wiki = dbpedia_uri_to_wiki(el)
#     corpus.append(extract_full_wiki_text(url_wiki))

## 2.3. Use an existing pre-trained ER model

https://github.com/egerber/spaCy-entity-linker

In [223]:
# initialize language model
nlp = spacy.load("en_core_web_lg")

# add pipeline (declared through entry_points in setup.py)
nlp.add_pipe("entityLinker", last=True)

<spacy_entity_linker.EntityLinker.EntityLinker at 0x7f966bf999d0>

In [224]:
# Collects the annotated Wikipedia texts that mention at least one BMO term.
wiki_texts = []

In [None]:
# Link entities in every corpus text and keep the spans whose Wikipedia page
# maps to a BMO term. WIKIDATA_TO_WIKI caches Wikidata URL -> Wikipedia URL
# (including None for entities without an English page) to avoid repeated
# API calls.
WIKIDATA_TO_WIKI = {}
for text in corpus:
    try:
        doc = nlp(text)
        record = {"text": str(doc), "spans": []}
        # Iterate over all linked entities of the whole document.
        for i, entity in enumerate(doc._.linkedEntities):
            wikidata_url = entity.url
            if wikidata_url not in WIKIDATA_TO_WIKI:
                WIKIDATA_TO_WIKI[wikidata_url] = wiki_url_from_wikidata(
                    wikidata_url.split("/")[-1], "en")
            wiki_url = WIKIDATA_TO_WIKI[wikidata_url]
            if wiki_url in wiki_to_bmo:
                record["spans"].append({
                    "text": entity.span.text,
                    "start": entity.span.start_char,
                    "end": entity.span.end_char,
                    "label": wiki_to_bmo[wiki_url]
                })
            # Give up on a text when none of its first entities (indices
            # 0..31) matched a BMO term.
            if i > 30 and not record["spans"]:
                break
        if record["spans"]:
            wiki_texts.append(record)
    except Exception as e:
        # Best effort: skip texts that fail, but report the failure instead
        # of dropping them silently.
        print(f"Skipping a text after an error: {e!r}")

In [228]:
# Persist the annotated Wikipedia texts as JSON so they can be attached to a dataset.
with open("../data/wiki_texts.json", "w") as f:
    json.dump(wiki_texts, f)

In [26]:
# Wrap the JSON file in a Dataset resource and register it through the forge.
dataset = Dataset(
    forge,
    name="Wikipedia texts for BMO entities",
    description="This dataset contains a set of annotated Wikipedia texts mentioning various BMO entities.")
dataset.add_distribution(
    "../data/wiki_texts.json",
    content_type="application/json")
forge.register(dataset)

<action> _register_one
<succeeded> True
