In [1]:
import spacy
import crosslingual_coreference

[nltk_data] Downloading package omw-1.4 to /home/jovyan/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [23]:
def extract_triplets(text):
    triplets = []
    relation, subject, relation, object_ = '', '', '', ''
    text = text.strip()
    current = 'x'
    for token in text.replace("<s>", "").replace("<pad>", "").replace("</s>", "").split():
        if token == "<triplet>":
            current = 't'
            if relation != '':
                triplets.append({'head': subject.strip(), 'type': relation.strip(),'tail': object_.strip()})
                relation = ''
            subject = ''
        elif token == "<subj>":
            current = 's'
            if relation != '':
                triplets.append({'head': subject.strip(), 'type': relation.strip(),'tail': object_.strip()})
            object_ = ''
        elif token == "<obj>":
            current = 'o'
            relation = ''
        else:
            if current == 't':
                subject += ' ' + token
            elif current == 's':
                object_ += ' ' + token
            elif current == 'o':
                relation += ' ' + token
    if subject != '' and relation != '' and object_ != '':
        triplets.append({'head': subject.strip(), 'type': relation.strip(),'tail': object_.strip()})
    return triplets
sent = '<s><triplet> Jennifer Anne Doudna <subj> February 19, 1964 <obj> date of birth <subj> Nobel Prize in Chemistry <obj> award received <triplet> Emmanuelle Charpentier <subj> Nobel Prize in Chemistry <obj> award received</s>'
print(extract_triplets(sent))

[{'head': 'Jennifer Anne Doudna', 'type': 'date of birth', 'tail': 'February 19, 1964'}, {'head': 'Jennifer Anne Doudna', 'type': 'award received', 'tail': 'Nobel Prize in Chemistry'}, {'head': 'Emmanuelle Charpentier', 'type': 'award received', 'tail': 'Nobel Prize in Chemistry'}]


In [22]:
"""sent = '<s><triplet> Jennifer Anne Doudna <subj> February 19, 1964 <obj> date of birth <subj> Nobel Prize in Chemistry <obj> award received <triplet> Emmanuelle Charpentier <subj> Nobel Prize in Chemistry <obj> award received</s>'
def extract_triplets(text):
    text = text.replace("<s>","")
    text = text.replace("</s>","")
    if text.startswith("<triplet>"):
        text = text[len("<triplet>"):].strip()    
    triplets = []
    for sent in text.strip().split("<triplet>"):
        sent = sent.strip()
        print(sent)
        parts = []
        for token in sent.split(">"):
            parts.append([w.strip() for w in token.split("<")])
        print(len(parts))
        assert len(parts)%3 == 0
        for n in range(len(parts)//3):
            print(n)
            start = n*3
            end = n*3+3
            head,tail,type = [p[0] for p in parts[start:end]]
            triplets.append({'head':head,"type":type,"tail":tail})
    return triplets
print(extract_triplets(sent))"""

'sent = \'<s><triplet> Jennifer Anne Doudna <subj> February 19, 1964 <obj> date of birth <subj> Nobel Prize in Chemistry <obj> award received <triplet> Emmanuelle Charpentier <subj> Nobel Prize in Chemistry <obj> award received</s>\'\ndef extract_triplets(text):\n    text = text.replace("<s>","")\n    text = text.replace("</s>","")\n    if text.startswith("<triplet>"):\n        text = text[len("<triplet>"):].strip()    \n    triplets = []\n    for sent in text.strip().split("<triplet>"):\n        sent = sent.strip()\n        print(sent)\n        parts = []\n        for token in sent.split(">"):\n            parts.append([w.strip() for w in token.split("<")])\n        print(len(parts))\n        assert len(parts)%3 == 0\n        for n in range(len(parts)//3):\n            print(n)\n            start = n*3\n            end = n*3+3\n            head,tail,type = [p[0] for p in parts[start:end]]\n            triplets.append({\'head\':head,"type":type,"tail":tail})\n    return triplets\nprint

In [3]:
# Add rebel component https://github.com/Babelscape/rebel/blob/main/spacy_component.py
import requests
import re
import hashlib
from spacy import Language
from typing import List

from spacy.tokens import Doc, Span

from transformers import pipeline

def call_wiki_api(item):
  try:
    url = f"https://www.wikidata.org/w/api.php?action=wbsearchentities&search={item}&language=en&format=json"
    data = requests.get(url).json()
    # Return the first id (Could upgrade this in the future)
    return data['search'][0]['id']
  except:
    return 'id-less'


@Language.factory(
    "rebel",
    requires=["doc.sents"],
    assigns=["doc._.rel"],
    default_config={
        "model_name": "Babelscape/rebel-large",
        "device": -1,
    },
)
class RebelComponent:
    def __init__(
        self,
        nlp,
        name,
        model_name: str,
        device: int,
    ):
        assert model_name is not None, ""
        self.triplet_extractor = pipeline("text2text-generation", model=model_name, tokenizer=model_name, device=device)
        self.entity_mapping = {}
        # Register custom extension on the Doc
        if not Doc.has_extension("rel"):
          Doc.set_extension("rel", default={})

    def get_wiki_id(self, item: str):
        mapping = self.entity_mapping.get(item)
        if mapping:
          return mapping
        else:
          res = call_wiki_api(item)
          self.entity_mapping[item] = res
          return res

    
    def _generate_triplets(self, sent: Span) -> List[dict]:
        output_ids = self.triplet_extractor(sent.text, return_tensors=True, return_text=False)[0]["generated_token_ids"]["output_ids"]
        extracted_text = self.triplet_extractor.tokenizer.batch_decode(output_ids[0])
        extracted_triplets = extract_triplets(extracted_text[0])
        return extracted_triplets

    def set_annotations(self, doc: Doc, triplets: List[dict]):
        for triplet in triplets:
            # Remove self-loops (relationships that start and end at the entity)
            if triplet['head'] == triplet['tail']:
                continue

            # Use regex to search for entities
            head_span = re.search(triplet["head"], doc.text)
            tail_span = re.search(triplet["tail"], doc.text)

            # Skip the relation if both head and tail entities are not present in the text
            # Sometimes the Rebel model hallucinates some entities
            if not head_span or not tail_span:
                continue

            index = hashlib.sha1("".join([triplet['head'], triplet['tail'], triplet['type']]).encode('utf-8')).hexdigest()
            if index not in doc._.rel:
                # Get wiki ids and store results
                doc._.rel[index] = {"relation": triplet["type"], "head_span": {'text': triplet['head'], 'id': self.get_wiki_id(triplet['head'])}, "tail_span": {'text': triplet['tail'], 'id': self.get_wiki_id(triplet['tail'])}}

    def __call__(self, doc: Doc) -> Doc:
        for sent in doc.sents:
            sentence_triplets = self._generate_triplets(sent)
            self.set_annotations(doc, sentence_triplets)
        return doc

In [4]:
DEVICE = -1 # Number of the GPU, -1 if want to use CPU

# Add coreference resolution model
coref = spacy.load('en_core_web_sm', disable=['ner', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer'])
coref.add_pipe(
    "xx_coref", config={"chunk_size": 2500, "chunk_overlap": 2, "device": DEVICE})

# Define rel extraction model

rel_ext = spacy.load('en_core_web_sm', disable=['ner', 'lemmatizer', 'attribute_rules', 'tagger'])
rel_ext.add_pipe("rebel", config={
    'device':DEVICE, # Number of the GPU, -1 if want to use CPU
    'model_name':'Babelscape/rebel-large'} # Model used, will default to 'Babelscape/rebel-large' if not given
    )

Some weights of the model checkpoint at nreimers/mMiniLMv2-L12-H384-distilled-from-XLMR-Large were not used when initializing XLMRobertaModel: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaModel were not initialized from the model checkpoint at nreimers/mMiniLMv2-L12-H384-distilled-from-XLMR-Large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-st

<__main__.RebelComponent at 0x7fa682ce6a10>

In [5]:

input_text = "Christian Drosten works in Germany. He likes to work for Google."

coref_text = coref(input_text)._.resolved_text

doc = rel_ext(coref_text)

for value, rel_dict in doc._.rel.items():
    print(f"{value}: {rel_dict}")

385d9ac30c35635c7e36a1636ded4a091f830cf3: {'relation': 'country of citizenship', 'head_span': {'text': 'Christian Drosten', 'id': 'Q1079331'}, 'tail_span': {'text': 'Germany', 'id': 'Q183'}}
471a35571b66cc8e1e3415e27f7086505310efc6: {'relation': 'employer', 'head_span': {'text': 'Christian Drosten', 'id': 'Q1079331'}, 'tail_span': {'text': 'Google', 'id': 'Q95'}}


In [6]:
coref_text

'Christian Drosten works in Germany. Christian Drosten likes to work for Google.'

In [7]:
doc._.rel

{'385d9ac30c35635c7e36a1636ded4a091f830cf3': {'relation': 'country of citizenship',
  'head_span': {'text': 'Christian Drosten', 'id': 'Q1079331'},
  'tail_span': {'text': 'Germany', 'id': 'Q183'}},
 '471a35571b66cc8e1e3415e27f7086505310efc6': {'relation': 'employer',
  'head_span': {'text': 'Christian Drosten', 'id': 'Q1079331'},
  'tail_span': {'text': 'Google', 'id': 'Q95'}}}

In [25]:
import pandas as pd
import wikipedia
from neo4j import GraphDatabase

# Define Neo4j connection
host = 'bolt://127.0.0.1:7687'
user = 'neo4j'
password = '12345678'
driver = GraphDatabase.driver(host,auth=(user, password))

In [26]:
driver.get_server_info()

<neo4j.api.ServerInfo at 0x7fa63f7f8a10>

In [31]:
import_query = """
UNWIND $data AS row
MERGE (h:Entity {id: CASE WHEN NOT row.head_span.id = 'id-less' THEN row.head_span.id ELSE row.head_span.text END})
ON CREATE SET h.text = row.head_span.text
MERGE (t:Entity {id: CASE WHEN NOT row.tail_span.id = 'id-less' THEN row.tail_span.id ELSE row.tail_span.text END})
ON CREATE SET t.text = row.tail_span.text
WITH row, h, t
CALL apoc.merge.relationship(h, toUpper(replace(row.relation,' ', '_')),
  {},
  {},
  t,
  {}
)
YIELD rel
RETURN distinct 'done' AS result;
"""


def run_query(query, params={}):
    with driver.session() as session:
        result = session.run(query, params)
        return pd.DataFrame([r.values() for r in result], columns=result.keys())

def store_wikipedia_summary(page):
  #try:
  input_text = wikipedia.page(page).summary
  print(input_text)
  coref_text = coref(input_text)._.resolved_text
  doc = rel_ext(coref_text)
  params = [rel_dict for value, rel_dict in doc._.rel.items()]
  run_query(import_query, {'data': params})
  #except Exception as e:
  #  print(f"Couldn't parse text for {page} due to {e}")

In [32]:

ladies = ["Jennifer Doudna", "Rachel Carson", "Sara Seager OC", "Gertrude Elion", "Rita Levi-Montalcini"]

for l in ladies:
  print(f"Parsing {l}")
  store_wikipedia_summary(l)
     

Parsing Jennifer Doudna
Jennifer Anne Doudna  (; born February 19, 1964) is an American biochemist who has done pioneering work in CRISPR gene editing, and made other fundamental contributions in biochemistry and genetics. Doudna was one of the first women to share a Nobel in the sciences. She received the 2020 Nobel Prize in Chemistry, with Emmanuelle Charpentier, "for the development of a method for genome editing." She is the Li Ka Shing Chancellor's Chair Professor in the department of chemistry and the department of molecular and cell biology at the University of California, Berkeley. She has been an investigator with the Howard Hughes Medical Institute since 1997.
Doudna graduated from Pomona College in 1985 and earned a Ph.D. from Harvard Medical School in 1989. Apart from her professorship at Berkeley, she is also president and chair of the board of the Innovative Genomics Institute, a faculty scientist at Lawrence Berkeley National Laboratory, a senior investigator at the Glad

In [33]:

run_query("""
CALL apoc.periodic.iterate("
  MATCH (e:Entity)
  WHERE e.id STARTS WITH 'Q'
  RETURN e
","
  // Prepare a SparQL query
  WITH 'SELECT * WHERE{ ?item rdfs:label ?name . filter (?item = wd:' + e.id + ') filter (lang(?name) = \\"en\\") ' +
     'OPTIONAL {?item wdt:P31 [rdfs:label ?label] .filter(lang(?label)=\\"en\\")}}' AS sparql, e
  // make a request to Wikidata
  CALL apoc.load.jsonParams(
    'https://query.wikidata.org/sparql?query=' + 
      + apoc.text.urlencode(sparql),
      { Accept: 'application/sparql-results+json'}, null)
  YIELD value
  UNWIND value['results']['bindings'] as row
  SET e.wikipedia_name = row.name.value
  WITH e, row.label.value AS label
  MERGE (c:Class {id:label})
  MERGE (e)-[:INSTANCE_OF]->(c)
  RETURN distinct 'done'", {batchSize:1, retry:1})
""")
     

Unnamed: 0,batches,total,timeTaken,committedOperations,failedOperations,failedBatches,retries,errorMessages,batch,operations,wasTerminated,failedParams,updateStatistics
0,50,50,0,46,4,4,0,{'Cannot merge the following node because of n...,"{'total': 50, 'committed': 46, 'failed': 4, 'e...","{'total': 50, 'committed': 46, 'failed': 4, 'e...",False,{},"{'nodesDeleted': 0, 'labelsAdded': 0, 'relatio..."
