## Install necessary libraries

In [None]:
!pip install crosslingual-coreference neo4j mysql-connector-python
!pip install transformers
!python -m spacy download en_core_web_sm

Collecting crosslingual-coreference
  Downloading crosslingual_coreference-0.3.1-py3-none-any.whl (12 kB)
Collecting neo4j
  Downloading neo4j-5.14.0.tar.gz (192 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m192.4/192.4 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting mysql-connector-python
  Downloading mysql_connector_python-8.2.0-cp310-cp310-manylinux_2_17_x86_64.whl (31.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.6/31.6 MB[0m [31m23.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting allennlp<2.10,>=2.9 (from crosslingual-coreference)
  Downloading allennlp-2.9.3-py3-none-any.whl (719 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m719.6/719.6 kB[0m [31m60.1 MB/s[0m eta [36

## Importing required modules

In [None]:
import hashlib
import os
import re
from getpass import getpass
from typing import List

import crosslingual_coreference
import mysql.connector
import pandas as pd
import requests
import spacy
from neo4j import GraphDatabase
from spacy import Language
from spacy.tokens import Doc, Span
from transformers import pipeline

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


## Custom spaCy pipeline component that extracts relation triplets with the REBEL model and annotates each entity with its Wikidata ID and description via a Wikidata API lookup

In [None]:
def call_wiki_api(item, timeout=10):
    """Look up ``item`` on Wikidata and return the id/description of the top hit.

    Calls the ``wbsearchentities`` action of the Wikidata API (English search,
    JSON response) and uses the first search result.

    Args:
        item: Entity label to search for.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A dict with the keys ``'id'`` and ``'category'``. ``'category'`` is the
        hit's ``description`` field, or ``'No category available'`` when the hit
        has none. When the request fails or nothing matches, the placeholder
        ``{'id': 'id-less', 'category': 'No category available'}`` is returned
        instead of raising (best-effort lookup).
    """
    try:
        response = requests.get(
            "https://www.wikidata.org/w/api.php",
            # Passing the query as ``params`` lets requests URL-encode the label,
            # so labels containing '&', '#', '+' or spaces no longer corrupt the
            # query string.
            params={
                "action": "wbsearchentities",
                "search": item,
                "language": "en",
                "format": "json",
            },
            timeout=timeout,
        )
        # Surface HTTP errors here rather than as an opaque JSON/Key error below.
        response.raise_for_status()
        data = response.json()

        # An empty result list raises IndexError, handled by the fallback below.
        result = data['search'][0]
        item_id = result['id']
        category = result.get('description', 'No category available')
        return {'id': item_id, 'category': category}
    except Exception as e:
        # Deliberately broad: any failure degrades to the placeholder entity.
        print(f"An error occurred while getting {item} information: {e}")
        return {'id': 'id-less', 'category': 'No category available'}

# Function to parse the generated text and extract the triplet
def extract_triplets(text):
    """Parse marker-delimited generated text into relation triplets.

    The text is expected to use the markers ``<triplet>`` (tokens that follow
    form the head), ``<subj>`` (tokens that follow form the tail) and ``<obj>``
    (tokens that follow form the relation type). The special tokens ``<s>``,
    ``<pad>`` and ``</s>`` are removed before parsing, and tokens that appear
    before the first marker are ignored.

    Args:
        text: Decoded model output.

    Returns:
        A list of dicts with the keys ``'head'``, ``'type'`` and ``'tail'``,
        in the order the triplets were completed.
    """
    cleaned = text.strip()
    for special in ("<s>", "<pad>", "</s>"):
        cleaned = cleaned.replace(special, "")

    found = []
    parts = {'head': '', 'type': '', 'tail': ''}
    target = None  # which part plain tokens are currently appended to

    def emit():
        # Snapshot the parts collected so far as one finished triplet.
        found.append({key: value.strip() for key, value in parts.items()})

    for piece in cleaned.split():
        if piece == "<triplet>":
            target = 'head'
            if parts['type']:
                emit()
                parts['type'] = ''
            parts['head'] = ''
        elif piece == "<subj>":
            target = 'tail'
            # A new tail for the same head: flush the pending triplet first.
            if parts['type']:
                emit()
            parts['tail'] = ''
        elif piece == "<obj>":
            target = 'type'
            parts['type'] = ''
        elif target is not None:
            parts[target] += ' ' + piece

    # Flush the trailing triplet only when all three parts were collected.
    if all(parts.values()):
        emit()

    return found

@Language.factory(
    "rebel",
    requires=["doc.sents"],
    assigns=["doc._.rel"],
    default_config={
        "model_name": "Babelscape/rebel-large",
        "device": 0,
    },
)
class RebelComponent:
    """spaCy pipeline component that extracts relation triplets per sentence.

    Each sentence of the document is passed through a ``text2text-generation``
    transformers pipeline, the generated text is parsed with
    ``extract_triplets`` and the resulting relations are stored in the custom
    ``doc._.rel`` extension, keyed by a SHA-1 hash of head, tail and relation
    type. Head and tail entities are annotated with the id and category
    returned by ``call_wiki_api``.
    """

    def __init__(
        self,
        nlp,
        name,
        model_name: str,
        device: int,
    ):
        """Load the generation pipeline and register the ``rel`` Doc extension.

        Args:
            nlp: The spaCy ``Language`` object (supplied by the factory; unused).
            name: Name of the component instance (supplied by the factory; unused).
            model_name: Model identifier, used for both the model and the tokenizer.
            device: Device index forwarded to the transformers pipeline.
        """
        assert model_name is not None, "model_name must not be None"
        self.triplet_extractor = pipeline(
            "text2text-generation", model=model_name, tokenizer=model_name, device=device
        )
        # Cache of entity text -> lookup result, so each entity is queried only once.
        self.entity_mapping = {}
        # Register custom extension on the Doc
        if not Doc.has_extension("rel"):
            Doc.set_extension("rel", default={})

    def get_wiki_id(self, item: str):
        """Return the lookup result for ``item``, querying the API only on a cache miss."""
        mapping = self.entity_mapping.get(item)
        if mapping:
            return mapping
        res = call_wiki_api(item)
        self.entity_mapping[item] = res
        return res

    def _generate_triplets(self, sent: Span) -> List[dict]:
        """Generate text for one sentence and parse it into triplet dicts."""
        output_ids = self.triplet_extractor(sent.text, return_tensors=True, return_text=False)[0]["generated_token_ids"]["output_ids"]
        extracted_text = self.triplet_extractor.tokenizer.batch_decode(output_ids[0])
        extracted_triplets = extract_triplets(extracted_text[0])
        return extracted_triplets

    def set_annotations(self, doc: Doc, triplets: List[dict]):
        """Store the given triplets in ``doc._.rel``.

        Self-loops and triplets whose head or tail text does not occur in the
        document are dropped. A triplet that is already stored (same head, tail
        and relation type) is not looked up or stored again.
        """
        for triplet in triplets:
            # Remove self-loops (relationships that start and end at the entity)
            if triplet['head'] == triplet['tail']:
                continue

            # Plain substring test: the entity text is literal text, not a regex
            # pattern, so characters such as '(', '+' or '.' must not be
            # interpreted (an unbalanced '(' would even raise re.error).
            head_found = triplet["head"] in doc.text
            tail_found = triplet["tail"] in doc.text

            # Skip the relation if either the head or the tail entity is not
            # present in the text, since the Rebel model sometimes hallucinates
            # entities.
            if not head_found or not tail_found:
                continue

            # The key is the SHA-1 of head + tail + type, concatenated without a separator.
            index = hashlib.sha1("".join([triplet['head'], triplet['tail'], triplet['type']]).encode('utf-8')).hexdigest()
            if index not in doc._.rel:
                # Get wiki ids and store results
                head_data = self.get_wiki_id(triplet['head'])
                tail_data = self.get_wiki_id(triplet['tail'])
                doc._.rel[index] = {"relation": triplet["type"],
                                    "head_span": {'text': triplet['head'],
                                                  'id': head_data['id'],
                                                  'category': head_data['category']},
                                    "tail_span": {'text': triplet['tail'],
                                                  'id': tail_data['id'],
                                                  'category': tail_data['category']}
                                   }

    def __call__(self, doc: Doc) -> Doc:
        """Run triplet extraction on every sentence of ``doc`` and return the doc."""
        for sent in doc.sents:
            sentence_triplets = self._generate_triplets(sent)
            self.set_annotations(doc, sentence_triplets)
        return doc

## Initialize the coreference-resolution pipeline ('xx_coref') and the relation-extraction pipeline ('rebel') in spaCy

In [None]:
DEVICE = -1 # Number of the GPU, -1 if want to use CPU

# Add coreference resolution model
coref = spacy.load('en_core_web_sm', disable=['ner', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer'])
coref.add_pipe(
    "xx_coref", config={"chunk_size": 2500, "chunk_overlap": 2, "device": DEVICE}
)

# Define rel extraction model
# The parser stays enabled because the "rebel" component requires doc.sents.
# The component name is 'attribute_ruler' (as in the coref pipeline above);
# the previous spelling 'attribute_rules' matched no component.
rel_ext = spacy.load('en_core_web_sm', disable=['ner', 'lemmatizer', 'attribute_ruler', 'tagger'])
rel_ext.add_pipe("rebel", config={
    'device':DEVICE,
    'model_name':'Babelscape/rebel-large'}
)

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
models/crosslingual-coreference/minilm/model.tar.gz: 358490KB [00:12, 28087.52KB/s]                            
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


Downloading:   0%|          | 0.00/357 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/489 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.83M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/8.66M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/225M [00:00<?, ?B/s]

Some weights of the model checkpoint at nreimers/mMiniLMv2-L12-H384-distilled-from-XLMR-Large were not used when initializing XLMRobertaModel: ['lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaModel were not initialized from the model checkpoint at nreimers/mMiniLMv2-L12-H384-distilled-from-XLMR-Large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-st

Downloading:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.51G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/780k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/123 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/344 [00:00<?, ?B/s]

<__main__.RebelComponent at 0x7cc8fa312e90>

## Connect to a Neo4j database, resolve coreferences in the input texts, extract entity relationships, and import the extracted relationships into Neo4j

In [None]:
# Define Neo4j connection
# Credentials are read from the environment instead of being stored in the notebook.
# If NEO4J_PASSWORD is not set, the password is asked for interactively.
# NOTE(review): a password was previously hardcoded in this cell; treat it as leaked and rotate it.
neo4j_host = os.environ.get('NEO4J_URI', 'bolt+s://079be4b8.databases.neo4j.io:7687')
neo4j_user = os.environ.get('NEO4J_USER', 'neo4j')
neo4j_password = os.environ.get('NEO4J_PASSWORD') or getpass('Neo4j password: ')
neo4j_driver = GraphDatabase.driver(neo4j_host,auth=(neo4j_user, neo4j_password))

# # Define MySQL connection (disabled; the sample corpus below is used instead)
# mysql_con = mysql.connector.connect(
#     host="ayurconnect.mysql.database.azure.com",
#     user="service_admin",
#     password=os.environ["MYSQL_PASSWORD"],
#     database="ayurconnect",
# )

# cursor = mysql_con.cursor()
# cursor.execute("SELECT c.Body, r.Body FROM contents c RIGHT OUTER JOIN responses r ON c.Id = r.ContentId WHERE c.Id = r.ContentId OR c.ContentType = 'ARTICLE' AND c.IsDeleted = 0;")
# The query selects two columns, so only row[0] and row[1] exist.
# corpus_data = [f"{row[0]} {row[1]}" for row in cursor.fetchall()]
corpus_data = ["Sugar is not good for diabetes patients.", "Proteins are good for diabetes patients", "Dengue fever typically manifests after an incubation period of 4-10 days following a mosquito bite from an infected mosquito. High fever (40°C/104°F) is usually accompanied by at least two symptoms: headaches (pain behind the eyes), muscle and joint pains, nausea, vomiting, swollen glands, and rash."]

# Cypher statement that imports a batch of extracted relations.
# For every element of the $data parameter it:
#   * MERGEs an :Entity node for the head and for the tail, keyed by the span's id,
#     or by its text when the id is the placeholder 'id-less';
#   * sets text and category only when the node is newly created;
#   * merges a relationship from head to tail via apoc.merge.relationship (so the APOC
#     procedures must be available on the server), using the relation text upper-cased
#     with spaces replaced by underscores as the relationship type and empty property maps.
import_query = """
UNWIND $data AS row
MERGE (h:Entity {id: CASE WHEN NOT row.head_span.id = 'id-less' THEN row.head_span.id ELSE row.head_span.text END})
ON CREATE SET h.text = row.head_span.text, h.category = row.head_span.category
MERGE (t:Entity {id: CASE WHEN NOT row.tail_span.id = 'id-less' THEN row.tail_span.id ELSE row.tail_span.text END})
ON CREATE SET t.text = row.tail_span.text, t.category = row.tail_span.category
WITH row, h, t
CALL apoc.merge.relationship(h, toUpper(replace(row.relation,' ', '_')),
  {},
  {},
  t,
  {}
)
YIELD rel
RETURN distinct 'done' AS result;
"""

def run_query(query, params=None):
    """Run a Cypher query on the module-level ``neo4j_driver`` and return the rows.

    Args:
        query: Cypher statement to execute.
        params: Optional dict of query parameters; ``None`` means no parameters.

    Returns:
        A ``pandas.DataFrame`` with one row per result record and the result
        keys as column names.
    """
    # ``None`` instead of a mutable ``{}`` default, so no dict object is shared
    # between calls; an explicit ``None`` is normalised to an empty dict too.
    with neo4j_driver.session() as session:
        result = session.run(query, params or {})
        return pd.DataFrame([r.values() for r in result], columns=result.keys())

for text in corpus_data:
    # Run coreference resolution and feed the resolved text to the relation extractor.
    coref_text = coref(text)._.resolved_text
    doc = rel_ext(coref_text)

    for value, rel_dict in doc._.rel.items():
        print(f"{value}: {rel_dict}")

    # Import all relations of this document with a single query. Previously the
    # complete list was re-sent once per relation, i.e. n identical imports.
    params = list(doc._.rel.values())
    if not params:
        continue
    try:
        run_query(import_query, {'data': params})
    except Exception as e:
        print(f"Couldn't parse text for {params} due to {e}")

514a7bd30bfc653a91af0544dfdfe5da147d0f09: {'relation': 'has effect', 'head_span': {'text': 'Sugar', 'id': 'Q23118', 'category': 'chemical compound'}, 'tail_span': {'text': 'diabetes', 'id': 'Q12206', 'category': 'group of metabolic disorders characterized by high blood sugar levels over a prolonged period'}}
f3bdb2c08b9c61ee58276a11db70eba4a6476094: {'relation': 'has cause', 'head_span': {'text': 'diabetes', 'id': 'Q12206', 'category': 'group of metabolic disorders characterized by high blood sugar levels over a prolonged period'}, 'tail_span': {'text': 'Sugar', 'id': 'Q23118', 'category': 'chemical compound'}}
7aab6f4429c95652804bbe6b37fedf2d06c15444: {'relation': 'drug used for treatment', 'head_span': {'text': 'diabetes', 'id': 'Q12206', 'category': 'group of metabolic disorders characterized by high blood sugar levels over a prolonged period'}, 'tail_span': {'text': 'Protein', 'id': 'Q8054', 'category': 'biomolecule consisting of chains of amino acid residues'}}
f8499e01a53e7cc9bd5