In [None]:
!sudo apt install tesseract-ocr
!sudo apt-get install poppler-utils
!pip install zero-shot-re neo4j
!pip install pdf2image
!pip install pytesseract

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  tesseract-ocr-eng tesseract-ocr-osd
The following NEW packages will be installed:
  tesseract-ocr tesseract-ocr-eng tesseract-ocr-osd
0 upgraded, 3 newly installed, 0 to remove and 18 not upgraded.
Need to get 4,816 kB of archives.
After this operation, 15.6 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-eng all 1:4.00~git30-7274cfa-1.1 [1,591 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-osd all 1:4.00~git30-7274cfa-1.1 [2,990 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr amd64 4.1.1-2.1build1 [236 kB]
Fetched 4,816 kB in 1s (4,683 kB/s)
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debc

In [None]:
import requests
import pdf2image
import pytesseract

# Number of leading pages to keep. The original loop kept zero-based pages 0-5
# and noted that what follows is only references.
MAX_PAGES = 6

pdf = requests.get('https://arxiv.org/pdf/2110.03526.pdf')
# Fail loudly on an HTTP error instead of handing an error page to the PDF parser.
pdf.raise_for_status()
doc = pdf2image.convert_from_bytes(pdf.content)

# Get the article text
# OCR is the slow step, so it only runs on the pages that are actually kept.
# image_to_string already returns str, so no encode/decode round trip is needed.
article = [pytesseract.image_to_string(page_data) for page_data in doc[:MAX_PAGES]]
article_txt = " ".join(article)


In [None]:
import nltk
nltk.download('punkt')
# Recent NLTK releases load the sentence tokenizer from the 'punkt_tab' resource
# instead of the pickled 'punkt' models, so fetch both to work across versions.
nltk.download('punkt_tab')

def clean_text(text):
  """Drop short rows, "(a)" sub-captions and "Figure" captions from text.

  A row is kept only when splitting it on single spaces yields more than
  three pieces and it starts with neither "(a)" nor "Figure". The surviving
  rows are joined back together with newlines.
  """
  kept_rows = []
  for line in text.split("\n"):
    # Three pieces or fewer: treated as a section title and dropped.
    if len(line.split(" ")) <= 3:
      continue
    if line.startswith(("(a)", "Figure")):
      continue
    kept_rows.append(line)
  return "\n".join(kept_rows)

# Keep everything after the first "INTRODUCTION" heading. maxsplit=1 matters:
# without it, a later occurrence of the word would cut the article short,
# because index [1] would then be only the text between the first two hits.
text = article_txt.split("INTRODUCTION", 1)[1]
ctext = clean_text(text)
sentences = nltk.tokenize.sent_tokenize(ctext)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
import hashlib
import json
headers = {
  'Content-Type': 'application/json'
}

def query_raw(text, url="http://bern2.korea.ac.kr/plain", timeout=120):
  """Send `text` to the biomedical entity linking API and return its answer.

  Args:
    text: Plain text to annotate.
    url: Endpoint that receives a JSON body of the form {"text": ...}.
    timeout: Seconds to wait for the server. Without a timeout, `requests`
      waits indefinitely on a stalled connection.

  Returns:
    The decoded JSON response body.

  Raises:
    requests.HTTPError: If the server answers with a 4xx/5xx status.
    requests.Timeout: If no response arrives within `timeout` seconds.
  """
  resp = requests.post(url, data=json.dumps({'text': text}), headers=headers, timeout=timeout)
  print(resp.text)
  # Surface HTTP failures here rather than as a confusing JSON/KeyError later.
  resp.raise_for_status()
  return resp.json()

# The last sentence is invalid, so it is left out of the API calls.
entity_list = [query_raw(sentence) for sentence in sentences[:-1]]

# Reshape each API response into {'entities': [...], 'text', 'text_sha256'}.
parsed_entities = []
for response in entity_list:
  sentence = response['text']
  # The SHA-256 of the sentence text is stored alongside it as 'text_sha256'.
  digest = hashlib.sha256(sentence.encode('utf-8')).hexdigest()
  annotations = response.get('annotations')
  if not annotations:
    # Nothing was recognised: keep the sentence, but without an 'entities' key.
    parsed_entities.append({'text': sentence, 'text_sha256': digest})
    continue
  mentions = []
  for annotation in annotations:
    ids = annotation['id']
    span = annotation['span']
    # The surface form is sliced out of the sentence using the reported span.
    name = sentence[span['begin']:span['end']]
    mentions.append({
      # Prefer the first BERN-prefixed id; otherwise fall back to the surface form.
      'entity_id': next((i for i in ids if i.startswith("BERN")), name),
      'other_ids': [i for i in ids if not i.startswith("BERN")],
      'entity_type': annotation['obj'],
      'entity': name,
    })
  parsed_entities.append({'entities': mentions, 'text': sentence, 'text_sha256': digest})

{"annotations": [{"id": ["NCBITaxon:9606"], "is_neural_normalized": false, "mention": "people", "obj": "species", "prob": 0.9871211647987366, "span": {"begin": 5, "end": 11}}, {"id": ["mesh:D012871"], "is_neural_normalized": false, "mention": "skin diseases", "obj": "disease", "prob": 0.9991875886917114, "span": {"begin": 17, "end": 30}}, {"id": ["mesh:D002908"], "is_neural_normalized": true, "mention": "chronic wounds", "obj": "disease", "prob": 0.9991545081138611, "span": {"begin": 39, "end": 53}}, {"id": ["mesh:D003668"], "is_neural_normalized": true, "mention": "non-healing and diabetic ulcers", "obj": "disease", "prob": 0.9781932234764099, "span": {"begin": 55, "end": 86}}], "text": "Many people with skin diseases such as chronic wounds, non-healing and diabetic ulcers need reconstruction and regeneration of their skin.", "timestamp": "Fri Sep 29 11:04:19 +0000 2023"}
{"annotations": [{"id": ["NCBITaxon:9606"], "is_neural_normalized": false, "mention": "people", "obj": "species", 

In [None]:
!pip install neo4j
from neo4j import GraphDatabase
import pandas as pd

import os

# Connection settings are read from the environment so real credentials never
# have to be written into the notebook. The original placeholder values remain
# as fallbacks when the variables are unset.
host = os.environ.get('NEO4J_URI', 'neo4j+s://dataset.databases.neo4j.io')
user = os.environ.get('NEO4J_USER', 'neo4j')
password = os.environ.get('NEO4J_PASSWORD', 'password')
driver = GraphDatabase.driver(host, auth=(user, password))

def neo4j_query(query, params=None):
    """Execute a Cypher statement and return its records as a DataFrame.

    Runs `query` with `params` in a session opened on the module-level
    `driver`. Every record becomes one row, and the result keys become the
    column names.
    """
    with driver.session() as session:
        cursor = session.run(query, params)
        rows = []
        for record in cursor:
            rows.append(record.values())
        column_names = cursor.keys()
        return pd.DataFrame(rows, columns=column_names)



In [None]:
# The first line of the OCR text is used as the author name; the third and
# fourth lines are joined into the title.
header_lines = article_txt.split("\n")
author = header_lines[0]
title = " ".join(header_lines[2:4])

# MERGE creates the author node, the article node and the WROTE relationship
# between them only when they do not exist yet.
neo4j_query("""
MERGE (a:Author{name:$author})
MERGE (b:Article{title:$title})
MERGE (a)-[:WROTE]->(b)
""", dict(title=title, author=author))

In [None]:
# Store every sentence and the entities it mentions, attached to THIS article.
# Matching on $title keeps the sentences from being linked to every :Article
# node that may already exist in the database (an unscoped MATCH (a:Article)
# fans out over all of them).
# Rows without an 'entities' key contribute a Sentence node but no Entity nodes.
# NOTE: m.count is incremented on every match, so re-running this cell on the
# same data inflates the mention counts.
neo4j_query("""
MATCH (a:Article {title: $title})
UNWIND $data as row
MERGE (s:Sentence{id:row.text_sha256})
SET s.text = row.text
MERGE (a)-[:HAS_SENTENCE]->(s)
WITH s, row.entities as entities
UNWIND entities as entity
MERGE (e:Entity{id:entity.entity_id})
ON CREATE SET e.other_ids = entity.other_ids,
              e.name = entity.entity,
              e.type = entity.entity_type
MERGE (s)-[m:MENTIONS]->(e)
ON CREATE SET m.count = 1
ON MATCH SET m.count = m.count + 1
""", {'data': parsed_entities, 'title': title})

No charts were generated by quickchart


In [None]:
# Text of every sentence that mentions the entity named "autoimmune diseases".
sentences_for_entity_query = """
MATCH (e:Entity)<-[:MENTIONS]-(s:Sentence)
WHERE e.name = "autoimmune diseases"
RETURN s.text as result
"""
neo4j_query(sentences_for_entity_query)

Unnamed: 0,result
0,"These cells, later found to be hematopoietic s..."


In [None]:
# Three most frequent entity pairs mentioned from the same node.
# The id(e1) < id(e2) filter keeps each unordered pair once and drops self-pairs.
cooccurrence_query = """
MATCH (e1:Entity)<-[:MENTIONS]-()-[:MENTIONS]->(e2:Entity)
WHERE id(e1) < id(e2)
RETURN e1.name as entity1, e2.name as entity2, count(*) as cooccurrence
ORDER BY cooccurrence
DESC LIMIT 3
"""
neo4j_query(cooccurrence_query)

Unnamed: 0,entity1,entity2,cooccurrence
0,collagen,fibroblasts,6
1,mesenchymal stem cells,collagen,4
2,mesenchymal stem cells,fibroblasts,4


In [None]:
# Five entities reached most often from an author, following
# WROTE -> HAS_SENTENCE -> MENTIONS.
author_entities_query = """
MATCH (a:Author)-[:WROTE]->()-[:HAS_SENTENCE]->()-[:MENTIONS]->(e:Entity)
RETURN a.name as author, e.name as entity, count(*) as count
ORDER BY count DESC
LIMIT 5
"""
neo4j_query(author_entities_query)

Unnamed: 0,author,entity,count
0,Mohammadreza Ahmadi,collagen,9
1,Mohammadreza Ahmadi,fibroblasts,8
2,Mohammadreza Ahmadi,stem cell,7
3,Mohammadreza Ahmadi,ADSCs,7
4,Mohammadreza Ahmadi,mesenchymal stem cells,7


In [None]:
# !pip install transformers
!pip install scratch

from transformers import AutoTokenizer
from zero_shot_re import RelTaggerModel, RelationExtractor
from zero_shot_re import RelTaggerModel, RelationExtractor

model = RelTaggerModel.from_pretrained("fractalego/fewrel-zero-shot")
tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
relations = ['associated', 'interacts']
extractor = RelationExtractor(model, tokenizer, relations)

Collecting scratch
  Downloading scratch-1.0.0.tar.gz (4.3 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: scratch
  Building wheel for scratch (setup.py) ... [?25l[?25hdone
  Created wheel for scratch: filename=scratch-1.0.0-py2.py3-none-any.whl size=4891 sha256=c3fc5a449ce9e88ed17b0f6cbf04a8a4dc4b37b6ca6e79ffab22145cd11b8c69
  Stored in directory: /root/.cache/pip/wheels/3d/bf/6e/e1ae84c0715e36d2c2c808a0ef17c289866ca1e79c32ad378c
Successfully built scratch
Installing collected packages: scratch
Successfully installed scratch-1.0.0


ModuleNotFoundError: ignored

In [None]:
import itertools

# Minimum score the top-ranked relation needs before it is stored.
REL_THRESHOLD = 0.85

# Candidate sentence where there is more than a single entity present
candidates = [s for s in parsed_entities if (s.get('entities')) and (len(s['entities']) > 1)]
predicted_rels = []
skipped_pairs = []
for c in candidates:
  pairs = itertools.combinations([{'name':x['entity'], 'id':x['entity_id']} for x in c['entities']], 2)
  for head, tail in pairs:
    try:
      ranked_rels = extractor.rank(text=c['text'].replace(",", " "), head=head['name'], tail=tail['name'])
      # Keep only the most probable relation, and only when it clears the threshold.
      if ranked_rels and ranked_rels[0][1] > REL_THRESHOLD:
        predicted_rels.append({'head': head['id'], 'tail': tail['id'], 'type':ranked_rels[0][0], 'source': c['text_sha256']})
    except Exception as exc:
      # Still best effort: one failing pair must not abort the run. Catching
      # Exception rather than using a bare `except` lets Ctrl-C / kernel
      # interrupt stop the loop, and the failure is recorded, not discarded.
      skipped_pairs.append({'head': head['name'], 'tail': tail['name'], 'source': c['text_sha256'], 'error': repr(exc)})

if skipped_pairs:
  print(f"Skipped {len(skipped_pairs)} entity pair(s) that could not be ranked; see `skipped_pairs`.")

# Write each predicted relation as an intermediate :Relation node between the
# two entities, and link the source sentence to that node with MENTIONS.
store_relations_query = """
UNWIND $data as row
MATCH (source:Entity {id: row.head})
MATCH (target:Entity {id: row.tail})
MATCH (text:Sentence {id: row.source})
MERGE (source)-[:REL]->(r:Relation {type: row.type})-[:REL]->(target)
MERGE (text)-[:MENTIONS]->(r)
"""
neo4j_query(store_relations_query, dict(data=predicted_rels))

You can examine the extracted relationships between entities and the source text with the following Cypher query:

In [None]:
# List each stored relation with its two entities and the sentence that mentions it.
list_relations_query = """
MATCH (s:Entity)-[:REL]->(r:Relation)-[:REL]->(t:Entity), (r)<-[:MENTIONS]-(st:Sentence)
RETURN s.name as source_entity, t.name as target_entity, r.type as type, st.text as source_text
"""
neo4j_query(list_relations_query)

Unnamed: 0,source_entity,target_entity,type,source_text
0,skin diseases,chronic wounds,associated,Many people with skin diseases such as chronic...
1,skin diseases,diabetic ulcers,associated,Many people with skin diseases such as chronic...
2,leukemia,autoimmune diseases,associated,"These cells, later found to be hematopoietic s..."
3,ADSCs,DFs proteins,interacts,"Furthermore, the primary sources of extracellu..."


In [None]:
# mesh enrichment
# The entity linking API output shown earlier in this notebook uses lowercase
# prefixes ("mesh:D012871"), and Cypher's CONTAINS is case-sensitive, so the id
# is upper-cased before the check. Entities without a MeSH id are filtered out
# instead of passing a null URL to apoc.load.json.
neo4j_query("""
MATCH (e:Entity)
WHERE e.name = "Epidermolysis bullosa"
WITH e,
    [id in e.other_ids WHERE toUpper(id) CONTAINS "MESH" | split(id,":")[1]][0] as meshId
WHERE meshId IS NOT NULL
CALL apoc.load.json("https://id.nlm.nih.gov/mesh/lookup/details?descriptor=" + meshId) YIELD value
RETURN value
""")

Unnamed: 0,value
0,{'qualifiers': [{'resource': 'http://id.nlm.ni...
