In [18]:
import getpass
import hashlib
import itertools
import os

import nltk
import pandas as pd
import pdf2image
import pytesseract
import requests
from neo4j import GraphDatabase

In [19]:
# Download NLTK's 'punkt' package; presumably this is the data that
# nltk.tokenize.sent_tokenize (used by homemadeTokenizer) needs -- TODO confirm.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Briefgarde\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [20]:
def getDoc(url, pagenumber):
    """Download a PDF and return the OCR text of its leading pages.

    Args:
        url: Address of the PDF to download.
        pagenumber: How many pages, counted from the start of the document,
            to keep. Later pages (e.g. the references) are ignored.

    Returns:
        The OCR text of the kept pages joined with single spaces, or an
        empty string when ``pagenumber`` is zero or negative.

    Raises:
        requests.exceptions.RequestException: if the download fails, times
            out, or the server answers with a 4xx/5xx status.
    """
    pdf = requests.get(url, timeout=60)
    # Fail loudly on an HTTP error instead of feeding an error page to the
    # PDF renderer.
    pdf.raise_for_status()

    if pagenumber <= 0:
        return ""

    # Only rasterise the pages that are kept; rendering and OCR-ing pages
    # that are thrown away afterwards is the most expensive part of the run.
    doc = pdf2image.convert_from_bytes(pdf.content, last_page=pagenumber)

    # The slice is a safety net in case more pages come back than requested.
    article = [pytesseract.image_to_string(page_data)
               for page_data in doc[:pagenumber]]
    return " ".join(article)


In [21]:
def clean_text(text, threshold):
    """Remove section titles and figure descriptions from text.

    Everything up to and including the first occurrence of ``threshold`` is
    discarded; later occurrences of the marker are ordinary text and are
    kept. If the marker is empty or does not occur, the whole text is used.

    Args:
        text: Raw article text, one layout row per line.
        threshold: Marker string (e.g. a section heading) after which the
            useful content starts.

    Returns:
        The remaining rows joined with newlines, keeping only rows that
        split into more than three space-separated pieces and that do not
        start with "(a)" or "Figure".
    """
    if threshold and threshold in text:
        # maxsplit=1: cut at the first marker only, so a marker repeated
        # later in the article does not truncate the body.
        ctext = text.split(threshold, 1)[1]
    else:
        ctext = text

    kept_rows = [
        row for row in ctext.split("\n")
        if len(row.split(" ")) > 3
        and not row.startswith("(a)")
        and not row.startswith("Figure")
    ]
    return "\n".join(kept_rows)

In [22]:
def homemadeTokenizer(doc):
    """Split ``doc`` into sentences with NLTK's ``sent_tokenize`` and return them."""
    sentences = nltk.tokenize.sent_tokenize(doc)
    return sentences

In [23]:
def query_plain2(text, url="http://bern2.korea.ac.kr/plain", timeout=60):
    """Biomedical entity linking API.

    Posts ``text`` as JSON (``{'text': ...}``) to ``url`` and returns the
    decoded JSON answer.

    Args:
        text: Text to annotate; converted with ``str()`` before sending.
        url: Endpoint that receives the POST request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single stalled request would block the run forever.

    Returns:
        The parsed JSON response, or ``None`` when the request fails, times
        out, returns a 4xx/5xx status, or the body is not valid JSON. The
        error is printed in those cases.
    """
    try:
        response = requests.post(url, json={'text': str(text)}, timeout=timeout)
        response.raise_for_status()  # Raise an error for bad responses (4xx and 5xx)
        return response.json()
    except requests.exceptions.RequestException as e:
        print(f"Error making request: {e}")
        return None
    except ValueError as e:
        # Depending on the installed requests version, a body that is not
        # valid JSON may surface as a plain ValueError rather than a
        # RequestException; treat it like any other failed call.
        print(f"Error decoding response: {e}")
        return None

In [24]:
def getData(sentences):
    """Send every sentence except the last one to ``query_plain2``.

    Prints the number of sentences, a progress line per call and a final
    confirmation.

    Args:
        sentences: Sequence of sentence strings.

    Returns:
        A list with one ``query_plain2`` result per processed sentence, in
        input order.
    """
    print(len(sentences))

    responses = []
    # The trailing sentence is treated as invalid and left out.
    for position, sentence in enumerate(sentences[:-1]):
        print(f"doing sentence : {position}")
        responses.append(query_plain2(sentence))

    print("all calls done")
    return responses

In [25]:
def parseData(entity_list):
    """Convert raw entity-linking responses into per-sentence records.

    ``None`` responses are dropped. A response without annotations becomes
    ``{'text', 'text_sha256'}``; one with annotations additionally gets an
    ``'entities'`` list whose items hold ``entity_id``, ``other_ids``,
    ``entity_type`` and ``entity``.

    Args:
        entity_list: Responses as returned by ``query_plain2`` (dicts with
            ``'text'`` and optionally ``'annotations'``), possibly ``None``.

    Returns:
        List of record dicts, one per non-``None`` response, in input order.
    """
    def _sha256_of(sentence):
        # Hex digest of the UTF-8 encoded sentence, stored as text_sha256.
        return hashlib.sha256(sentence.encode('utf-8')).hexdigest()

    records = []

    for response in entity_list:
        if response is None:
            continue

        annotations = response.get('annotations')

        # Sentence without any recognised entity: text and hash only.
        if not annotations:
            records.append({
                'text': response['text'],
                'text_sha256': _sha256_of(response['text'])
            })
            continue

        mentions = []
        for annotation in annotations:
            foreign_ids = [ident for ident in annotation['id']
                           if not ident.startswith("BERN")]
            kind = annotation['obj']
            span = annotation['span']
            surface = response['text'][span['begin']:span['end']]

            # Use the first BERN-prefixed id; fall back to the mention text
            # itself when the annotation carries none.
            primary_id = next(
                (ident for ident in annotation['id'] if ident.startswith("BERN")),
                surface,
            )

            mentions.append({
                'entity_id': primary_id,
                'other_ids': foreign_ids,
                'entity_type': kind,
                'entity': surface
            })

        records.append({
            'entities': mentions,
            'text': response['text'],
            'text_sha256': _sha256_of(response['text'])
        })

    return records

In [26]:
# Neo4j connection settings. The password is deliberately NOT stored in the
# notebook: it is read from the NEO4J_PASSWORD environment variable, with an
# interactive prompt as fallback. The password that used to be committed here
# should be considered leaked and rotated.
# NEO4J_URI and NEO4J_USER may override the default host and user.
host = os.environ.get('NEO4J_URI', 'bolt://52.91.239.25:7687')
user = os.environ.get('NEO4J_USER', 'neo4j')
password = os.environ.get('NEO4J_PASSWORD') or getpass.getpass('Neo4j password: ')
driver = GraphDatabase.driver(host, auth=(user, password))
def neo4j_query(query, params=None):
    """Run a Cypher query through the module-level ``driver``.

    Args:
        query: Cypher statement.
        params: Optional dict of query parameters.

    Returns:
        A pandas DataFrame with one row per result record and the result
        keys as column names.
    """
    with driver.session() as session:
        result = session.run(query, params)
        # Consume the records first, then read the keys.
        rows = [record.values() for record in result]
        column_names = result.keys()
        return pd.DataFrame(rows, columns=column_names)

In [27]:
def getParsedEntity(urlToPDF, numInterestingPage, threshold):
    """Run the extraction pipeline for one PDF and return the parsed entities.

    Steps, each followed by a progress message: ``getDoc`` -> ``clean_text``
    -> ``homemadeTokenizer`` -> ``getData`` -> ``parseData``. The tokenized
    sentences are printed as well.

    Args:
        urlToPDF: Address of the PDF, forwarded to ``getDoc``.
        numInterestingPage: Page limit, forwarded to ``getDoc``.
        threshold: Marker string, forwarded to ``clean_text``.

    Returns:
        Whatever ``parseData`` returns for the collected responses.
    """
    raw_text = getDoc(urlToPDF, numInterestingPage)
    print("GOt article text")

    cleaned_text = clean_text(raw_text, threshold)
    print("get clean text")

    sentence_list = homemadeTokenizer(cleaned_text)
    print("got Sentences : ")
    print(sentence_list)

    responses = getData(sentence_list)
    print("Got obtained data from BERN2")

    parsed = parseData(responses)
    print("Parsed Everything")

    # From here on the data is complete; pushing it to the graph is the
    # caller's next step.
    return parsed
    

In [28]:
def pushtoGraph(parsedEntity, authors, title):
    """Write an article, its authors, sentences and entity mentions to Neo4j.

    Two queries are sent through ``neo4j_query``:

    1. MERGE the ``Article`` (by title), MERGE one ``Author`` per name and
       link each author to the article with ``WROTE``.
    2. For every record in ``parsedEntity``: MERGE a ``Sentence`` keyed by
       ``text_sha256``, attach it to the article with ``HAS_SENTENCE``,
       MERGE one ``Entity`` per listed entity (keyed by ``entity_id``) and
       MERGE a ``MENTIONS`` relationship whose ``count`` starts at 1 and is
       incremented each time the relationship is matched again.

    Args:
        parsedEntity: Records as produced by ``parseData``.
        authors: List of author names.
        title: Article title.
    """
    article_params = {'title': title, 'authors': authors}
    article_query = """
    MERGE (b:Article {title: $title})
    WITH b
    UNWIND $authors AS authorName
    MERGE (a:Author {name: authorName})
    MERGE (a)-[:WROTE]->(b)
    """
    neo4j_query(article_query, article_params)
    print("Created the base article and linked the authors")

    sentence_params = {'data': parsedEntity, 'title': title}
    # NOTE(review): the OPTIONAL MATCH ... WHERE s IS NULL clause looks like a
    # no-op (its result is discarded by the following WITH a) -- confirm
    # before removing it.
    sentence_query = """
    MATCH (a:Article {title: $title})
    WITH a
    OPTIONAL MATCH (a)-[:HAS_SENTENCE]->(s:Sentence)
    WHERE s IS NULL
    WITH a
    UNWIND $data as row
    MERGE (s:Sentence {id: row.text_sha256})
    SET s.text = row.text
    MERGE (a)-[:HAS_SENTENCE]->(s)
    WITH s, row.entities as entities
    UNWIND entities as entity
    MERGE (e:Entity {id: entity.entity_id})
    ON CREATE SET e.other_ids = entity.other_ids,
                  e.name = entity.entity,
                  e.type = entity.entity_type
    MERGE (s)-[m:MENTIONS]->(e)
    ON CREATE SET m.count = 1
    ON MATCH SET m.count = m.count + 1
    """
    neo4j_query(sentence_query, sentence_params)
    print("Pushed all data to graph !")

For some reason, the cell below doesn't work unless the imports are placed inside it.

In [29]:
from zero_shot_re import RelTaggerModel, RelationExtractor
from transformers import AutoTokenizer
# Load the pretrained relation-tagging model published under the name
# "fractalego/fewrel-zero-shot".
model = RelTaggerModel.from_pretrained("fractalego/fewrel-zero-shot")
print("model loaded")
# Tokenizer loaded from the named BERT checkpoint; it is handed to the
# extractor together with the model.
tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
print("token loaded")
# Relation labels given to the extractor.
# NOTE(review): the name `relations` is rebound later in the notebook by the
# pipeline cell (relations = getRelations(...)).
relations = ['associated', 'interacts']
# Module-level extractor used by getRelations via extractor.rank(...).
extractor = RelationExtractor(model, tokenizer, relations)

model loaded
token loaded


In [34]:
def getRelations(parsed_entities, threshold=0.85):
    """Predict relations between entities that co-occur in a sentence.

    For every record with at least two entities, each unordered pair of
    entities is ranked with the module-level ``extractor``; the top-ranked
    relation is kept when its score is strictly greater than ``threshold``.

    Args:
        parsed_entities: Records as produced by ``parseData``. Records
            without an ``'entities'`` list, or with fewer than two entities,
            are skipped.
        threshold: Minimum score (exclusive) the best relation must exceed
            to be kept. Defaults to the previously hard-coded 0.85.

    Returns:
        List of dicts with ``head`` and ``tail`` (entity ids), ``type``
        (relation label) and ``source`` (the sentence's ``text_sha256``).
    """
    predicted_rels = []

    for c in parsed_entities:
        entities = c.get('entities')
        if not entities or len(entities) < 2:
            continue

        mentions = [{'name': x['entity'], 'id': x['entity_id']} for x in entities]

        for head, tail in itertools.combinations(mentions, 2):
            try:
                # Commas are stripped from the sentence before ranking.
                ranked_rels = extractor.rank(text=c['text'].replace(",", ""),
                                             head=head['name'],
                                             tail=tail['name'])

                # Nothing ranked for this pair: nothing to record.
                if not ranked_rels:
                    continue

                best_type, best_score = ranked_rels[0][0], ranked_rels[0][1]
                if best_score > threshold:
                    predicted_rels.append({
                        'head': head['id'],
                        'tail': tail['id'],
                        'type': best_type,
                        'source': c['text_sha256']
                    })
            except Exception as e:
                # Best-effort: one failing pair must not abort the run, but
                # the failure is reported instead of being silently dropped.
                print(f"Skipping pair ({head['name']!r}, {tail['name']!r}): {e}")
    print("got predicted rels")
    return predicted_rels

In [33]:
def pushRelationToGraph(predictedRels):
    """Store predicted relations in Neo4j.

    For each row, the head and tail ``Entity`` nodes and the source
    ``Sentence`` are matched by id; a ``Relation`` node of the row's type is
    MERGEd between head and tail via ``REL`` relationships, and the sentence
    is linked to it with ``MENTIONS``.

    Args:
        predictedRels: List of dicts with ``head``, ``tail``, ``type`` and
            ``source`` keys, as returned by ``getRelations``.
    """
    query_params = {'data': predictedRels}
    relation_query = """
    UNWIND $data as row
    MATCH (source:Entity {id: row.head})
    MATCH (target:Entity {id: row.tail})
    MATCH (text:Sentence {id: row.source})
    MERGE (source)-[:REL]->(r:Relation {type: row.type})-[:REL]-
    >(target)
    MERGE (text)-[:MENTIONS]->(r)
    """
    neo4j_query(relation_query, query_params)
    print("Pushed rels to graph")

In [32]:
# Metadata of the paper being ingested; passed to pushtoGraph, which stores
# them as Author and Article nodes.
authors =["Yan Cai", "Linlin Wang", "Ye Wang", "Gerard de Melo", "Ya Zhang", "Yanfeng Wang", "Liang He"]
title = "MedBench: A Large-Scale Chinese Benchmark for Evaluating Medical Large Language Models"
# Marker handed to clean_text, which splits the OCR text on it.
threshold = "Introduction"
# Number of leading PDF pages whose text getDoc keeps.
numInterestingPage = 7
url = "https://arxiv.org/pdf/2312.12806.pdf"

# Full pipeline: OCR + entity linking, write sentences/entities to the graph,
# then predict relations and write those as well.
parsedstuff = getParsedEntity(url, numInterestingPage, threshold)
print("all stuff parsed")
pushtoGraph(parsedstuff, authors, title)
# NOTE(review): this rebinds the global `relations`, which earlier held the
# label list passed to RelationExtractor.
relations = getRelations(parsedstuff)
pushRelationToGraph(relations)

GOt article text
get clean text
got Sentences : 
['The advent of large language models (LLMs) has demon-\nstrated substantial potential for diverse real-world applica-\ntions, thanks to their remarkable language understanding ca-\npabilities.', 'In the medical domain, a notable number of Chi-\nnese medical LLMs have successively emerged, including\nHuaTuo (Wang et al.', '2023), ChatMed (Zhu and Wang 2023),\nBianQue (Chen et al.', '2023), Sunsimiao (Xin Yan 2023), and\nDoctorGLM (Xiong et al.', '2023), to better assist doctors in\ndiverse tasks ranging from clinical diagnosis to disease pre-\nvention (Singhal et al.', '2023a).', 'This underscores an urgent\nneed for a standardized medical benchmark, capable of of-\nfering reliable and authoritative evaluations for such LLMs.', '2 (Chinese Medical Standardization —_ Qualification\na Licensing Exam) — Training Exam Exam\nse Medical Schoo!', 'Residency Doctor in-charge Real-world Case\n3 NMPQE (Medical Licensing Exam)\n.', 'Medical School 

doing sentence : 1
doing sentence : 2
doing sentence : 3
doing sentence : 4
doing sentence : 5
doing sentence : 6
doing sentence : 7
doing sentence : 8
doing sentence : 9
doing sentence : 10
doing sentence : 11
doing sentence : 12
doing sentence : 13
doing sentence : 14
doing sentence : 15
doing sentence : 16
doing sentence : 17
doing sentence : 18
doing sentence : 19
doing sentence : 20
doing sentence : 21
doing sentence : 22
doing sentence : 23
doing sentence : 24
doing sentence : 25
doing sentence : 26
doing sentence : 27
doing sentence : 28
doing sentence : 29
doing sentence : 30
doing sentence : 31
doing sentence : 32
doing sentence : 33
doing sentence : 34
doing sentence : 35
doing sentence : 36
doing sentence : 37
doing sentence : 38
doing sentence : 39
doing sentence : 40
doing sentence : 41
doing sentence : 42
doing sentence : 43
doing sentence : 44
doing sentence : 45
doing sentence : 46
doing sentence : 47
doing sentence : 48
doing sentence : 49
doing sentence : 50
doing sen

In [55]:
from libchebipy._chebi_entity import ChebiEntity

In [76]:
# Fetch every Entity that carries at least one CHEBI identifier, together
# with the part after the colon of its first CHEBI id.
result = neo4j_query("""
MATCH (e:Entity)
WHERE ANY(id IN e.other_ids WHERE id CONTAINS "CHEBI")
WITH e,
  [id in e.other_ids WHERE id CONTAINS "CHEBI" | split(id, ":")[1]][0] as chebiId
RETURN e.name, chebiId
""")

# Build one ChebiEntity per returned row; rows whose id cannot be turned
# into a ChebiEntity are skipped (best-effort).
chebiEntity_list = []
for index, row in result.iterrows():
    chebiID = row['chebiId']
    try:
        entity = ChebiEntity(chebiID)
    except Exception:
        continue
    else:
        chebiEntity_list.append(entity)

# Keep only the entities that actually have a definition.
chebEntityFiltered = []
for entity in chebiEntity_list:
    if entity.get_definition() is not None:
        chebEntityFiltered.append(entity)

# Show the definitions that were found.
for entity in chebEntityFiltered:
    print(entity.get_definition())
    
    

A homopolymer, composed of poly(caprolactone) macromolecules.
A homopolymer, composed of poly(caprolactone) macromolecules.
A polymer composed of repeating hydroxyacetic acid units.
A polymer composed of repeating (S)-2-hydroxypropanoyl units.
A macromolecule composed of repeating 2-hydroxypropanoyl units.
A dipeptide formed from L-tyrosine and L-alanine residues.
A dipeptide formed from L-tyrosine and L-glutamic acid residues.
A homopolymer, composed of omega-hydroxypoly(furan-2,5-diylmethylene) macromolecules.
A diamino acid that is caproic (hexanoic) acid bearing two amino substituents at positions 2 and 6.


In [87]:
# One {'chebiId', 'chebiDefinition'} record per entity in chebEntityFiltered,
# built from the entity's get_id() and get_definition().
entities_for_update = [{'chebiId': entity.get_id(), 'chebiDefinition': entity.get_definition()} for entity in chebEntityFiltered]
print(entities_for_update)
# Pass the list of dictionaries as the $entities query parameter: every
# Entity node whose other_ids contains a record's chebiId gets its
# chebiDefinition property set to that record's definition.
neo4j_query("""
UNWIND $entities as entity
MATCH (e:Entity)
WHERE ANY(id IN e.other_ids WHERE id = entity.chebiId)
SET e.chebiDefinition = entity.chebiDefinition
""", {'entities': entities_for_update})

[{'chebiId': 'CHEBI:60736', 'chebiDefinition': 'A homopolymer, composed of poly(caprolactone) macromolecules.'}, {'chebiId': 'CHEBI:60736', 'chebiDefinition': 'A homopolymer, composed of poly(caprolactone) macromolecules.'}, {'chebiId': 'CHEBI:53492', 'chebiDefinition': 'A polymer composed of repeating hydroxyacetic acid units.'}, {'chebiId': 'CHEBI:53408', 'chebiDefinition': 'A polymer composed of repeating (S)-2-hydroxypropanoyl units.'}, {'chebiId': 'CHEBI:53407', 'chebiDefinition': 'A macromolecule composed of repeating 2-hydroxypropanoyl units.'}, {'chebiId': 'CHEBI:74879', 'chebiDefinition': 'A dipeptide formed from L-tyrosine and L-alanine residues.'}, {'chebiId': 'CHEBI:74883', 'chebiDefinition': 'A dipeptide formed from L-tyrosine and L-glutamic acid residues.'}, {'chebiId': 'CHEBI:60594', 'chebiDefinition': 'A homopolymer, composed of omega-hydroxypoly(furan-2,5-diylmethylene) macromolecules.'}, {'chebiId': 'CHEBI:25094', 'chebiDefinition': 'A diamino acid that is caproic (he