# Identify relations between entities extracted from step 2

### Import required libraries

In [8]:
import pandas as pd
import json
import re
from nltk.tokenize import sent_tokenize
import spacy
from itertools import combinations
from spacy.matcher import PhraseMatcher
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline


### Load data

In [9]:
# Parse the raw abstracts JSON into `abstracts`; later cells iterate over it
# and read the 'rel_doi' and 'rel_abs' keys of each record.
with open('data/abstracts_raw.json', mode='r', encoding='utf-8') as raw_file:
    abstracts = json.loads(raw_file.read())


### Load model

In [10]:
# Load the spaCy pipeline that later cells use both for entity recognition
# (doc.ents) and for tokenising individual sentences.
# NOTE(review): "en_ner_bc5cdr_md" is presumably the scispaCy BC5CDR
# (chemical/disease) NER model and must be installed separately — confirm.
nlp = spacy.load("en_ner_bc5cdr_md")

### Clean text

In [11]:
def clean_text(text):
    """Normalise raw abstract text for downstream NLP.

    Control characters (C0 and C1 ranges) are replaced by spaces, runs of
    whitespace are collapsed to a single space, and the result is stripped.

    Args:
        text: The raw text. ``None`` is accepted and treated as empty text,
            because a JSON ``null`` abstract reaches this function as ``None``
            even when the caller uses ``.get(key, '')``.

    Returns:
        The cleaned string ('' for empty or missing input).
    """
    if text is None:
        return ''
    text = re.sub(r'[\x00-\x1f\x7f-\x9f]', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

### Extract entities

In [12]:
# Run NER over every abstract and collect one row per recognised entity mention.
ENTITY_COLUMNS = ['id', 'entity', 'entity_type']

entity_rows = []

for abstract in abstracts:
    doi = abstract['rel_doi']
    # `.get('rel_abs', '')` still yields None for a JSON null, so coerce explicitly.
    text = clean_text(abstract.get('rel_abs') or '')
    doc = nlp(text)
    entity_rows.extend(
        {'id': doi, 'entity': ent.text, 'entity_type': ent.label_}
        for ent in doc.ents
    )

# Pinning the columns keeps the frame well-formed (and drop_duplicates valid)
# even when no entity was extracted at all.
entities_df = pd.DataFrame(entity_rows, columns=ENTITY_COLUMNS)
entities_df = entities_df.drop_duplicates(subset=ENTITY_COLUMNS)
entities_df.to_csv('data/entities_extracted.csv', index=False)
print(f"Extracted {len(entities_df)} entities.")

Extracted 109 entities.


### Using PhraseMatcher to locate multi-word entities

In [13]:
def get_entity_tokens(doc, entity_text):
    """Return the tokens of ``doc`` covered by matches of ``entity_text``.

    Matching is done with a spaCy ``PhraseMatcher`` on the lower-cased token
    text, so it is case-insensitive and works for multi-word entities.

    Args:
        doc: A spaCy ``Doc`` (here: one parsed sentence).
        entity_text: The entity surface string to look for.

    Returns:
        A flat list of tokens from every match, in the order the matcher
        reports them; an empty list when there is no match.
    """
    # Only tokenisation is needed for a LOWER-based pattern, so skip the rest
    # of the pipeline (tagger/parser/NER) by using make_doc.
    pattern = nlp.make_doc(entity_text)
    if len(pattern) == 0:
        return []

    matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
    matcher.add("ENTITY", [pattern])

    tokens = []
    for _match_id, start, end in matcher(doc):
        tokens.extend(doc[start:end])
    return tokens

### Map entities to sentences and generate edges

In [14]:
# Relation classifier.
# NOTE(review): this checkpoint is loaded with a freshly initialised
# classification head (the cell output warns that 'classifier.bias' and
# 'classifier.weight' are "newly initialized"), so the predicted labels and
# scores are effectively random until a checkpoint fine-tuned for relation
# extraction is substituted here.
re_model_name = "michiyasunaga/BioLinkBERT-base"
re_tokenizer = AutoTokenizer.from_pretrained(re_model_name)
re_model = AutoModelForSequenceClassification.from_pretrained(re_model_name)

re_pipe = pipeline(
    "text-classification",
    model=re_model,
    tokenizer=re_tokenizer,
    return_all_scores=True
)

EDGE_COLUMNS = ['source', 'target', 'relation', 'confidence', 'sentence', 'doi', 'distance_score']


def _first_mention(doc_sent, entity):
    """Return (token_index, start_char, end_char) of the first match of `entity`, or None."""
    tokens = get_entity_tokens(doc_sent, entity)
    if not tokens:
        return None
    first_token = tokens[0]
    # get_entity_tokens flattens all matches into one list, so the length of
    # the first match is recovered from the tokenised entity text.
    n_tokens = max(len(nlp.make_doc(entity)), 1)
    span = doc_sent[first_token.i:first_token.i + n_tokens]
    return first_token.i, span.start_char, span.end_char


def _mark_entity_pair(sentence, first_range, second_range):
    """Wrap two non-overlapping, ordered (start, end) character ranges in [E1]/[E2] markers."""
    (s1, e1), (s2, e2) = first_range, second_range
    return (
        sentence[:s1]
        + f"[E1] {sentence[s1:e1]} [/E1]"
        + sentence[e1:s2]
        + f"[E2] {sentence[s2:e2]} [/E2]"
        + sentence[e2:]
    )


edges = []

for abstract in abstracts:
    doi = abstract['rel_doi']
    # `.get('rel_abs', '')` still yields None for a JSON null, so coerce explicitly.
    text = clean_text(abstract.get('rel_abs') or '')
    sentences = sent_tokenize(text)
    # The same surface text can be listed once per entity_type; de-duplicate
    # (keeping order) so an entity is never paired with itself.
    abstract_entities = list(dict.fromkeys(
        entities_df.loc[entities_df['id'] == doi, 'entity']
    ))

    for sent in sentences:
        entities_in_sentence = [e for e in abstract_entities if e in sent]
        if len(entities_in_sentence) < 2:
            continue

        # Parse the sentence and locate each entity once, not once per pair.
        doc_sent = nlp(sent)
        mentions = {}
        for entity in entities_in_sentence:
            mention = _first_mention(doc_sent, entity)
            if mention is not None:
                mentions[entity] = mention

        for e1, e2 in combinations(mentions, 2):
            # The entity whose first mention comes earlier is the source.
            source, target = (e1, e2) if mentions[e1][0] < mentions[e2][0] else (e2, e1)
            src_token, src_start, src_end = mentions[source]
            tgt_token, tgt_start, tgt_end = mentions[target]

            # Nested/overlapping mentions (one entity contained in the other)
            # cannot be marked unambiguously, so skip the pair.
            if src_end > tgt_start:
                continue

            # Mark exactly the located mentions by character offset; chained
            # str.replace would tag every occurrence and nest the markers.
            marked_sentence = _mark_entity_pair(
                sent, (src_start, src_end), (tgt_start, tgt_end)
            )

            pred = re_pipe(marked_sentence)[0]
            best = max(pred, key=lambda p: p['score'])

            token_distance = abs(src_token - tgt_token)
            distance_score = 1 / (1 + token_distance)

            edges.append({
                'source': source,
                'target': target,
                'relation': best['label'],
                'confidence': best['score'],
                'sentence': sent,
                'doi': doi,
                'distance_score': distance_score
            })

# Pinning the columns keeps drop_duplicates valid when no edge was produced.
edges_df = pd.DataFrame(edges, columns=EDGE_COLUMNS)
edges_df = edges_df.drop_duplicates(subset=['source', 'target', 'relation', 'doi'])
edges_df.to_csv('data/relations.csv', index=False)
print(f"Generated {len(edges_df)} model-predicted relations.")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at michiyasunaga/BioLinkBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cpu


Generated 70 model-predicted relations.
