# Identify relations between entities extracted from step 2

### Import required libraries

In [11]:
import pandas as pd
import json
import re
from nltk.tokenize import sent_tokenize
import spacy
from itertools import combinations
from spacy.matcher import PhraseMatcher

### Load data

In [2]:
with open('data/abstracts_raw.json', 'r', encoding='utf-8') as f:
    abstracts = json.load(f)


### Load model

In [3]:
nlp = spacy.load("en_ner_bc5cdr_md")

  deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(  # type: ignore[union-attr]


### Clean text

In [4]:
def clean_text(text):
    text = re.sub(r'[\x00-\x1f\x7f-\x9f]', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

### Extract entities

In [6]:
entity_rows = []

for abstract in abstracts:
    doi = abstract['rel_doi']
    text = clean_text(abstract.get('rel_abs', ''))
    doc = nlp(text)
    for ent in doc.ents:
        entity_rows.append({
            'id': doi,
            'entity': ent.text,
            'entity_type': ent.label_
        })

entities_df = pd.DataFrame(entity_rows)
entities_df.to_csv('data/entities_extracted.csv', index=False)
print(f"Extracted {len(entities_df)} entities.")

Extracted 163 entities.


### Using PhraseMatcher to locate multi-worded entities

In [13]:
def get_entity_tokens(doc, entity_text):
    matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
    pattern = nlp(entity_text)
    matcher.add("ENTITY", [pattern])
    matches = matcher(doc)
    tokens = []
    for _, start, end in matches:
        tokens.extend(doc[start:end])
    return tokens

### Extract relations

In [14]:
def extract_relation(sentence, entity1, entity2):
    doc = nlp(sentence)

    ent1_tokens = get_entity_tokens(doc, entity1)
    ent2_tokens = get_entity_tokens(doc, entity2)

    if not ent1_tokens or not ent2_tokens:
        return 'unknown'
    
    ent1_token = ent1_tokens[0]
    ent2_token = ent2_tokens[0]

    ancestors_ent1 = list(ent1_token.ancestors)
    ancestors_ent2 = list(ent2_token.ancestors)

    for anc in ancestors_ent1:
        if anc in ancestors_ent2 and anc.pos_ == 'VERB':
            return anc.lemma_
        
    for token in doc:
        if token.pos_ == 'VERB':
            return token.lemma_
        
    return 'unknown'

### Map entities to sentences and generate edges

In [16]:
edges = []

for abstract in abstracts:
    doi = abstract['rel_doi']
    text = clean_text(abstract.get('rel_abs', ''))
    sentences = sent_tokenize(text)
    abstract_entities = entities_df[entities_df['id'] == doi]['entity'].tolist()

    for sent in sentences:
        entities_in_sentence = [e for e in abstract_entities if e in sent]

        if len(entities_in_sentence) > 1:
            for e1, e2 in combinations(entities_in_sentence, 2):
                doc_sent = nlp(sent)
                e1_tokens = get_entity_tokens(doc_sent, e1)
                e2_tokens = get_entity_tokens(doc_sent, e2)

                if e1_tokens and e2_tokens:
                    source, target = (e1, e2) if e1_tokens[0].i < e2_tokens[0].i else (e2, e1)
                    rel = extract_relation(sent, source, target)
                    edges.append({
                        'source': source,
                        'target': target,
                        'relation': rel,
                        'sentence': sent,
                        'doi': doi
                    })

edges_df = pd.DataFrame(edges)
edges_df.to_csv('data/relations.csv', index=False)
print(f"Generated {len(edges_df)} candidate edges.")

Generated 1028 candidate edges.
