# Identify relations between entities extracted from step 2

### Import required libraries

In [1]:
import pandas as pd
import json
import re
from nltk.tokenize import sent_tokenize
import spacy
from itertools import combinations

### Load data

In [2]:
with open('data/abstracts_raw.json', 'r', encoding='utf-8') as f:
    abstracts = json.load(f)


### Load model

In [3]:
nlp = spacy.load("en_ner_bc5cdr_md")

  deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(  # type: ignore[union-attr]


### Clean text

In [4]:
def clean_text(text):
    text = re.sub(r'[\x00-\x1f\x7f-\x9f]', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

### Extract entities

In [6]:
entity_rows = []

for abstract in abstracts:
    doi = abstract['rel_doi']
    text = clean_text(abstract.get('rel_abs', ''))
    doc = nlp(text)
    for ent in doc.ents:
        entity_rows.append({
            'id': doi,
            'entity': ent.text,
            'entity_type': ent.label_
        })

entities_df = pd.DataFrame(entity_rows)
entities_df.to_csv('data/entities_extracted.csv', index=False)
print(f"Extracted {len(entities_df)} entities.")

Extracted 163 entities.


### Map entities to sentences and generate edges

In [7]:
edges = []

for abstract in abstracts:
    doi = abstract['rel_doi']
    text = clean_text(abstract.get('rel_abs', ''))
    sentences = sent_tokenize(text)
    abstract_entities = entities_df[entities_df['id'] == doi]['entity'].tolist()

    for sent in sentences:
        entities_in_sentence = [e for e in abstract_entities if e in sent]
        if len(entities_in_sentence) > 1:
            for source, target in combinations(entities_in_sentence, 2):
                edges.append({
                    'source': source,
                    'target': target,
                    'relation': 'unknown',
                    'sentence': sent,
                    'doi': doi
                })

edges_df = pd.DataFrame(edges)
edges_df.to_csv('data/relations.csv', index=False)
print(f"Generated {len(edges_df)} candidate edges.")

Generated 1091 candidate edges.
