# Identify relations between entities extracted from step 2

### Import required libraries

In [1]:
import pandas as pd
import json
import re
from nltk.tokenize import sent_tokenize
import spacy
from itertools import combinations

### Load data

In [3]:
# Read the raw abstracts file as UTF-8 text and parse it into Python objects.
with open('data/abstracts_raw.json', 'r', encoding='utf-8') as f:
    abstracts = json.loads(f.read())


### Load model

In [4]:
# Load the spaCy pipeline that the entity-extraction step runs on each abstract.
# The "en_ner_bc5cdr_md" model package has to be installed in the environment
# for spacy.load to resolve it by name.
# NOTE(review): presumably the scispaCy BC5CDR NER model — confirm.
nlp = spacy.load("en_ner_bc5cdr_md")

### Clean text

In [5]:
def clean_text(text):
    """Normalise raw text before it is handed to the NLP pipeline.

    Control characters (code points U+0000-U+001F and U+007F-U+009F) are
    replaced with a space, every run of whitespace is collapsed to a single
    space, and leading/trailing whitespace is removed.

    Args:
        text: The string to clean, or ``None`` when the source record has a
            null value for the field.

    Returns:
        The cleaned string; ``''`` when ``text`` is ``None`` or contains
        nothing but whitespace/control characters.
    """
    # A JSON ``null`` arrives here as None (dict.get's default only covers a
    # missing key), so treat it like an empty abstract instead of letting
    # re.sub raise TypeError.
    if text is None:
        return ''
    # Replace rather than delete control characters so that words separated
    # only by e.g. a newline or tab do not get glued together.
    text = re.sub(r'[\x00-\x1f\x7f-\x9f]', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

### Extract entities

In [6]:
# Collect one row per entity mention found by the NER pipeline, keyed by the
# DOI of the abstract it came from.
entity_rows = []

for abstract in abstracts:
    # 'rel_doi' is read with [] (KeyError if absent); a missing 'rel_abs'
    # falls back to an empty string.
    doi = abstract['rel_doi']
    text = clean_text(abstract.get('rel_abs', ''))
    doc = nlp(text)
    entity_rows.extend(
        {'id': doi, 'entity': ent.text, 'entity_type': ent.label_}
        for ent in doc.ents
    )

# Build the flat entity table and write it out; to_csv is called with its
# defaults, so the DataFrame index is written as the first CSV column.
entities_df = pd.DataFrame(entity_rows)
entities_df.to_csv('data/entities_extracted.csv')