# Identify relations between entities extracted from step 2

### Import required libraries

In [8]:
import pandas as pd
import json
import re
from nltk.tokenize import sent_tokenize
import spacy
from itertools import combinations
from spacy.matcher import PhraseMatcher
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline


### Load data

In [9]:
with open('data/abstracts_raw.json', 'r', encoding='utf-8') as f:
    abstracts = json.load(f)


### Load model

In [10]:
nlp = spacy.load("en_ner_bc5cdr_md")

### Clean text

In [11]:
def clean_text(text):
    text = re.sub(r'[\x00-\x1f\x7f-\x9f]', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

### Extract entities

In [21]:
print("Extracting entities from abstracts...")
entity_rows = []

for abstract in abstracts:
    doi = abstract['rel_doi']
    text = clean_text(abstract.get('rel_abs', ''))
    
    if not text:
        continue
    
    doc = nlp(text)
    
    for ent in doc.ents:
        # BC5CDR labels: CHEMICAL (drugs), DISEASE (diseases/symptoms)
        entity_type = 'Drug' if ent.label_ == 'CHEMICAL' else 'Disease'
        
        entity_rows.append({
            'id': doi,
            'entity': ent.text,
            'entity_type': entity_type,
            'original_label': ent.label_
        })

entities_df = pd.DataFrame(entity_rows)
entities_df = entities_df.drop_duplicates(subset=['id', 'entity', 'entity_type'])
entities_df.to_csv('data/entities_extracted.csv', index=False)
print(f"✓ Extracted {len(entities_df)} unique entities")
print(f"  - Drugs: {len(entities_df[entities_df['entity_type'] == 'Drug'])}")
print(f"  - Diseases: {len(entities_df[entities_df['entity_type'] == 'Disease'])}")

Extracting entities from abstracts...
✓ Extracted 109 unique entities
  - Drugs: 38
  - Diseases: 71


### Using PhraseMatcher to locate multi-worded entities

In [13]:
def get_entity_tokens(doc, entity_text):
    matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
    pattern = nlp(entity_text)
    matcher.add("ENTITY", [pattern])
    matches = matcher(doc)
    tokens = []
    for _, start, end in matches:
        tokens.extend(doc[start:end])
    return tokens

### Map entities to sentences and generate edges

In [23]:
print("\nExtracting relations between entities...")

# Define relationship patterns
RELATION_PATTERNS = {
    'treats': [
        r'\b(treat|treated|treating|treatment|therapy|therapeutic)\b',
        r'\b(effective|efficacy|administered|prescribed)\b',
        r'\b(cure|cures|curing|remedy)\b'
    ],
    'causes': [
        r'\b(cause|caused|causing|induce|induced|inducing)\b',
        r'\b(lead to|leads to|result in|results in)\b',
        r'\b(associated with|linked to|related to)\b'
    ],
    'associated_with': [
        r'\b(associated|correlation|correlated|relationship)\b',
        r'\b(found in|observed in|detected in)\b',
        r'\b(present in|prevalence)\b'
    ]
}

def infer_relation(sentence, source_type, target_type):
    """Infer relation type based on entity types and sentence patterns"""
    sentence_lower = sentence.lower()
    
    # Drug -> Disease = treats
    if source_type == 'Drug' and target_type == 'Disease':
        for pattern in RELATION_PATTERNS['treats']:
            if re.search(pattern, sentence_lower):
                return 'treats'
        return 'associated_with'
    
    # Disease -> Drug = treated_by (reverse of treats)
    elif source_type == 'Disease' and target_type == 'Drug':
        for pattern in RELATION_PATTERNS['treats']:
            if re.search(pattern, sentence_lower):
                return 'treated_by'
        return 'associated_with'
    
    # Disease -> Disease = causes or associated_with
    elif source_type == 'Disease' and target_type == 'Disease':
        for pattern in RELATION_PATTERNS['causes']:
            if re.search(pattern, sentence_lower):
                return 'causes'
        return 'associated_with'
    
    # Default
    return 'associated_with'

edges = []

for abstract in abstracts:
    doi = abstract['rel_doi']
    text = clean_text(abstract.get('rel_abs', ''))
    
    if not text:
        continue
    
    sentences = sent_tokenize(text)
    abstract_entities = entities_df[entities_df['id'] == doi]
    
    for sent in sentences:
        # Find entities in this sentence
        entities_in_sentence = []
        for _, row in abstract_entities.iterrows():
            if row['entity'] in sent:
                entities_in_sentence.append(row)
        
        # Create relations for entity pairs
        if len(entities_in_sentence) > 1:
            for i, e1 in enumerate(entities_in_sentence):
                for e2 in entities_in_sentence[i+1:]:
                    source = e1['entity']
                    target = e2['entity']
                    source_type = e1['entity_type']
                    target_type = e2['entity_type']
                    
                    # Infer relation
                    relation = infer_relation(sent, source_type, target_type)
                    
                    # Calculate confidence based on sentence proximity
                    source_pos = sent.find(source)
                    target_pos = sent.find(target)
                    distance = abs(source_pos - target_pos)
                    confidence = 1 / (1 + distance / 100)  # Normalized distance score
                    
                    edges.append({
                        'source': source,
                        'target': target,
                        'source_type': source_type,
                        'target_type': target_type,
                        'relation': relation,
                        'confidence': confidence,
                        'sentence': sent[:200],  # Truncate for readability
                        'doi': doi
                    })

edges_df = pd.DataFrame(edges)
edges_df = edges_df.drop_duplicates(subset=['source', 'target', 'relation'])
edges_df.to_csv('data/relations.csv', index=False)
print(f"✓ Generated {len(edges_df)} relations")

# Print relation statistics
print("\nRelation types distribution:")
for rel, count in edges_df['relation'].value_counts().items():
    print(f"  - {rel}: {count}")


Extracting relations between entities...
✓ Generated 80 relations

Relation types distribution:
  - associated_with: 64
  - causes: 10
  - treated_by: 3
  - treats: 3
