In [None]:
pip install biopython

Collecting biopython
  Downloading biopython-1.85-cp312-cp312-win_amd64.whl.metadata (13 kB)
Downloading biopython-1.85-cp312-cp312-win_amd64.whl (2.8 MB)
   ---------------------------------------- 0.0/2.8 MB ? eta -:--:--
   ---------------------------------------- 2.8/2.8 MB 32.8 MB/s eta 0:00:00
Installing collected packages: biopython
Successfully installed biopython-1.85
Note: you may need to restart the kernel to use updated packages.


# Project Idea: Relation Extraction on PubMed Articles About Alzheimer's Disease

## 🎯 Objective:
Extract and categorize relationships from PubMed abstracts or full-texts related to Alzheimer's disease.

Fetch PubMed Abstracts

In [1]:
from Bio import Entrez
from time import sleep

# Set your email — this is required by NCBI
Entrez.email = "your_email@example.com"

def fetch_pubmed_abstracts(query, max_results=100):
    # Search PubMed
    handle = Entrez.esearch(db="pubmed", term=query, retmax=max_results)
    record = Entrez.read(handle)
    handle.close()

    id_list = record["IdList"]
    print(f"Found {len(id_list)} articles for query: '{query}'")

    abstracts = []
    for pmid in id_list:
        try:
            # Fetch article metadata
            fetch_handle = Entrez.efetch(db="pubmed", id=pmid, rettype="abstract", retmode="text")
            abstract_text = fetch_handle.read()
            abstracts.append((pmid, abstract_text))
            fetch_handle.close()
            sleep(0.5)  # Be kind to NCBI servers
        except Exception as e:
            print(f"Error fetching PMID {pmid}: {e}")
            continue

    return abstracts

# Run the script
query = "Alzheimer's disease"
abstracts = fetch_pubmed_abstracts(query, max_results=100)

# Save to file
with open("alzheimers_pubmed_abstracts.txt", "w", encoding="utf-8") as f:
    for pmid, abstract in abstracts:
        f.write(f"PMID: {pmid}\n")
        f.write(abstract + "\n\n")


Found 100 articles for query: 'Alzheimer's disease'


### Step 1: NER (Entity Detection)
Use SciSpacy or BioBERT to find:

- DISEASE (e.g., Alzheimer’s)

In [5]:
import spacy

nlp = spacy.load("en_core_web_sm")

def split_sentences(text):
    doc = nlp(text)
    return [sent.text.strip() for sent in doc.sents]


In [6]:
with open("alzheimers_pubmed_abstracts.txt", "r", encoding="utf-8") as f:
    abstracts = f.read().split("PMID:")

all_sentences = []
for entry in abstracts:
    if entry.strip():
        text = entry.strip().split("\n", 1)[-1]
        sentences = split_sentences(text)
        all_sentences.extend(sentences)

print("Total sentences:", len(all_sentences))


Total sentences: 2534


In [7]:
from scispacy.abbreviation import AbbreviationDetector

nlp = spacy.load("en_ner_bc5cdr_md")
nlp.add_pipe("abbreviation_detector")

def get_disease_entities(text):
    doc = nlp(text)
    return [ent.text for ent in doc.ents if ent.label_ == "DISEASE"]



  deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(  # type: ignore[union-attr]


In [8]:
def normalize_disease_name(name):
    name = name.lower()
    if "alzheimer" in name:
        return "Alzheimer's disease"
    return name


In [9]:

def extract_alz_related_pairs(sentence):
    doc = nlp(sentence)
    diseases = [normalize_disease_name(ent.text) for ent in doc.ents if ent.label_ == "DISEASE"]
    
    # Get unique lowercase disease list (for robust matching)
    unique_diseases = list(set(diseases))
    alz_diseases = [d for d in unique_diseases if "alzheimer" in d.lower()]
    
    pairs = []
    for ad in alz_diseases:
        for d in unique_diseases:
            if d.lower() != ad.lower():
                pairs.append((ad, d))
    return pairs


In [10]:

import csv
output_rows = []

for sent in all_sentences:
    pairs = extract_alz_related_pairs(sent)
    for p1, p2 in pairs:
        output_rows.append({
            "sentence": sent,
            "entity_1": p1,
            "entity_2": p2,
            "relation_label": ""  # ← leave empty for annotation
        })


  global_matches = self.global_matcher(doc)


In [11]:
unique_rows = set()
deduplicated = []

for row in output_rows:
    key = (row["sentence"], row["entity_1"].lower(), row["entity_2"].lower())
    if key not in unique_rows:
        unique_rows.add(key)
        deduplicated.append(row)

print(f"After deduplication: {len(deduplicated)} pairs")


After deduplication: 202 pairs


In [13]:
with open("alz_disease_pairs_for_annotation.csv", "w", newline='', encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=["sentence", "entity_1", "entity_2", "relation_label"])
    writer.writeheader()
    writer.writerows(deduplicated)

print(f"Saved {len(deduplicated)} pairs to 'alz_disease_pairs_for_annotation.csv'")

Saved 202 pairs to 'alz_disease_pairs_for_annotation.csv'
