In [None]:
pip install biopython

Collecting biopython
  Downloading biopython-1.85-cp312-cp312-win_amd64.whl.metadata (13 kB)
Downloading biopython-1.85-cp312-cp312-win_amd64.whl (2.8 MB)
   ---------------------------------------- 0.0/2.8 MB ? eta -:--:--
   ---------------------------------------- 2.8/2.8 MB 32.8 MB/s eta 0:00:00
Installing collected packages: biopython
Successfully installed biopython-1.85
Note: you may need to restart the kernel to use updated packages.


# Project Idea: Relation Extraction on PubMed Articles About Alzheimer's Disease

## 🎯 Objective:
Extract and categorize relationships from PubMed abstracts or full-texts related to Alzheimer's disease.

Fetch PubMed Abstracts

In [1]:
from Bio import Entrez
from time import sleep

# Set your email — this is required by NCBI
Entrez.email = "your_email@example.com"

def fetch_pubmed_abstracts(query, max_results=100):
    # Search PubMed
    handle = Entrez.esearch(db="pubmed", term=query, retmax=max_results)
    record = Entrez.read(handle)
    handle.close()

    id_list = record["IdList"]
    print(f"Found {len(id_list)} articles for query: '{query}'")

    abstracts = []
    for pmid in id_list:
        try:
            # Fetch article metadata
            fetch_handle = Entrez.efetch(db="pubmed", id=pmid, rettype="abstract", retmode="text")
            abstract_text = fetch_handle.read()
            abstracts.append((pmid, abstract_text))
            fetch_handle.close()
            sleep(0.5)  # Be kind to NCBI servers
        except Exception as e:
            print(f"Error fetching PMID {pmid}: {e}")
            continue

    return abstracts

# Run the script
query = "Alzheimer's disease"
abstracts = fetch_pubmed_abstracts(query, max_results=100)

# Save to file
with open("alzheimers_pubmed_abstracts.txt", "w", encoding="utf-8") as f:
    for pmid, abstract in abstracts:
        f.write(f"PMID: {pmid}\n")
        f.write(abstract + "\n\n")


Found 100 articles for query: 'Alzheimer's disease'


### Step 1: NER (Entity Detection)
Use SciSpacy or BioBERT to find:

- DISEASE (e.g., Alzheimer’s)

In [4]:
import spacy

nlp = spacy.load("en_core_web_sm")

def split_sentences(text):
    doc = nlp(text)
    return [sent.text.strip() for sent in doc.sents]


In [6]:
with open("alzheimers_pubmed_abstracts.txt", "r", encoding="utf-8") as f:
    abstracts = f.read().split("PMID:")

all_sentences = []
for entry in abstracts:
    if entry.strip():
        text = entry.strip().split("\n", 1)[-1]
        sentences = split_sentences(text)
        all_sentences.extend(sentences)

print("Total sentences:", len(all_sentences))


Total sentences: 2534


In [18]:
from scispacy.abbreviation import AbbreviationDetector

nlp = spacy.load("en_ner_bc5cdr_md")
nlp.add_pipe("abbreviation_detector")

def get_disease_entities(text):
    doc = nlp(text)
    return [ent.text for ent in doc.ents if ent.label_ == "DISEASE"]



  deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(  # type: ignore[union-attr]


In [22]:
def normalize_disease_name(name):
    name = name.lower()
    if "alzheimer" in name:
        return "Alzheimer's disease"
    return name


In [None]:

def extract_alz_related_pairs(sentence):
    doc = nlp(sentence)
    diseases = [normalize_disease_name(ent.text) for ent in doc.ents if ent.label_ == "DISEASE"]
    
    # Get unique lowercase disease list (for robust matching)
    unique_diseases = list(set(diseases))
    alz_diseases = [d for d in unique_diseases if "alzheimer" in d.lower()]
    
    pairs = []
    for ad in alz_diseases:
        for d in unique_diseases:
            if d.lower() != ad.lower():
                pairs.append((ad, d))
    return pairs

In [10]:

import csv
output_rows = []

for sent in all_sentences:
    pairs = extract_alz_related_pairs(sent)
    for p1, p2 in pairs:
        output_rows.append({
            "sentence": sent,
            "entity_1": p1,
            "entity_2": p2,
            "relation_label": ""  # ← leave empty for annotation
        })


  global_matches = self.global_matcher(doc)


In [11]:
unique_rows = set()
deduplicated = []

for row in output_rows:
    key = (row["sentence"], row["entity_1"].lower(), row["entity_2"].lower())
    if key not in unique_rows:
        unique_rows.add(key)
        deduplicated.append(row)

print(f"After deduplication: {len(deduplicated)} pairs")


After deduplication: 202 pairs


In [13]:
with open("alz_disease_pairs_for_annotation.csv", "w", newline='', encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=["sentence", "entity_1", "entity_2", "relation_label"])
    writer.writeheader()
    writer.writerows(deduplicated)

print(f"Saved {len(deduplicated)} pairs to 'alz_disease_pairs_for_annotation.csv'")

Saved 202 pairs to 'alz_disease_pairs_for_annotation.csv'


Add original entities back for input formatting.

In [27]:
import pandas as pd
import spacy

# Load SciSpacy model
nlp = spacy.load("en_ner_bc5cdr_md")

# Your normalization function
def normalize_disease_name(name):
    name = name.lower()
    if "alzheimer" in name:
        return "Alzheimer's disease"
    return name

# Load your annotated dataset
df = pd.read_csv("alz_disease_pairs_for_annotation.csv", encoding="ISO-8859-1")

# Initialize empty lists to store original values
entity_1_original_list = []
entity_2_original_list = []

# Loop through each labeled row and match back original texts
for _, row in df.iterrows():
    sentence = row["sentence"]
    norm_e1 = row["entity_1"]
    norm_e2 = row["entity_2"]

    doc = nlp(sentence)
    found_e1 = None
    found_e2 = None

    # Try to match original mentions by normalization
    for ent in doc.ents:
        if ent.label_ == "DISEASE":
            norm_ent = normalize_disease_name(ent.text)
            if norm_ent == norm_e1 and not found_e1:
                found_e1 = ent.text
            elif norm_ent == norm_e2 and not found_e2:
                found_e2 = ent.text

    entity_1_original_list.append(found_e1 if found_e1 else "")
    entity_2_original_list.append(found_e2 if found_e2 else "")

# Add new columns to your original DataFrame
df["entity_1_original"] = entity_1_original_list
df["entity_2_original"] = entity_2_original_list

# Save new file
df.to_csv("alz_disease_pairs_with_original.csv", index=False, encoding="utf-8")

print("✅ Done! Original entity names added and file saved as 'alz_disease_pairs_with_original.csv'.")


✅ Done! Original entity names added and file saved as 'alz_disease_pairs_with_original.csv'.


### Step 2: Format for Training
Convert each row into a classifier input by highlighting the entities.

Example input format:
"[E1] Diabetes [/E1] is a known risk factor for [E2] Alzheimer's [/E2]."

In [43]:
import pandas as pd


df = pd.read_csv("alz_disease_pairs_with_original.csv", encoding='ISO-8859-1')


def mark_entities(row):
    text = row["sentence"]
    e1 = row["entity_1_original"]
    e2 = row["entity_2_original"]
    e1_norm = row["entity_1"]
    e2_norm = row["entity_2"]
    
    if pd.isna(e1):
        e1 = str(e1_norm)
    else:
        e1 = str(e1)

    if pd.isna(e2):
        e2= str(e2_norm)
    else:
        e2 = str(e2)


    # Replace first match of each entity with markers
    text = text.replace(e1, f"[E1] {e1} [/E1]", 1)
    text = text.replace(e2, f"[E2] {e2} [/E2]", 1)
    return text

df["input_text"] = df.apply(mark_entities, axis=1)

In [44]:
pd.set_option('display.max_colwidth', None)
print(df["input_text"].head())  # Shows first 5 rows

0          Exploring emotion recognition in patients with mild cognitive impairment and \n[E1] Alzheimer's [E2] dementia [/E2] [/E1] undergoing a rehabilitation program emotion recognition in \npatients with dementia.
1          Exploring emotion recognition in patients with mild [E2] cognitive impairment [/E2] and \n[E1] Alzheimer's dementia [/E1] undergoing a rehabilitation program emotion recognition in \npatients with dementia.
2    AIM: This study aimed to explore differences in the emotions of patients with \nmild cognitive impairment ([E2] MCI [/E2]) and [E1] Alzheimer's dementia [/E1] (AD) in group \nrehabilitation using facial analysis.
3    AIM: This study aimed to explore differences in the emotions of patients with \nmild [E2] cognitive impairment [/E2] (MCI) and [E1] Alzheimer's dementia [/E1] (AD) in group \nrehabilitation using facial analysis.
4    AIM: This study aimed to explore differences in the emotions of patients with \nmild cognitive impairment (MCI) and [E1] Al

In [45]:
print(len(df["input_text"]))  # Total rows

202


In [46]:
missing_e1 = df[~df["input_text"].str.contains(r"\[E1\]", na=False)]
missing_e2 = df[~df["input_text"].str.contains(r"\[E2\]", na=False)]

print(f"❗ Rows missing [E1]: {len(missing_e1)}")
print(f"❗ Rows missing [E2]: {len(missing_e2)}")


❗ Rows missing [E1]: 2
❗ Rows missing [E2]: 1


Manually format 3 missing tags sentences:

In [50]:
for idx, row in missing_e1.iterrows():
    print(f"\n{idx}: {row['sentence']}")
    print(f"Entity 1: {row['entity_1_original']}")
    print(f"Entity 2: {row['entity_2_original']}")
    print(f"Entity 1 Normalized: {row['entity_1']}")
    print(f"Entity 2 Normalized: {row['entity_2']}")



97: The authors 
declare the following financial interests/personal relationships which may be 
considered as potential competing interests: PCD has received grant support by 
the Medical Research Council, the Lewy Body Society, AlzheimerÃ¢ÂÂs Society and 
Alzheimer's Research UK.
Entity 1: nan
Entity 2: PCD
Entity 1 Normalized: Alzheimer's disease
Entity 2 Normalized: pcd

196: Ethics approval was obtained from the 
institutional review boards of each institution involved: Oregon Health and 
Science University; University of Southern California; University of 
CaliforniaÃ¢ÂÂSan Diego; University of Michigan; Mayo Clinic, Rochester; Baylor 
College of Medicine; Columbia University Medical Center; Washington University, 
St. Louis; University of Alabama at Birmingham; Mount Sinai School of Medicine; 
Rush University Medical Center; Wien Center; Johns Hopkins University; New York 
University; Duke University Medical Center; University of Pennsylvania; 
University of Kentucky; Univer

In [67]:
df.at[97, "input_text"] = "The authors declare the following financial interests/personal relationships which may be considered as potential competing interests: [E2] PCD [/E2] has received grant support by the Medical Research Council, the Lewy Body Society, [E1] Alzheimer [/E1] Ã¢ÂÂs Society and Alzheimer's Research UK."
print(df.at[97, "input_text"])

The authors declare the following financial interests/personal relationships which may be considered as potential competing interests: [E2] PCD [/E2] has received grant support by the Medical Research Council, the Lewy Body Society, [E1] Alzheimer [/E1] Ã¢ÂÂs Society and Alzheimer's Research UK.


In [51]:
for idx, row in missing_e2.iterrows():
    print(f"\n{idx}: {row['sentence']}")
    print(f"Entity 1: {row['entity_1_original']}")
    print(f"Entity 2: {row['entity_2_original']}")
    print(f"Entity 1 Normalized: {row['entity_1']}")
    print(f"Entity 2 Normalized: {row['entity_2']}")



169: Alzheimer's disease (AD) is a progressive neurodegenerative disorder 
characterized by cognitive decline and memory loss, with amyloid-beta (AÃÂ²) 
plaques and acetylcholine deficits being central pathological features.
Entity 1: Alzheimer's disease
Entity 2: nan
Entity 1 Normalized: Alzheimer's disease
Entity 2 Normalized: amyloid-beta (aÃÂ²) 
plaques and acetylcholine


In [None]:
df.at[169, "input_text"] = row["sentence"].replace("Alzheimer's disease", "[E1] Alzheimer's disease [/E1]", 1).replace("amyloid-bet", "[E2] amyloid-bet [/E2]", 1)

In [53]:
print(df.at[169, "input_text"])

[E1] Alzheimer's disease [/E1] (AD) is a progressive neurodegenerative disorder 
characterized by cognitive decline and memory loss, with [E2] amyloid-bet [/E2]a (AÃÂ²) 
plaques and acetylcholine deficits being central pathological features.


In [64]:
# Drop the row with index 196 because it cannot be fixed.
df = df.drop(index=196).reset_index(drop=True)

In [68]:
print(len(df["input_text"]))  # Total rows after dropping

201
