In [1]:
# === MeSH-Based Category Keyword Extraction ===
"""
Notebook: MeSH_Keyword_Extraction.ipynb
Authors: Elizaveta Popova, Negin Babaiha
Institution: University of Bonn, Fraunhofer SCAI
Date: 09/04/2025

Description:
    This notebook parses MeSH descriptor data (desc2025.xml) to extract relevant biomedical terms
    grouped by conceptual categories related to COVID-19 and neurodegeneration.

    Categories include:
        1. Viral Entry and Neuroinvasion
        2. Immune and Inflammatory Response
        3. Neurodegenerative Mechanisms
        4. Vascular Effects
        5. Psychological and Neurological Symptoms
        6. Systemic Cross-Organ Effects

    Output:
        - mesh_category_terms.json (dictionary of category -> keyword list)
"""

import xml.etree.ElementTree as ET
import json
import re
from collections import defaultdict

# === Load MeSH descriptor file ===
# ET.parse reads the complete XML document into memory; change the path below
# if the descriptor file is stored somewhere else.
mesh_descriptor_xml = '../data/MeSh_data/desc2025.xml'
tree = ET.parse(mesh_descriptor_xml)
root = tree.getroot()

# === Define core category-matching keywords (seeds) ===
# Maps each conceptual category name to the list of seed phrases used to select
# MeSH descriptors for it. Order within a list carries no meaning.
category_keywords = {
    "Viral Entry and Neuroinvasion": [
        "neuroinvasion", "receptor", "ACE2", "blood-brain barrier", "BBB",
        "virus entry", "olfactory", "retrograde transport", "endocytosis",
        "direct invasion", "cranial nerve", "neural pathway", "transcribrial",
        "neurotropic", "trans-synaptic", "neuronal route", "olfactory nerve",
        "hematogenous", "choroid plexus", "neuronal transmission",
        "entry into CNS",
    ],
    "Immune and Inflammatory Response": [
        "immune", "cytokine", "inflammation", "interferon", "TNF", "IL-6",
        "IL6", "cytokine storm", "immune response", "inflammatory mediators",
        "macrophage", "microglia", "neutrophil", "lymphocyte",
        "innate immunity", "immune dysregulation", "chemokine", "T cell",
        "NLRP3", "antibody", "immune activation", "immune imbalance",
        "immune-mediated", "complement",
    ],
    "Neurodegenerative Mechanisms": [
        "neurodegeneration", "protein aggregation", "apoptosis", "cell death",
        "synaptic loss", "neurotoxicity", "oxidative stress",
        "mitochondrial dysfunction", "tau", "amyloid", "α-synuclein", "prion",
        "demyelination", "neuron loss", "misfolded proteins",
        "chronic neuronal damage", "neurodegenerative", "neuroinflammation",
    ],
    "Vascular Effects": [
        "stroke", "thrombosis", "vascular", "ischemia", "coagulation",
        "blood clot", "microthrombi", "endothelial", "vasculitis",
        "hemorrhage", "blood vessel", "vascular damage", "capillary",
        "clotting", "hypoperfusion", "angiopathy", "vasculopathy",
    ],
    "Psychological and Neurological Symptoms": [
        "cognitive", "memory", "fatigue", "depression", "anxiety",
        "brain fog", "psychiatric", "mood", "confusion", "neuropsychiatric",
        "emotional", "behavioral", "neurocognitive", "insomnia",
        "psychosocial", "attention", "motivation", "executive function",
        "suicidality",
    ],
    "Systemic Cross-Organ Effects": [
        "lungs", "liver", "kidney", "systemic", "multi-organ",
        "gastrointestinal", "heart", "cardiovascular", "endocrine", "renal",
        "pancreas", "organ failure", "liver damage", "pulmonary",
        "myocardial", "respiratory", "hypoxia", "oxygen deprivation",
        "fibrosis",
    ],
}

# === Parse MeSH XML and extract matching terms per category ===
# A descriptor is assigned to a category when any seed keyword occurs as a
# case-insensitive substring of its name or entry terms; the descriptor name
# and all of its entry terms are then added to that category's set.
# NOTE(review): plain substring matching also fires inside longer words (the
# seed "T cell" occurs in "293T Cell"), so the result contains false positives.
category_terms = defaultdict(set)

# Lower-case the seeds once instead of once per descriptor and keyword.
lowered_keywords = {
    category: [keyword.lower() for keyword in keywords]
    for category, keywords in category_keywords.items()
}

for descriptor in root.findall('DescriptorRecord'):
    descriptor_name_el = descriptor.find('DescriptorName/String')
    # Skip records without a usable name: a None name would end up in the term
    # sets and make the sort below fail.
    if descriptor_name_el is None or not descriptor_name_el.text:
        continue

    descriptor_name = descriptor_name_el.text
    term_elements = descriptor.findall('ConceptList/Concept/TermList/Term/String')
    # findall() never yields None elements, so the text is what must be checked;
    # an element without text has .text == None and would break the join.
    synonyms = [term_el.text for term_el in term_elements if term_el.text]
    all_text = f"{descriptor_name} " + ' '.join(synonyms)
    # Build the lower-cased search text once per descriptor.
    searchable_text = all_text.lower()

    for category, keywords in lowered_keywords.items():
        if any(keyword in searchable_text for keyword in keywords):
            category_terms[category].update([descriptor_name] + synonyms)

# === Convert sets to lists ===
# Iterate over the seed dict so that every category ends up with a sorted list,
# including an empty list for categories that matched nothing.
for category in category_keywords:
    category_terms[category] = sorted(category_terms[category])

# === Preview sample output ===
# Print a header line followed by at most the first 25 terms of one category.
category_name = "Immune and Inflammatory Response"
print(f"=== Preview: {category_name} ===")
preview_terms = category_terms[category_name][:25]
for term in preview_terms:
    print("-", term)

# === Export to JSON ===
# Serialize the category -> terms mapping with two-space indentation and write
# it to disk, then report where it went.
output_path = "../data/MeSh_data/mesh_category_terms.json"
with open(output_path, "w") as out_file:
    out_file.write(json.dumps(category_terms, indent=2))

print(f"\n✅ Extraction complete! Terms saved to: {output_path}")


=== Preview: Immune and Inflammatory Response ===
- 1, ADP-ribosyl Cyclase
- 1, IFN-gamma Receptor
- 120a Antigen, CD
- 120b Antigen, CD
- 12E7 Antigen
- 12E7 Protein
- 19S Gamma Globulin
- 2, C-EBP-Related Protein
- 23-C-EBP Protein
- 28 kDa Protein, Adipocyte
- 293 Cell, HEK
- 293 Cells, HEK
- 293T Cell
- 293T Cells
- 4 1BB Receptor
- 4 1BB Receptors
- 4-1BB Receptor
- 4-1BB Receptors
- 40-C-EBP Protein
- 4F2 Antigen
- 4F2 Antigen, Human
- 4F2-antigen
- 60B8 A Antigen
- 60B8 B Antigen
- 60B8-A Antigen

✅ Extraction complete! Terms saved to: ../data/MeSh_data/mesh_category_terms.json


Note: the cells below are currently used only for testing the API.

In [None]:
import xml.etree.ElementTree as ET
import pandas as pd
import json
import re
from sentence_transformers import SentenceTransformer, util
import torch
from collections import defaultdict, Counter

In [None]:
# Step 1: Load Data
# Read the triples CSV into a DataFrame and the category terms JSON into a dict.
triples_csv = '../data/Triples_Final_All_Relevant.csv'
category_terms_json = '../data/MeSh_data/mesh_category_terms.json'

df = pd.read_csv(triples_csv)

with open(category_terms_json, 'r') as terms_file:
    category_terms = json.loads(terms_file.read())

# Step 2: Normalize Process Descriptions and Keywords
def normalize_text(text):
    """Return ``text`` lower-cased with separators and whitespace normalized.

    Underscores and hyphens are replaced by spaces, the result is lower-cased,
    runs of whitespace collapse to a single space and both ends are stripped.

    Args:
        text: Value to normalize. ``None`` and float NaN (the value pandas uses
            for a missing CSV cell) are treated as empty text; any other
            non-string value is converted with ``str()`` first.

    Returns:
        The normalized string; ``""`` for missing input.
    """
    # Missing cells arrive as None/NaN, on which re.sub would raise TypeError.
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = re.sub(r"[_\-]", " ", str(text))
    text = text.lower()
    return re.sub(r"\s+", " ", text).strip()

# Normalize the process column into a new column, then normalize every
# category's term list in place using the same function.
process_column = df['Pathophysiological Process']
df['Normalized_Process'] = process_column.apply(normalize_text)

for category, terms in category_terms.items():
    category_terms[category] = list(map(normalize_text, terms))

# Step 3: Initialize BERT Model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Step 4: Embed Category Keywords (Grouped by Category)
print("Embedding keywords by category...")

# One encode() call per category; a category whose keyword list is empty is
# stored as None instead of being encoded.
category_keyword_embeddings = {
    category: (model.encode(keywords, convert_to_tensor=True) if keywords else None)
    for category, keywords in category_terms.items()
}

# Step 5: Function to Classify a Single Process
def bert_keyword_classify(process_text, category_keyword_embeddings, threshold=0.5, aggregation="max"):
    """Assign ``process_text`` to the category whose keywords it resembles most.

    The text is embedded with the module-level ``model``. For every category the
    cosine similarities between the text and that category's keyword embeddings
    are reduced to a single score, and the highest-scoring category is returned
    if its score reaches ``threshold``.

    Args:
        process_text: Text to classify.
        category_keyword_embeddings: Mapping of category name to the embeddings
            of its keywords; entries that are ``None`` or empty are skipped.
        threshold: Minimum aggregated similarity needed to accept the best
            category.
        aggregation: ``"max"`` scores a category by its most similar keyword,
            ``"mean"`` by the average similarity over all of its keywords.

    Returns:
        The best category name, or ``"Uncategorized"`` when no category could be
        scored or the best score is below ``threshold``.

    Raises:
        ValueError: If ``aggregation`` is neither ``"max"`` nor ``"mean"``.
    """
    # Validate once, up front, so that an unsupported value is reported even
    # when every category is skipped.
    if aggregation == "max":
        aggregate = torch.max
    elif aggregation == "mean":
        aggregate = torch.mean
    else:
        raise ValueError("Unsupported aggregation type. Use 'max' or 'mean'.")

    # Embed the process description
    process_embedding = model.encode(process_text, convert_to_tensor=True)

    best_category = None
    # Start below every possible score so that a best score of zero or less
    # still selects a real category instead of leaving best_category as None.
    best_score = float("-inf")

    for category, keyword_embeddings in category_keyword_embeddings.items():
        if keyword_embeddings is None or len(keyword_embeddings) == 0:
            continue

        # Compute cosine similarity between process and all keywords for this category
        cosine_scores = util.pytorch_cos_sim(process_embedding, keyword_embeddings)[0]
        score = aggregate(cosine_scores).item()

        # Strict comparison: on a tie the category seen first is kept.
        if score > best_score:
            best_score = score
            best_category = category

    if best_category is None or best_score < threshold:
        return "Uncategorized"
    return best_category

# Step 6: Apply Classifier to the Dataset
print("Classifying processes using BERT + Keywords...")

# Classify each normalized process in row order and store the labels as a new column.
df['Category_BERT_Keywords'] = [
    bert_keyword_classify(process, category_keyword_embeddings, threshold=0.5, aggregation="max")
    for process in df['Normalized_Process']
]

# Step 7: Count Items per Category
# Counter keeps categories in order of first appearance in the column.
category_counts = Counter(df['Category_BERT_Keywords'])

print("=== Category Counts (BERT + Keywords) ===")
for category in category_counts:
    count = category_counts[category]
    print(f"{category}: {count}")

# Step 8: Export Results
# Write the categorized frame as CSV and as Excel, both without the index.
for write_table, destination in (
    (df.to_csv, '../data/Triples_Final_All_Relevant_Categorized.csv'),
    (df.to_excel, '../data/Triples_Final_All_Relevant_Categorized.xlsx'),
):
    write_table(destination, index=False)

# Two-column summary of the per-category counts (not written to disk here).
counts_df = pd.DataFrame(list(category_counts.items()), columns=['Category', 'Count'])
# counts_df.to_excel('Category_Counts_BERT_Keywords.xlsx', index=False)

print("BERT + Keywords classification complete! ✅")


Embedding keywords by category...
Classifying processes using BERT + Keywords...
=== Category Counts (BERT + Keywords) ===
Viral Entry and Neuroinvasion: 619
Immune and Inflammatory Response: 944
Psychological and Neurological Symptoms: 126
Uncategorized: 120
Vascular Effects: 392
Systemic Cross-Organ Effects: 395
Neurodegenerative Mechanisms: 265
BERT + Keywords classification complete! ✅
