In [17]:
import xml.etree.ElementTree as ET
import pandas as pd
import json
import re
from sentence_transformers import SentenceTransformer, util
import torch
from collections import defaultdict, Counter

In [None]:
# Load MeSH descriptors XML
tree = ET.parse('../data/MeSh_data/desc2025.xml')  # Change path if needed
root = tree.getroot()

# Keywords for each category (not final)
# category_keywords = {
#     "Viral Entry and Neuroinvasion": ["neuroinvasion", "receptor", "ACE2", "blood-brain barrier", "virus entry"],
#     "Immune and Inflammatory Response": ["immune", "cytokine", "inflammation", "interferon"],
#     "Neurodegenerative Mechanisms": ["neurodegeneration", "protein aggregation", "cell death", "apoptosis"],
#     "Vascular Effects": ["stroke", "thrombosis", "vascular", "blood clot", "ischemia"],
#     "Psychological and Neurological Symptoms": ["cognitive", "memory", "fatigue", "depression", "anxiety"],
#     "Systemic Cross-Organ Effects": ["lungs", "liver", "multi-organ", "kidney", "systemic"],
# }

category_keywords = {
    "Viral Entry and Neuroinvasion": [
        "neuroinvasion", "receptor", "ACE2", "blood-brain barrier", "BBB", "virus entry", "olfactory", 
        "retrograde transport", "endocytosis", "direct invasion", "cranial nerve", "neural pathway", 
        "transcribrial", "neurotropic", "trans-synaptic", "neuronal route", "olfactory nerve", 
        "hematogenous", "choroid plexus", "neuronal transmission", "entry into CNS"
    ],
    "Immune and Inflammatory Response": [
        "immune", "cytokine", "inflammation", "interferon", "TNF", "IL-6", "IL6", "cytokine storm", 
        "immune response", "inflammatory mediators", "macrophage", "microglia", "neutrophil", 
        "lymphocyte", "innate immunity", "immune dysregulation", "chemokine", "T cell", "NLRP3", 
        "antibody", "immune activation", "immune imbalance", "immune-mediated", "complement"
    ],
    "Neurodegenerative Mechanisms": [
        "neurodegeneration", "protein aggregation", "apoptosis", "cell death", "synaptic loss", 
        "neurotoxicity", "oxidative stress", "mitochondrial dysfunction", "tau", "amyloid", 
        "α-synuclein", "prion", "demyelination", "neuron loss", "misfolded proteins", 
        "chronic neuronal damage", "neurodegenerative", "neuroinflammation"
    ],
    "Vascular Effects": [
        "stroke", "thrombosis", "vascular", "ischemia", "coagulation", "blood clot", "microthrombi", 
        "endothelial", "vasculitis", "hemorrhage", "blood vessel", "vascular damage", "capillary", 
        "clotting", "hypoperfusion", "angiopathy", "vasculopathy"
    ],
    "Psychological and Neurological Symptoms": [
        "cognitive", "memory", "fatigue", "depression", "anxiety", "brain fog", "psychiatric", 
        "mood", "confusion", "neuropsychiatric", "emotional", "behavioral", "neurocognitive", 
        "insomnia", "psychosocial", "attention", "motivation", "executive function", "suicidality"
    ],
    "Systemic Cross-Organ Effects": [
        "lungs", "liver", "kidney", "systemic", "multi-organ", "gastrointestinal", "heart", 
        "cardiovascular", "endocrine", "renal", "pancreas", "organ failure", "liver damage", 
        "pulmonary", "myocardial", "respiratory", "hypoxia", "oxygen deprivation", "fibrosis"
    ]
}


category_terms = defaultdict(set)

# Loop through all DescriptorRecords
for descriptor in root.findall('DescriptorRecord'):
    # Get descriptor name
    descriptor_name_el = descriptor.find('DescriptorName/String')
    if descriptor_name_el is None:
        continue
    
    descriptor_name = descriptor_name_el.text

    # Get all concept/term synonyms
    term_elements = descriptor.findall('ConceptList/Concept/TermList/Term/String')
    concept_terms = [term_el.text for term_el in term_elements if term_el is not None]

    # Combine all terms into a single string for matching
    all_text = f"{descriptor_name} " + ' '.join(concept_terms)

    # Match to categories
    for category, keywords in category_keywords.items():
        if any(keyword.lower() in all_text.lower() for keyword in keywords):
            # Add both descriptor name and synonyms
            category_terms[category].update([descriptor_name] + concept_terms)

# Convert sets to lists
for category in category_terms:
    category_terms[category] = list(category_terms[category])

# Show extracted terms for one category
print("=== Immune and Inflammatory Response Terms ===")
print(category_terms["Immune and Inflammatory Response"])

# Optionally save to JSON
import json
with open("../data/MeSh_data/mesh_category_terms.json", "w") as f:
    json.dump(category_terms, f, indent=2)

print("Extraction complete!")


=== Immune and Inflammatory Response Terms ===
['Antibody Classes', 'CAR T-Cell Therapies', 'Islet Cell Adenoma', 'Protuberans, Giant Dermatofibrosarcoma', 'Receptor, TNFRSF6', 'Complement C3d Fragment', 'CD278 Antigen', 'Autoimmune Urticarias', 'T-Cell Costimulatory Molecule B7x', 'Trained Immunity', 'B-Lymphocyte Heavy Chain Gene Rearrangement', 'Anti Citrullinated Protein Antibodies', 'Host Cell Restriction Factors', 'Tumor Necrosis Factor alpha', 'T Lymphocyte beta-Chain Gene Rearrangement', 'B-1 Cell', 'Chemokine Co receptor 5 Antagonists', 'Receptor, CX3C', 'Interferon-Stimulated Gene Factor 3, gamma Subunit', 'IRF 7C Transcription Factor', 'Antibody, Citrullinated Protein', 'Immune Globulin, Intravenous', 'Interferon alphabeta Receptors', 'CD274 Antigens', 'PD L1 Inhibitor', 'Immune Response Antigens', 'Localized Giant Cell Tumor of the Tendon Sheath', 'Neural Crest Cells', 'Serpin Family G Member 1', 'IL-10-Related T-Cell-Derived-Inducible Factor', 'Carcinoid Tumors', 'Terminal

In [None]:
# Step 1: Load Data
df = pd.read_csv('../data/Triples_Final_All_Relevant.csv')

with open('../data/MeSh_data/mesh_category_terms.json', 'r') as f:
    category_terms = json.load(f)

# Step 2: Normalize Process Descriptions and Keywords
def normalize_text(text):
    text = re.sub(r"[_\-]", " ", text)
    text = text.lower()
    text = re.sub(r"\s+", " ", text).strip()
    return text

df['Normalized_Process'] = df['Pathophysiological Process'].apply(normalize_text)

for category in category_terms:
    category_terms[category] = [normalize_text(term) for term in category_terms[category]]

# Step 3: Initialize BERT Model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Step 4: Embed Category Keywords (Grouped by Category)
print("Embedding keywords by category...")

category_keyword_embeddings = {}

for category, keywords in category_terms.items():
    if keywords:
        embeddings = model.encode(keywords, convert_to_tensor=True)
        category_keyword_embeddings[category] = embeddings
    else:
        category_keyword_embeddings[category] = None

# Step 5: Function to Classify a Single Process
def bert_keyword_classify(process_text, category_keyword_embeddings, threshold=0.5, aggregation="max"):
    # Embed the process description
    process_embedding = model.encode(process_text, convert_to_tensor=True)

    best_category = None
    best_score = 0.0
    category_scores = {}

    for category, keyword_embeddings in category_keyword_embeddings.items():
        if keyword_embeddings is None or len(keyword_embeddings) == 0:
            continue

        # Compute cosine similarity between process and all keywords for this category
        cosine_scores = util.pytorch_cos_sim(process_embedding, keyword_embeddings)[0]

        # Aggregate scores (options: max, mean, etc.)
        if aggregation == "max":
            score = torch.max(cosine_scores).item()
        elif aggregation == "mean":
            score = torch.mean(cosine_scores).item()
        else:
            raise ValueError("Unsupported aggregation type. Use 'max' or 'mean'.")

        category_scores[category] = score

        # Track best score and category
        if score > best_score:
            best_score = score
            best_category = category

    if best_score >= threshold:
        return best_category
    else:
        return "Uncategorized"

# Step 6: Apply Classifier to the Dataset
print("Classifying processes using BERT + Keywords...")

df['Category_BERT_Keywords'] = df['Normalized_Process'].apply(
    lambda x: bert_keyword_classify(x, category_keyword_embeddings, threshold=0.5, aggregation="max")
)

# Step 7: Count Items per Category
category_counts = Counter(df['Category_BERT_Keywords'])

print("=== Category Counts (BERT + Keywords) ===")
for category, count in category_counts.items():
    print(f"{category}: {count}")

# Step 8: Export Results
df.to_csv('../data/Triples_Final_All_Relevant_Categorized_BERT_Keywords_mesh.csv', index=False)
df.to_excel('../data/Triples_Final_All_Relevant_Categorized_BERT_Keywords_mesh.xlsx', index=False)

counts_df = pd.DataFrame(category_counts.items(), columns=['Category', 'Count'])
# counts_df.to_excel('Category_Counts_BERT_Keywords.xlsx', index=False)

print("BERT + Keywords classification complete! ✅")


Embedding keywords by category...
Classifying processes using BERT + Keywords...
=== Category Counts (BERT + Keywords) ===
Viral Entry and Neuroinvasion: 619
Immune and Inflammatory Response: 944
Psychological and Neurological Symptoms: 126
Uncategorized: 120
Vascular Effects: 392
Systemic Cross-Organ Effects: 395
Neurodegenerative Mechanisms: 265
BERT + Keywords classification complete! ✅


In [20]:
df

Unnamed: 0,URL,Pathophysiological Process,Subject,Predicate,Object,Normalized_Process,Category_BERT_Keywords
0,https://media.springernature.com/lw685/springe...,Viral_Entry_Through_Olfactory_Neurons,SARS-CoV-2,enters,olfactory_neurons,viral entry through olfactory neurons,Viral Entry and Neuroinvasion
1,https://media.springernature.com/lw685/springe...,Viral_Entry_Through_Olfactory_Neurons,Olfactory_neurons,connect_to,olfactory_bulb,viral entry through olfactory neurons,Viral Entry and Neuroinvasion
2,https://media.springernature.com/lw685/springe...,Viral_Entry_Through_Olfactory_Neurons,SARS-CoV-2,reaches,brain,viral entry through olfactory neurons,Viral Entry and Neuroinvasion
3,https://media.springernature.com/lw685/springe...,Blood-Brain_Barrier_Disruption,SARS-CoV-2,binds_to,ACE2,blood brain barrier disruption,Viral Entry and Neuroinvasion
4,https://media.springernature.com/lw685/springe...,Blood-Brain_Barrier_Disruption,ACE2,located_on,endothelial_cells,blood brain barrier disruption,Viral Entry and Neuroinvasion
...,...,...,...,...,...,...,...
2856,https://www.aging-us.com/article/202136/figure...,Arrhythmia,SARS-CoV-2_infection,causes,arrhythmia,arrhythmia,Systemic Cross-Organ Effects
2857,https://www.aging-us.com/article/202136/figure...,Heart_Failure,SARS-CoV-2_infection,causes,heart_failure,heart failure,Systemic Cross-Organ Effects
2858,https://www.aging-us.com/article/202136/figure...,High_Expression_of_PKC,SARS-CoV-2_infection,increases,high_expression_of_PKC,high expression of pkc,Immune and Inflammatory Response
2859,https://www.aging-us.com/article/202136/figure...,Inflammatory_Cytokine_Storm,SARS-CoV-2_infection,triggers,inflammatory_cytokine_storm,inflammatory cytokine storm,Immune and Inflammatory Response
