In [1]:
# ===================================================================
# 1. USER CONFIGURATION
# ===================================================================
# Please provide the full path to your input Excel file.
input_file_path = '/input/file/path.xlsx'

# Please provide the path to the folder where you want to save the results.
# The folder is created further down in this cell if it does not exist yet.
output_directory = '/output/directory'

# Provide a base name for the output file. A timestamp (YYYYMMDD_HHMMSS) is
# inserted automatically between the base name and its extension.
# NOTE(review): 'Pathway_Mappin' looks like a typo for 'Pathway_Mapping' -
# confirm before renaming, since it changes the name of the generated file.
output_file_name = 'Pathway_Mappin.xlsx'

# (Optional) Specify the exact name of the column containing metabolite names.
# If you set this to None, the script will try to auto-detect a column
# containing 'name', 'metabolite', or 'compound' in its title.
# Any falsy value (None or an empty string) is reported as "auto-detected"
# in the configuration summary printed by this cell.
# Example: metabolite_column_name = 'Metabolite Name'
metabolite_column_name = None


# ===================================================================
# --- No need to edit below this line for configuration ---
# ===================================================================
import os
from datetime import datetime

# Create the output directory if it doesn't exist
# (os.makedirs also creates any missing parent folders).
if not os.path.exists(output_directory):
    os.makedirs(output_directory)
    print(f"Created output directory: {output_directory}")

# Generate the full timestamped output path
# The timestamp uses local time at the moment this cell runs, so re-running
# the cell produces a new output file name.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
# Split "name.ext" so the timestamp can be inserted before the extension.
filename, extension = os.path.splitext(output_file_name)
timestamped_output_path = os.path.join(output_directory, f"{filename}_{timestamp}{extension}")

# Print confirmation of the settings
print("--- Configuration Summary ---")
print(f"Input File: {input_file_path}")
print(f"Output will be saved to: {timestamped_output_path}")
# A falsy column name (None or '') means the column is auto-detected later.
if metabolite_column_name:
    print(f"Metabolite Column: '{metabolite_column_name}'")
else:
    print("Metabolite Column: Will be auto-detected.")
print("--------------------------")

--- Configuration Summary ---
Input File: /users/aranpurdy/desktop/cfps/pca/rf/MOD_RF_Imputed.xlsx
Output will be saved to: /users/aranpurdy/desktop/TEST/TEST_Pathways_20250806_110555.xlsx
Metabolite Column: Will be auto-detected.
--------------------------


In [2]:
import pandas as pd
import requests
import re
import threading
from collections import defaultdict, Counter
import time
from urllib.parse import quote
import json
import logging
import os
from datetime import datetime
import traceback

# Set up logging
# INFO level: logger.debug(...) calls are suppressed unless the level is lowered.
# NOTE(review): logging.basicConfig has no effect when the root logger already
# has handlers - confirm the format is applied in your Jupyter environment.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Configure session for reuse across requests
# A single module-level requests.Session is shared by every API client below;
# the custom User-Agent header is sent with each request made through it.
session = requests.Session()
session.headers.update({
    'User-Agent': 'Mozilla/5.0 (Metabolite Pathway Research Tool)'
})

# =====================================
# Core Name Normalization Functions
# =====================================

def normalize_metabolite_name(name):
    """
    Build an ordered, duplicate-free list of spelling variants of a metabolite name.

    The variants are meant to be tried one after another against external
    databases, so the order is deterministic: the punctuation-free, space
    separated form comes first, followed by the original spelling and then
    progressively looser forms.

    Args:
        name: Metabolite name as found in the input data.

    Returns:
        list[str]: Name variants in first-seen order without duplicates or
        blank entries. An empty list is returned for non-string input.
    """
    if not isinstance(name, str):
        return []

    stripped = name.strip()

    # Punctuation-free form that still carries hyphens. The D-/L- isomer
    # handling below needs the hyphen, so it must work on this form rather
    # than on the space-separated one.
    hyphenated = re.sub(r'[^\w\s:-]', '', stripped)

    # Primary normalized form: special characters removed, hyphens as spaces.
    normalized = hyphenated.replace('-', ' ')

    variations = [normalized]

    # Add the original name
    if name != normalized:
        variations.append(name)

    # Variation without trailing numbers/annotations ("Name 15:0" -> "Name")
    variations.append(re.sub(r'\s+\d+.*$', '', normalized))

    # Variation without parenthesised content. This has to start from the
    # stripped original because the punctuation filter above already removed
    # the parentheses themselves (but not their content).
    without_parens = re.sub(r'\([^)]*\)', '', stripped)
    without_parens = re.sub(r'[^\w\s:-]', '', without_parens).replace('-', ' ')
    variations.append(' '.join(without_parens.split()))

    # Hyphen <-> space variants
    variations.append(hyphenated)
    variations.append(normalized.replace(' ', '-'))

    # Lowercase version
    variations.append(normalized.lower())

    # Version without isomer designation. The word boundary keeps the pattern
    # from eating the tail of an ordinary word such as "Acetyl-".
    isomer_removed = re.sub(r'\b(?:DL|[DL])-', '', hyphenated)
    if isomer_removed != hyphenated:
        variations.append(isomer_removed)
        variations.append(isomer_removed.replace('-', ' '))

    # Version without acid/salt designation
    variations.append(
        re.sub(r'\s+(acid|sulfate|hydrochloride)$', '', normalized, flags=re.IGNORECASE)
    )

    # Version with the D/L prefix moved to the end
    # (many databases list L-Alanine as "Alanine, L-")
    dl_match = re.match(r'^(DL|[DL])-(.+)$', hyphenated)
    if dl_match:
        prefix, name_part = dl_match.groups()
        variations.append(f"{name_part}, {prefix}-")
        variations.append(f"{name_part} {prefix}-")

    # dict.fromkeys de-duplicates while keeping insertion order, so callers
    # that only try the first few variants behave the same on every run.
    return [variant for variant in dict.fromkeys(variations) if variant.strip()]

def normalize_lipid_name(name):
    """
    Generate search-name variations for a lipid given in shorthand notation.

    The name is tested against a series of notations, and the first one that
    matches decides the result:

    1. "1,2-Di<acyl>-sn-glycero-3-phospho<head group>" names
    2. "LysoPC (C:D)"
    3. "GlcCer C:D [suffix]"
    4. generic "<class> C:D" shorthand in several punctuation styles
    5. bare fatty-acid shorthand "C16:0"
    6. fatty-acid common names such as "Palmitic acid"
    7. "N-Acetyl<amino acid>" / "O-Acetyl<amino acid>"

    If nothing matches, the generic variations produced by
    ``normalize_metabolite_name`` are returned together with the name itself.

    Args:
        name: Lipid (or lipid-like) metabolite name.

    Returns:
        list[str]: Name variations in deterministic first-seen order without
        duplicates. An empty list is returned for non-string input.
    """
    if not isinstance(name, str):
        return []

    def _unique(items):
        # Order-preserving de-duplication keeps the result reproducible.
        return list(dict.fromkeys(items))

    # Dictionary of common lipid class abbreviations
    lipid_class_dict = {
        "SM": "Sphingomyelin",
        "PC": "Phosphatidylcholine",
        "PE": "Phosphatidylethanolamine",
        "PS": "Phosphatidylserine",
        "PI": "Phosphatidylinositol",
        "PG": "Phosphatidylglycerol",
        "PA": "Phosphatidic acid",
        "LPC": "Lysophosphatidylcholine",
        "LPE": "Lysophosphatidylethanolamine",
        "TAG": "Triacylglycerol",
        "DAG": "Diacylglycerol",
        "MAG": "Monoacylglycerol",
        "CER": "Ceramide",
        "Cer": "Ceramide",
        "GlcCer": "Glucosylceramide",
        "LacCer": "Lactosylceramide",
        "CE": "Cholesteryl ester",
        "FA": "Fatty acid",
        "DG": "Diacylglycerol",
        "TG": "Triacylglycerol",
        "PIP": "Phosphatidylinositol phosphate",
        "PIP2": "Phosphatidylinositol bisphosphate",
        "PIP3": "Phosphatidylinositol trisphosphate",
        "LPA": "Lysophosphatidic acid",
        "LPI": "Lysophosphatidylinositol",
        "LPS": "Lysophosphatidylserine",
        "MGDG": "Monogalactosyldiacylglycerol",
        "DGDG": "Digalactosyldiacylglycerol",
        "SQDG": "Sulfoquinovosyldiacylglycerol",
        "HexCer": "Hexosylceramide",
        "Hex2Cer": "Dihexosylceramide",
        "GM3": "Ganglioside GM3",
        "GM2": "Ganglioside GM2",
        "GM1": "Ganglioside GM1",
        "S1P": "Sphingosine-1-phosphate",
        "NAPE": "N-acylphosphatidylethanolamine",
        "CoA": "Coenzyme A",
        "AcylCoA": "Acyl-Coenzyme A",
        "plasmenyl-PC": "Plasmalogen phosphatidylcholine",
        "plasmenyl-PE": "Plasmalogen phosphatidylethanolamine",
        "lysoPE": "Lysophosphatidylethanolamine",
        "lysoPC": "Lysophosphatidylcholine",
        "LysoPC": "Lysophosphatidylcholine"
    }

    # Multiple patterns to match different lipid naming conventions.
    # They are tried in order and the first match wins.
    patterns = [
        r'^([\w-]+)\s+(\d+):(\d+)(?:\s+(.+))?$',  # Standard pattern: "PC 36:2" or "PC 36:2 HETE"
        r'^([\w-]+)\((\d+):(\d+)(?:\s+(.+))?\)$',  # Parentheses pattern: "PC(36:2)"
        r'^([\w-]+)-(\d+):(\d+)(?:\s+(.+))?$',     # Hyphenated pattern: "PC-36:2"
        r'^([\w-]+)_(\d+)_(\d+)(?:_(.+))?$',      # Underscore pattern: "PC_36_2"
        r'^([\w-]+)\s+(\d+):(\d+)[:-](\d+)(?:\s+(.+))?$',  # Pattern with extra number: "PC 36:2:1"
        r'^([\w-]+)\s+(\d+):(\d+)\s+([OPH]+)$',   # Hydroxylation pattern: "PC 36:2 OH"
        r'^([\w-]+)\s+(\d+):(\d+)\s+(\d+)$'        # Pattern with extra number: "GlcCer 36:00"
    ]

    name_variations = []

    # Glycerophospholipids written with explicit fatty acid positions
    positional_match = re.match(r'^1,2-Di(.+)-sn-glycero-3-phospho(.+)', name)
    if positional_match:
        # Extract fatty acid and head group
        fatty_acid, head_group = positional_match.groups()

        # Handle common head groups
        head_group_map = {
            "ethanolamine": "PE",
            "choline": "PC",
            "serine": "PS",
            "inositol": "PI"
        }

        # Unknown head groups fall through to the remaining notations.
        if head_group.lower() in head_group_map:
            lipid_class = head_group_map[head_group.lower()]

            name_variations.append(lipid_class)
            if lipid_class in lipid_class_dict:
                name_variations.append(lipid_class_dict[lipid_class])

            # Try to extract a carbon count from the fatty acid name
            fa_match = re.search(r'([a-z]+)([0-9]+)', fatty_acid.lower())
            if fa_match:
                carbon_count = fa_match.group(2)
                # Two identical acyl chains, reported as fully saturated
                total_carbons = int(carbon_count) * 2
                name_variations.append(f"{lipid_class} {total_carbons}:0")
                if lipid_class in lipid_class_dict:
                    name_variations.append(f"{lipid_class_dict[lipid_class]} {total_carbons}:0")

            # Generic class term plus the original name with and without hyphens
            name_variations.extend([
                "glycerophospholipid",
                f"1,2-Di{fatty_acid}-sn-glycero-3-phospho{head_group}",
                f"1,2 Di{fatty_acid} sn glycero 3 phospho{head_group}"
            ])

            return _unique(name_variations)

    # Handle LysoPC naming convention
    lyso_match = re.match(r'^LysoPC\s+\((\d+):(\d+)\)$', name)
    if lyso_match:
        carbon_count, double_bond_count = lyso_match.groups()
        return _unique([
            f"LPC {carbon_count}:{double_bond_count}",
            f"LysoPC {carbon_count}:{double_bond_count}",
            f"Lysophosphatidylcholine {carbon_count}:{double_bond_count}",
            "Lysophosphatidylcholine",
            "LysoPC",
            "LPC"
        ])

    # Handle GlcCer patterns specifically
    glccer_match = re.match(r'^GlcCer\s+(\d+):(\d+)(?:\s+(.+))?$', name)
    if glccer_match:
        # The pattern always has three groups; suffix is None when absent.
        carbon_count, double_bond_count, suffix = glccer_match.groups()
        name_variations = [
            "GlcCer",
            "Glucosylceramide",
            "Hexosylceramide",
            f"GlcCer {carbon_count}:{double_bond_count}",
            f"Glucosylceramide {carbon_count}:{double_bond_count}",
            f"Hexosylceramide {carbon_count}:{double_bond_count}"
        ]

        # Add hydroxylated versions if applicable
        if suffix and "OH" in suffix:
            name_variations.extend([
                "Hydroxy-GlcCer",
                "Hydroxy-Glucosylceramide",
                "Hydroxy-Hexosylceramide"
            ])

        return _unique(name_variations)

    # Try each standard lipid pattern
    for pattern in patterns:
        match = re.match(pattern, name)
        if not match:
            continue

        groups = match.groups()
        lipid_class = groups[0]
        carbon_count = groups[1]
        double_bond_count = groups[2]

        # Additional descriptor (like HETE) if present
        descriptor = groups[3] if len(groups) > 3 and groups[3] else ""

        name_variations.append(lipid_class)

        # Add full name if available
        if lipid_class in lipid_class_dict:
            full_name = lipid_class_dict[lipid_class]
            name_variations.append(full_name)

            # Different formatting variations of the full name
            name_variations.append(f"{full_name}({carbon_count}:{double_bond_count})")
            name_variations.append(f"{full_name} {carbon_count}:{double_bond_count}")
            name_variations.append(f"{full_name} {carbon_count}-{double_bond_count}")
            name_variations.append(f"{full_name}-{carbon_count}:{double_bond_count}")

            # Single-chain shorthand only for chain lengths up to 24 carbons
            if int(carbon_count) <= 24:
                name_variations.append(f"C{carbon_count}:{double_bond_count}")
                name_variations.append(f"C{carbon_count}:{double_bond_count} {full_name}")

        # Formatted versions of the abbreviation
        name_variations.append(f"{lipid_class}({carbon_count}:{double_bond_count})")
        name_variations.append(f"{lipid_class} {carbon_count}:{double_bond_count}")
        name_variations.append(f"{lipid_class}-{carbon_count}:{double_bond_count}")

        # Versions with the descriptor if present
        if descriptor:
            name_variations.append(f"{lipid_class} {descriptor}")
            if lipid_class in lipid_class_dict:
                name_variations.append(f"{lipid_class_dict[lipid_class]} {descriptor}")

        # Try common fat-specific naming convention
        if lipid_class in ["FA", "TAG", "DAG", "MAG"]:
            name_variations.append(f"C{carbon_count}:{double_bond_count}")
            name_variations.append(f"C{carbon_count}H{2*int(carbon_count)-2*int(double_bond_count)}")

        return _unique(name_variations)

    # Special case for fatty acids named like "C16:0"
    fa_match = re.match(r'^C(\d+):(\d+)$', name)
    if fa_match:
        carbon_count, double_bond_count = fa_match.groups()
        name_variations = [
            name,
            f"Fatty acid {carbon_count}:{double_bond_count}",
            f"FA {carbon_count}:{double_bond_count}",
            f"FA({carbon_count}:{double_bond_count})"
        ]

        # Trivial and systematic names for common fatty acids
        systematic_names = {
            ("16", "0"): ["Palmitic acid", "Hexadecanoic acid"],
            ("18", "0"): ["Stearic acid", "Octadecanoic acid"],
            ("18", "1"): ["Oleic acid", "9-Octadecenoic acid"],
            ("18", "2"): ["Linoleic acid", "9,12-Octadecadienoic acid"],
            ("20", "4"): ["Arachidonic acid", "5,8,11,14-Eicosatetraenoic acid"],
            ("15", "0"): ["Pentadecanoic acid", "Pentadecylic acid"],
            ("12", "0"): ["Lauric acid", "Dodecanoic acid"],
        }
        name_variations.extend(systematic_names.get((carbon_count, double_bond_count), []))

        return _unique(name_variations)

    # Fatty acid common names such as "Palmitic Acid" or "Pentadecanoic Acid (15:0)"
    fa_common_name_match = re.match(r'^([A-Za-z]+)(?:ic)?\s+[Aa]cid(?:\s+\((\d+):(\d+)\))?$', name)
    if fa_common_name_match:
        fa_name = fa_common_name_match.group(1).lower()

        # Map of common fatty acid name stems to their carbon:double_bond notation
        fa_common_names = {
            "palmit": "16:0", "stear": "18:0", "ole": "18:1", "linole": "18:2",
            "arachidon": "20:4", "pentadecan": "15:0", "petrosel": "18:1",
            "laur": "12:0", "myrist": "14:0", "linolen": "18:3",
            "eicosapentaen": "20:5", "docosahexaen": "22:6"
        }

        # Longest stem first: "ole" is a substring of "linoleic" and
        # "linolenic", so a shorter stem must never shadow a longer one.
        for key in sorted(fa_common_names, key=len, reverse=True):
            if key in fa_name:
                carbon_count, double_bond_count = fa_common_names[key].split(":")
                return _unique([
                    f"C{carbon_count}:{double_bond_count}",
                    f"FA {carbon_count}:{double_bond_count}",
                    f"Fatty acid {carbon_count}:{double_bond_count}",
                    name
                ])

    # Handle acetyl forms of amino acids
    acetyl_match = re.match(r'^([NO])-Acetyl([A-Za-z]+)$', name)
    if acetyl_match:
        prefix, amino_acid = acetyl_match.groups()
        return _unique([
            name, f"{prefix}-Acetyl-{amino_acid}", f"{prefix} Acetyl {amino_acid}",
            f"Acetyl{amino_acid}", f"Acetyl-{amino_acid}", amino_acid,
            f"N-acetylated {amino_acid}", f"O-acetylated {amino_acid}"
        ])

    # No lipid notation matched: fall back to the generic variations plus the name itself
    return _unique(normalize_metabolite_name(name) + [name])

def normalize_peptide(name):
    """
    Normalize peptide names and generate relevant search variations.

    Two inputs are recognised: names carrying the "(Tryptic Peptide)" marker
    and bare sequences consisting only of upper-case letters.

    Args:
        name: Peptide name or sequence.

    Returns:
        list[str]: Variations in deterministic first-seen order without
        duplicates. An empty list is returned for non-string input and for
        names that are not recognised as peptides.
    """
    if not isinstance(name, str):
        return []

    # Check if this is a tryptic peptide
    if "(Tryptic Peptide)" in name:
        # Strip the marker together with any whitespace in front of it, so
        # "AAK(Tryptic Peptide)" and "AAK (Tryptic Peptide)" both give "AAK".
        sequence = re.sub(r'\s*\(Tryptic Peptide\)', '', name).strip()

        variations = [
            sequence, f"{sequence} peptide", f"peptide {sequence}",
            "tryptic peptide", "peptide fragment"
        ]

        # If it's an "or" pattern (multiple possible peptides), add each one
        if " or " in sequence:
            variations.extend(part.strip() for part in sequence.split(" or "))

        # De-duplicate in first-seen order and drop blank entries
        return [variant for variant in dict.fromkeys(variations) if variant]

    # Check if this is just a peptide sequence (all uppercase letters)
    if re.match(r'^[A-Z]+$', name):
        return [
            name, f"{name} peptide", f"peptide {name}", "peptide fragment"
        ]

    return []

def determine_chemical_class(name):
    """
    Infer broad pathway categories from keywords contained in a metabolite name.

    Matching is a case-insensitive substring test of each marker against the
    name, so a single name can collect categories from several markers.

    Args:
        name: Metabolite name.

    Returns:
        list[str]: Pathway categories in the order their markers are listed
        below, without duplicates. An empty list is returned when nothing
        matches or when ``name`` is not a string.
    """
    # Same guard as the other normalisation helpers: missing values must not
    # raise here.
    if not isinstance(name, str):
        return []

    name_lower = name.lower()

    # Dictionary of chemical class identifiers and their associated pathway prefixes
    class_markers = {
        # Amino acids and derivatives
        "amino acid": ["Amino acid metabolism"], "alanine": ["Amino acid metabolism"],
        "glycine": ["Amino acid metabolism"], "serine": ["Amino acid metabolism"],
        "cysteine": ["Amino acid metabolism"], "threonine": ["Amino acid metabolism"],
        "methionine": ["Amino acid metabolism"], "aspart": ["Amino acid metabolism"],
        "glutam": ["Amino acid metabolism"], "lysine": ["Amino acid metabolism"],
        "arginine": ["Amino acid metabolism"], "histidine": ["Amino acid metabolism"],
        "tryptophan": ["Amino acid metabolism"], "phenylalan": ["Amino acid metabolism"],
        "tyrosine": ["Amino acid metabolism"], "valine": ["Amino acid metabolism"],
        "leucine": ["Amino acid metabolism"], "isoleucine": ["Amino acid metabolism"],
        # Nucleotide metabolism
        "adenosine": ["Nucleotide metabolism", "Purine metabolism"], "guanosine": ["Nucleotide metabolism", "Purine metabolism"],
        "cytidine": ["Nucleotide metabolism", "Pyrimidine metabolism"], "uridine": ["Nucleotide metabolism", "Pyrimidine metabolism"],
        # Carbohydrate metabolism
        "glucose": ["Carbohydrate metabolism"], "fructose": ["Carbohydrate metabolism"],
        # Lipid metabolism
        "lipid": ["Lipid metabolism"], "sterol": ["Steroid metabolism"], "cholesterol": ["Steroid metabolism"],
        "bile": ["Bile acid metabolism"], "sphingo": ["Sphingolipid metabolism"], "ceramide": ["Sphingolipid metabolism"],
        "phosphatidyl": ["Glycerophospholipid metabolism"], "fatty acid": ["Fatty acid metabolism"],
        # Xenobiotics & drugs
        "drug": ["Drug metabolism"], "xenobiotic": ["Xenobiotic metabolism"],
        # Vitamins and cofactors
        "vitamin": ["Vitamin metabolism"], "thiamine": ["Vitamin B1 metabolism"], "riboflavin": ["Vitamin B2 metabolism"],
        "niacin": ["Vitamin B3 metabolism"], "folate": ["Vitamin B9 metabolism"], "retinol": ["Vitamin A metabolism"],
        "coenzyme": ["Cofactor metabolism"],
        # Peptides
        "peptide": ["Peptide metabolism"], "tryptic": ["Peptide metabolism"]
    }

    pathways = []
    for marker, associated_pathways in class_markers.items():
        if marker in name_lower:
            pathways.extend(associated_pathways)

    # dict.fromkeys removes duplicates but keeps first-seen order, so the
    # result is identical on every run.
    return list(dict.fromkeys(pathways))

# =====================================
# API Request Handler Classes
# =====================================

class RateLimitedAPI:
    """HTTP GET helper with a minimum spacing between requests, a response cache and retries.

    Requests go through the module-level ``session``. Successful responses are
    cached by ``cache_key`` (the URL when no key is given); non-OK responses
    are returned but never cached.
    """

    def __init__(self, name, requests_per_second=1, timeout=15, max_retries=3):
        # Label used in log messages.
        self.name = name
        # Per-request timeout in seconds, passed to session.get.
        self.timeout = timeout
        # Number of retries allowed after the first attempt.
        self.max_retries = max_retries
        # Minimum number of seconds between two consecutive requests.
        self.request_interval = 1.0 / requests_per_second
        self.last_request_time = 0
        self.cache = {}
        self.lock = threading.Lock()

    def _wait_for_rate_limit(self):
        """Sleep until ``request_interval`` has passed since the previous request."""
        with self.lock:
            remaining = self.request_interval - (time.time() - self.last_request_time)
            if remaining > 0:
                # The lock stays held while sleeping, so concurrent callers queue up.
                time.sleep(remaining)
            self.last_request_time = time.time()

    def _retry_after(self, delay, url, headers, cache_key, retry_count):
        """Wait ``delay`` seconds, then issue the same request as the next attempt."""
        time.sleep(delay)
        return self.get(url, headers, cache_key, retry_count + 1)

    def get(self, url, headers=None, cache_key=None, retry_count=0):
        """Fetch ``url`` and return the response object.

        A 5xx status, a timeout or any other ``requests`` error is retried
        with exponential backoff (1, 2, 4, ... seconds) while ``retry_count``
        is below ``max_retries``. Once retries are exhausted, the last non-OK
        response is returned, or the last exception is re-raised.
        """
        cache_key = url if cache_key is None else cache_key
        if cache_key in self.cache:
            return self.cache[cache_key]

        self._wait_for_rate_limit()
        try:
            logger.debug(f"{self.name} API request: {url}")
            response = session.get(url, headers=headers, timeout=self.timeout)
            if response.ok:
                self.cache[cache_key] = response
                return response

            logger.warning(f"{self.name} API returned status code {response.status_code} for URL {url}")
            if response.status_code >= 500 and retry_count < self.max_retries:
                retry_delay = 2 ** retry_count
                logger.warning(f"Will retry in {retry_delay} seconds...")
                return self._retry_after(retry_delay, url, headers, cache_key, retry_count)
            return response
        except requests.exceptions.Timeout:
            logger.warning(f"{self.name} API request timed out for URL {url}")
            if retry_count >= self.max_retries:
                raise
            retry_delay = 2 ** retry_count
            logger.warning(f"Request timed out, retrying in {retry_delay} seconds...")
            return self._retry_after(retry_delay, url, headers, cache_key, retry_count)
        except requests.exceptions.RequestException as e:
            logger.warning(f"{self.name} API request error: {str(e)} for URL {url}")
            if retry_count >= self.max_retries:
                raise
            retry_delay = 2 ** retry_count
            logger.warning(f"{str(e)}, retrying in {retry_delay} seconds...")
            return self._retry_after(retry_delay, url, headers, cache_key, retry_count)

# =====================================
# Database-Specific API Functions
# =====================================

# Initialize rate-limited API clients
# One client per external service, each with its own cache and pacing.
# requests_per_second=0.5 means at most one request every two seconds;
# timeout is the per-request timeout in seconds.
kegg_api = RateLimitedAPI("KEGG", requests_per_second=1, timeout=20)
pubchem_api = RateLimitedAPI("PubChem", requests_per_second=2, timeout=25)
hmdb_api = RateLimitedAPI("HMDB", requests_per_second=0.5, timeout=25)
lipid_maps_api = RateLimitedAPI("LIPID_MAPS", requests_per_second=0.5, timeout=25)

def get_kegg_id(name):
    """Look up a compound by name in KEGG and return the identifier of the first hit.

    Returns None when the search yields nothing, the request fails, or any
    error occurs (errors are logged as warnings, not raised).
    """
    try:
        response = kegg_api.get(
            f"http://rest.kegg.jp/find/compound/{quote(name)}",
            cache_key=f"kegg_id_{name}",
        )
        if response and response.ok:
            body = response.text.strip()
            if body:
                # Each result line is "<id>\t<names>"; only the first line is used.
                top_hit = body.split('\n')[0]
                return top_hit.split('\t')[0]
    except Exception as e:
        logger.warning(f"Error in get_kegg_id for {name}: {str(e)}")
    return None

def get_pathways(kegg_id):
    """Return the KEGG pathway identifiers linked to a compound identifier.

    An empty list is returned for a falsy ``kegg_id``, an empty or failed
    response, or any error (errors are logged as warnings).
    """
    try:
        if not kegg_id:
            return []
        response = kegg_api.get(
            f"http://rest.kegg.jp/link/pathway/{kegg_id}",
            cache_key=f"pathways_{kegg_id}",
        )
        if not response or not response.ok or not response.text.strip():
            return []

        linked = []
        for row in response.text.strip().split('\n'):
            if not row:
                continue
            fields = row.split('\t')
            # Keep only well-formed "<compound>\t<pathway>" rows.
            if len(fields) == 2:
                linked.append(fields[1])
        return linked
    except Exception as e:
        logger.warning(f"Error in get_pathways for {kegg_id}: {str(e)}")
        return []

def get_pathway_name(pid):
    """Resolve a KEGG pathway identifier to the text after the first tab of its listing.

    Returns an empty string for a falsy ``pid``, an empty or failed response,
    or any error (errors are logged as warnings).
    """
    try:
        if not pid:
            return ""
        response = kegg_api.get(
            f"http://rest.kegg.jp/list/{pid}",
            cache_key=f"pathway_name_{pid}",
        )
        if response and response.ok:
            listing = response.text.strip()
            if listing:
                # A listing without a tab raises IndexError, handled below.
                return listing.split('\t')[1]
    except Exception as e:
        logger.warning(f"Error in get_pathway_name for {pid}: {str(e)}")
    return ""

def search_pubchem(name):
    """Find a compound in PubChem by name and collect pathway-like classification labels.

    Returns:
        tuple: ``(cid, labels)`` where ``cid`` is the first compound id found
        and ``labels`` are the classification descriptions that mention
        "metabol", "pathway" or "lipid". ``(None, [])`` is returned when the
        name lookup fails or any error occurs (errors are logged as warnings).
    """
    try:
        lookup = pubchem_api.get(
            f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{quote(name)}/cids/JSON",
            cache_key=f"pubchem_{name}",
        )
        if not lookup or not lookup.ok:
            return None, []

        data = lookup.json()
        if "IdentifierList" in data and "CID" in data["IdentifierList"]:
            cid = data["IdentifierList"]["CID"][0]
            class_response = pubchem_api.get(
                f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{cid}/classification/JSON",
                cache_key=f"pubchem_class_{cid}",
            )
            if not (class_response and class_response.ok):
                # The compound is known but its classification is unavailable.
                return cid, []

            class_data = class_response.json()
            if "Classification" in class_data and "Hierarchies" in class_data["Classification"]:
                hierarchies = class_data["Classification"]["Hierarchies"]
            else:
                hierarchies = []

            pathways = []
            for hierarchy in hierarchies:
                if "Node" not in hierarchy:
                    continue
                for node in hierarchy["Node"]:
                    if "Information" not in node or "Description" not in node["Information"]:
                        continue
                    desc = node["Information"]["Description"]
                    lowered = desc.lower()
                    if "metabol" in lowered or "pathway" in lowered or "lipid" in lowered:
                        pathways.append(desc)
            return cid, list(set(pathways))
    except Exception as e:
        logger.warning(f"Error in search_pubchem for {name}: {str(e)}")
    return None, []

def query_hmdb(name):
    """Return pathway/class labels for a metabolite by scraping HMDB search results.

    Tryptic peptides and names containing "adenosine" or "cytidine" are
    answered from fixed lists without any network request. Otherwise the
    first HMDB hit for the name is fetched and its "Pathway" and "Class"
    table cells are parsed. An empty list is returned when nothing is found
    or any error occurs (errors are logged as warnings).
    """
    try:
        if "(Tryptic Peptide)" in name:
            return ["Protein metabolism", "Peptide degradation"]

        special_cases = {"adenosine": ["Purine metabolism"], "cytidine": ["Pyrimidine metabolism"]}
        for keyword in special_cases:
            if keyword.lower() in name.lower():
                return special_cases[keyword]

        search_response = hmdb_api.get(
            f"http://www.hmdb.ca/unearth/q?query={quote(name)}&searcher=metabolites",
            cache_key=f"hmdb_{name}",
        )
        if not search_response or not search_response.ok:
            return []

        hmdb_ids = re.findall(r'href="/metabolites/(HMDB\d+)"', search_response.text)
        if not hmdb_ids:
            return []

        # Only the first search hit is inspected.
        top_id = hmdb_ids[0]
        page = hmdb_api.get(
            f"http://www.hmdb.ca/metabolites/{top_id}",
            cache_key=f"hmdb_metabolite_{top_id}",
        )
        if not page or not page.ok:
            return []

        labels = []
        for cell_pattern in (r'<td>Pathway</td>\s*<td>(.*?)</td>', r'<td>Class</td>\s*<td>(.*?)</td>'):
            for cell_html in re.findall(cell_pattern, page.text, re.DOTALL):
                # Replace markup with spaces, then split the text on , and ;
                cell_text = re.sub(r'<.*?>', ' ', cell_html).strip()
                for piece in re.split(r'[,;]', cell_text):
                    if piece.strip():
                        labels.append(piece.strip())
        return list(set(labels))
    except Exception as e:
        logger.warning(f"Error in query_hmdb for {name}: {str(e)}")
        return []

def query_lipid_maps(name):
    """Map a lipid name to pathway labels using class abbreviations found in the name.

    No network request is made: the name is checked (case-sensitively) for
    the abbreviations PC, PE, SM, Cer and FA, and each hit contributes the
    pathway labels of its lipid class. Returns a duplicate-free list, or an
    empty list on no match or error (errors are logged as warnings).
    """
    try:
        pathways_by_class = {
            "Fatty Acids": ["Fatty acid metabolism"],
            "Glycerophospholipids": ["Glycerophospholipid metabolism"],
            "Sphingolipids": ["Sphingolipid metabolism"],
            "Ceramides": ["Ceramide metabolism"],
        }
        class_by_abbreviation = {
            "PC": "Glycerophospholipids",
            "PE": "Glycerophospholipids",
            "SM": "Sphingolipids",
            "Cer": "Ceramides",
            "FA": "Fatty Acids",
        }

        matched = []
        for abbreviation in class_by_abbreviation:
            if abbreviation not in name:
                continue
            class_name = class_by_abbreviation[abbreviation]
            if class_name in pathways_by_class:
                matched += pathways_by_class[class_name]
        return list(set(matched))
    except Exception as e:
        logger.warning(f"Error in query_lipid_maps for {name}: {str(e)}")
        return []

# =====================================
# Metabolite Classification and Pathway Mapping
# =====================================

def is_lipid_metabolite(name):
    """
    Heuristically decide whether a metabolite name denotes a lipid.

    A name counts as a lipid when it
    * contains carbon:double-bond shorthand such as "PC 36:2" or "C16:0",
    * contains an unambiguous lipid word ("lipid", "cholesterol",
      "fatty acid", "ceramide", "sn-glycero"), or
    * contains a lipid class abbreviation (PC, PE, PS, PI, PG, PA, DAG, TAG,
      SM, Cer, optionally with a lyso/L or Glc/Hex/Lac prefix) as a
      stand-alone token.

    Abbreviations are matched as whole tokens, not raw substrings, so
    ordinary names that merely contain the letters (e.g. "Aspartic acid",
    "Peptide", "Spermidine") are not classified as lipids.

    Args:
        name: Metabolite name.

    Returns:
        bool: True when the name looks like a lipid; False otherwise,
        including for non-string input.
    """
    if not isinstance(name, str):
        return False

    name_lower = name.lower()

    # Carbon:double-bond shorthand, e.g. "pc 36:2", "c16:0"
    if re.search(r'[a-z]+\s?\d+:\d+', name_lower):
        return True

    # Words that are specific enough to be matched anywhere in the name
    lipid_words = ('lipid', 'cholesterol', 'fatty acid', 'ceramide', 'sn-glycero')
    if any(word in name_lower for word in lipid_words):
        return True

    # Class abbreviations must not be embedded in a longer run of letters.
    abbreviation = (
        r'(?<![a-z])'
        r'(?:(?:lyso|l)?(?:pc|pe|ps|pi|pg|pa)|dag|tag|sm|(?:glc|hex|lac)?cer)'
        r'(?![a-z])'
    )
    return re.search(abbreviation, name_lower) is not None

def get_metabolite_pathways(name, is_lipid=False):
    """
    Collect pathway annotations for one metabolite from several sources.

    Sources are consulted in this order:
    1. KEGG, using up to three name variations;
    2. the LIPID MAPS class heuristic, only for lipids without a KEGG result;
    3. HMDB, only while fewer than two pathways are known;
    4. keyword-based chemical class, only when nothing else was found.

    Names containing "(Tryptic Peptide)" are answered from a fixed list
    without querying any source.

    Args:
        name: Metabolite name.
        is_lipid: When True, lipid-specific name normalisation and the
            LIPID MAPS heuristic are used.

    Returns:
        dict: With these keys on every code path:
            "metabolite_name": the input name;
            "pathways_by_source": plain dict mapping source label to a
                non-empty list of pathways;
            "combined_pathways": list of distinct pathways in discovery order.
    """
    by_source = {}
    # dict used as an ordered set: keeps discovery order, drops duplicates
    combined = {}

    def _record(source, pathways):
        by_source[source] = pathways
        combined.update(dict.fromkeys(pathways))

    def _finish():
        return {
            "metabolite_name": name,
            "pathways_by_source": by_source,
            "combined_pathways": list(combined),
        }

    if not isinstance(name, str) or not name.strip():
        return _finish()

    # Tryptic peptides need no database lookups, so decide before doing any
    # name normalisation.
    if "(Tryptic Peptide)" in name:
        _record("Inferred", ["Protein metabolism", "Peptide degradation"])
        return _finish()

    name_variations = normalize_lipid_name(name) if is_lipid else normalize_metabolite_name(name)
    name_variations = name_variations[:10]  # Limit variations

    # Try KEGG
    try:
        for variation in name_variations[:3]:
            kegg_id = get_kegg_id(variation)
            if not kegg_id:
                continue
            kegg_pathway_names = []
            for pid in get_pathways(kegg_id):
                # Resolve each pathway id once and keep only non-empty names.
                pathway_name = get_pathway_name(pid)
                if pathway_name:
                    kegg_pathway_names.append(pathway_name)
            if kegg_pathway_names:
                _record("KEGG", kegg_pathway_names)
                break
    except Exception as e:
        logger.warning(f"Error in KEGG search for {name}: {e}")

    # Try LIPID MAPS for lipids if KEGG fails
    if is_lipid and not by_source.get("KEGG"):
        try:
            for variation in name_variations[:3]:
                lipid_pathways = query_lipid_maps(variation)
                if lipid_pathways:
                    _record("LIPID_MAPS", lipid_pathways)
                    break
        except Exception as e:
            logger.warning(f"Error in LIPID MAPS search for {name}: {e}")

    # Try HMDB if still not much found
    if len(combined) < 2:
        try:
            for variation in name_variations[:2]:
                hmdb_pathways = query_hmdb(variation)
                if hmdb_pathways:
                    _record("HMDB", hmdb_pathways)
                    break
        except Exception as e:
            logger.warning(f"Error in HMDB search for {name}: {e}")

    # Fallback to chemical class
    if not combined:
        class_pathways = determine_chemical_class(name)
        if class_pathways:
            _record("ChemicalClass", class_pathways)

    return _finish()

# =====================================
# Main Processing Function (Modified)
# =====================================

def process_metabolomics_data(file_path, output_path, metabolite_col=None, max_workers=3):
    """
    Process metabolomics data file, identify pathways, and save results.

    Reads the Excel workbook at ``file_path``, selects the metabolite-name
    column, looks up pathways for every unique, non-blank name via
    ``get_metabolite_pathways`` and writes three sheets (``Pathway_Summary``,
    ``Database_Contributions``, ``Unmapped_Metabolites``) to ``output_path``.

    Args:
        file_path: Path of the input Excel workbook.
        output_path: Path of the Excel workbook to write.
        metabolite_col: Title of the column holding metabolite names. When it
            is None, or is not present in the workbook, the first column whose
            title contains 'name', 'metabolite' or 'compound'
            (case-insensitive) is used instead.
        max_workers: Accepted for backward compatibility; this function does
            not use it and processes metabolites sequentially.

    Returns:
        A dict with the keys ``total_metabolites``, ``mapped_metabolites``,
        ``unmapped_metabolites``, ``total_pathways_found`` and
        ``database_contributions`` (database name -> number of metabolites it
        mapped), or None when the input file is missing, no metabolite column
        can be determined, or an unexpected error occurs. Errors are logged,
        not raised.
    """
    logger.info(f"Loading data from {file_path}...")

    try:
        # Only a failure to *read* the input means "input file not found"; a
        # FileNotFoundError raised later (e.g. while writing the output) is
        # reported by the generic handler below with its real message.
        try:
            df = pd.read_excel(file_path)
        except FileNotFoundError:
            logger.error(f"Input file not found at: {file_path}. Please check the path in Cell 1.")
            return None

        # Determine the metabolite column
        required_column = None
        if metabolite_col and metabolite_col in df.columns:
            required_column = metabolite_col
            logger.info(f"Using user-specified metabolite column: '{required_column}'")
        else:
            if metabolite_col:
                logger.warning(
                    f"Specified metabolite column '{metabolite_col}' was not found in the input file, "
                    "attempting to auto-detect..."
                )
            else:
                logger.info("`metabolite_column_name` not specified, attempting to auto-detect...")
            for col in df.columns:
                # Column titles read from Excel may be numbers or dates, so
                # convert to str before the case-insensitive keyword match.
                if any(keyword in str(col).lower() for keyword in ["name", "metabolite", "compound"]):
                    required_column = col
                    logger.info(f"Auto-detected metabolite column: '{required_column}'")
                    break

        if required_column is None:
            logger.error("Could not find a column containing metabolite names. Please specify it in Cell 1 using the `metabolite_column_name` variable.")
            return None

        # Drop blank entries and duplicates (keeping first-seen order) so each
        # metabolite is queried once and mapped + unmapped always equals total.
        raw_names = df[required_column].dropna().astype(str).tolist()
        metabolite_names = list(dict.fromkeys(n for n in raw_names if n.strip()))
        skipped = len(raw_names) - len(metabolite_names)
        if skipped:
            logger.info(f"Skipped {skipped} blank or duplicate metabolite name(s).")

        logger.info(f"Processing {len(metabolite_names)} metabolites...")

        pathway_to_metabolites = defaultdict(set)
        mapped_metabolites = set()
        database_contributions = defaultdict(set)

        for position, metabolite_name in enumerate(metabolite_names, start=1):
            logger.info(f"Processing metabolite {position}/{len(metabolite_names)}: {metabolite_name}")

            # A failure on one metabolite is logged and must not abort the run.
            try:
                is_lipid = is_lipid_metabolite(metabolite_name)
                pathway_info = get_metabolite_pathways(metabolite_name, is_lipid=is_lipid)

                if pathway_info["combined_pathways"]:
                    mapped_metabolites.add(metabolite_name)
                    for db_name, pathways in pathway_info["pathways_by_source"].items():
                        if pathways:
                            database_contributions[db_name].add(metabolite_name)
                    for pathway in pathway_info["combined_pathways"]:
                        pathway_to_metabolites[pathway].add(metabolite_name)
                    logger.info(f"  -> Found {len(pathway_info['combined_pathways'])} pathways.")
                else:
                    logger.info(f"  -> No pathways found for {metabolite_name}")
            except Exception as e:
                logger.error(f"Error processing metabolite {metabolite_name}: {e}")
                logger.error(traceback.format_exc())

        # Create summary dataframes. Columns are passed explicitly so that an
        # empty result still yields a frame that can be sorted and written.
        pathway_summary = [
            {'Pathway': p, 'Metabolite_Count': len(m), 'Metabolites': ', '.join(sorted(m))}
            for p, m in pathway_to_metabolites.items()
        ]
        pathway_df = (
            pd.DataFrame(pathway_summary, columns=['Pathway', 'Metabolite_Count', 'Metabolites'])
            .sort_values('Metabolite_Count', ascending=False)
            .reset_index(drop=True)
        )

        unmapped = [m for m in metabolite_names if m not in mapped_metabolites]
        unmapped_df = pd.DataFrame({'Unmapped_Metabolite': unmapped})

        # Percentage is relative to the mapped metabolites; one metabolite can
        # be credited to several databases, so the column need not sum to 100.
        db_contrib_summary = [
            {
                'Database': db,
                'Metabolites_Mapped': len(mets),
                'Percentage': (len(mets) / len(mapped_metabolites) * 100) if mapped_metabolites else 0,
            }
            for db, mets in database_contributions.items()
        ]
        db_contrib_df = (
            pd.DataFrame(db_contrib_summary, columns=['Database', 'Metabolites_Mapped', 'Percentage'])
            .sort_values('Metabolites_Mapped', ascending=False)
            .reset_index(drop=True)
        )

        # Save results to Excel
        logger.info(f"Saving results to {output_path}...")
        with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
            pathway_df.to_excel(writer, sheet_name='Pathway_Summary', index=False)
            db_contrib_df.to_excel(writer, sheet_name='Database_Contributions', index=False)
            unmapped_df.to_excel(writer, sheet_name='Unmapped_Metabolites', index=False)

        logger.info("Analysis complete!")
        return {
            'total_metabolites': len(metabolite_names),
            'mapped_metabolites': len(mapped_metabolites),
            'unmapped_metabolites': len(unmapped),
            'total_pathways_found': len(pathway_df),
            'database_contributions': {db: len(mets) for db, mets in database_contributions.items()}
        }

    except Exception as e:
        logger.error(f"An unexpected error occurred: {e}")
        logger.error(traceback.format_exc())
        return None

# =====================================
# Main Execution
# =====================================

# This block uses the variables defined in Cell 1 (input_file_path,
# timestamped_output_path and, optionally, metabolite_column_name).
if 'input_file_path' in globals() and 'timestamped_output_path' in globals():
    print(f"Starting analysis with input file: {input_file_path}")

    summary = process_metabolomics_data(
        file_path=input_file_path,
        output_path=timestamped_output_path,
        # Looked up defensively: if Cell 1 did not define it, pass None so the
        # column is auto-detected instead of raising NameError here.
        metabolite_col=globals().get('metabolite_column_name'),
    )

    # process_metabolomics_data returns None on failure (details are logged).
    if summary:
        print("\n--- Analysis Summary ---")
        print(f"Total metabolites processed: {summary['total_metabolites']}")
        if summary['total_metabolites'] > 0:
            map_perc = (summary['mapped_metabolites'] / summary['total_metabolites']) * 100
            print(f"Metabolites mapped to pathways: {summary['mapped_metabolites']} ({map_perc:.1f}%)")
        print(f"Metabolites without pathway information: {summary['unmapped_metabolites']}")
        print(f"Total unique pathways found: {summary['total_pathways_found']}")

        if summary['database_contributions']:
            print("\nDatabase Contribution Summary:")
            # Largest contributor first.
            for db, count in sorted(summary['database_contributions'].items(), key=lambda x: x[1], reverse=True):
                if summary['mapped_metabolites'] > 0:
                    db_perc = (count / summary['mapped_metabolites']) * 100
                    print(f"  - {db}: {count} metabolites ({db_perc:.1f}%)")
                else:
                    print(f"  - {db}: {count} metabolites")

        # Only point the user at the output file when the run actually succeeded.
        print(f"\nProcessing complete! Check the output file for results:\n{timestamped_output_path}")
    else:
        print("\nAnalysis did not complete successfully. See the log messages above for details.")

else:
    print("Configuration variables not found. Please run Cell 1 before running this cell.")

2025-08-06 11:06:06,542 - INFO - Loading data from /users/aranpurdy/desktop/cfps/pca/rf/MOD_RF_Imputed.xlsx...


Starting analysis with input file: /users/aranpurdy/desktop/cfps/pca/rf/MOD_RF_Imputed.xlsx


2025-08-06 11:06:08,922 - INFO - `metabolite_column_name` not specified, attempting to auto-detect...
2025-08-06 11:06:08,923 - INFO - Auto-detected metabolite column: 'Name'
2025-08-06 11:06:08,923 - INFO - Processing 115 metabolites...
2025-08-06 11:06:08,926 - INFO - Processing metabolite 1/115: 1,2-Dipalmitoyl-sn-glycero-3-phosphoethanolamine
2025-08-06 11:06:18,397 - INFO -   -> Found 3 pathways.
2025-08-06 11:06:18,400 - INFO - Processing metabolite 2/115: 1,2-Dipentadecanoyl-sn-glycero-3-phosphoethanolamine
2025-08-06 11:06:19,018 - INFO -   -> Found 3 pathways.
2025-08-06 11:06:19,019 - INFO - Processing metabolite 3/115: 1-Hydroxypyrene
2025-08-06 11:06:24,234 - INFO -   -> Found 2 pathways.
2025-08-06 11:06:24,236 - INFO - Processing metabolite 4/115: 1-Palmitoyl-2-oleoyl-sn-glycero-3-phosphoethanolamine
2025-08-06 11:06:33,147 - INFO -   -> No pathways found for 1-Palmitoyl-2-oleoyl-sn-glycero-3-phosphoethanolamine
2025-08-06 11:06:33,149 - INFO - Processing metabolite 5/115


--- Analysis Summary ---
Total metabolites processed: 115
Metabolites mapped to pathways: 95 (82.6%)
Metabolites without pathway information: 20
Total unique pathways found: 192

Database Contribution Summary:
  - KEGG: 70 metabolites (73.7%)
  - Inferred: 13 metabolites (13.7%)
  - ChemicalClass: 7 metabolites (7.4%)
  - LIPID_MAPS: 5 metabolites (5.3%)

Processing complete! Check the output file for results:
/users/aranpurdy/desktop/TEST/TEST_Pathways_20250806_110555.xlsx
