In [1]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import Descriptors, Lipinski, MolSurf, GraphDescriptors, Crippen
from rdkit.Chem import rdMolDescriptors, AllChem, QED
import os
import re
from rdkit.Chem.Scaffolds import MurckoScaffold
from rdkit.Chem import Fragments

def detail_glycans(glycans_df, structure_col=None, name_col=None, output_file=None):
    """
    Process a dataframe containing glycan structures and add detailed molecular descriptors.
    
    Parameters:
    -----------
    glycans_df : pandas.DataFrame
        DataFrame containing glycan structures
    structure_col : str, optional
        Column name containing the glycan structure (SMILES, IUPAC, etc.)
        If None, will attempt to auto-detect based on common column names
    name_col : str, optional
        Column name containing the glycan name or ID
        If None, will attempt to auto-detect based on common column names
    output_file : str, optional
        If provided, save the detailed dataframe to this file path
        
    Returns:
    --------
    pandas.DataFrame
        Original dataframe with additional glycan descriptor columns
    """
    # Make a copy to avoid modifying the original
    df = glycans_df.copy()
    
    # Try to auto-detect the structure column if not provided
    if structure_col is None:
        potential_struct_cols = ['Structure', 'SMILES', 'Structure_SMILES', 'GlycanStructure', 
                                'Sequence', 'GlycanSequence', 'LinearStructure', 'IUPAC']
        for col in potential_struct_cols:
            if col in df.columns:
                structure_col = col
                break
        
        # If still not found, use the first column that might contain SMILES-like strings
        if structure_col is None:
            for col in df.columns:
                # Check a sample of values to see if they look like glycan structures
                sample = df[col].dropna().astype(str).iloc[:5].tolist()
                if any(['(' in s or 'O' in s or 'C' in s for s in sample]):
                    structure_col = col
                    break
    
    # If we still couldn't find a structure column, raise an error
    if structure_col is None:
        raise ValueError("Could not identify a column containing glycan structures. Please specify using structure_col parameter.")
    
    # Try to auto-detect the name column if not provided
    if name_col is None:
        potential_name_cols = ['Name', 'ID', 'GlycanName', 'GlycanID', 'Glycan_Name', 'Glycan_ID']
        for col in potential_name_cols:
            if col in df.columns:
                name_col = col
                break
    
    # If no name column, create one with indices
    if name_col is None:
        df['GlycanID'] = [f"Glycan_{i}" for i in range(len(df))]
        name_col = 'GlycanID'
    
    print(f"Using '{structure_col}' as the structure column and '{name_col}' as the name column")
    
    # Convert IUPAC or other glycan notations to SMILES if needed
    df['SMILES'] = df[structure_col].apply(convert_to_smiles)
    
    # Add detailed descriptors
    detailed_data = []
    
    for idx, row in df.iterrows():
        try:
            glycan_name = row[name_col]
            smiles = row['SMILES']
            
            if pd.isna(smiles) or smiles == '':
                print(f"Warning: Empty structure for {glycan_name}, skipping")
                detailed_data.append({name_col: glycan_name})
                continue
                
            mol = Chem.MolFromSmiles(smiles)
            if mol is None:
                print(f"Warning: Could not parse SMILES for {glycan_name}: {smiles}")
                detailed_data.append({name_col: glycan_name})
                continue
                
            # Calculate descriptors
            descriptors = calculate_descriptors(mol)
            if descriptors is None:
                print(f"Warning: Could not calculate descriptors for {glycan_name}")
                detailed_data.append({name_col: glycan_name})
                continue
                
            # Add name and SMILES to descriptors
            descriptors[name_col] = glycan_name
            
            # Add glycan-specific features
            add_glycan_specific_features(descriptors, mol, smiles)
                
            # Add to data
            detailed_data.append(descriptors)
            
        except Exception as e:
            print(f"Error processing {idx}: {e}")
            detailed_data.append({name_col: row.get(name_col, f"Row_{idx}")})
    
    # Convert to DataFrame
    detailed_df = pd.DataFrame(detailed_data)
    
    # Merge with original data to preserve any columns we didn't process
    if name_col in detailed_df.columns:
        result = pd.merge(df, detailed_df, on=name_col, how='left')
    else:
        # If we somehow don't have the name column in the detailed data, just return it as is
        result = detailed_df
    
    # Save to file if requested
    if output_file is not None:
        result.to_csv(output_file, index=False)
        print(f"Saved detailed dataset to {output_file}")
    
    return result

def convert_to_smiles(structure):
    """
    Convert various glycan structure formats to SMILES.
    This is a placeholder - in production, you would implement proper conversion 
    from IUPAC or other glycan notation systems to SMILES.
    """
    if pd.isna(structure):
        return ''
    
    structure = str(structure)
    
    # If it already looks like a SMILES string, return it
    if 'C' in structure and 'O' in structure and any(c in structure for c in '()='):
        return structure
    
    # For this simplified example, we'll map some common glycan names to SMILES
    # In a real implementation, you would use proper conversion libraries
    glycan_map = {
        "Glucose": "OC[C@H]1O[C@H](O)[C@H](O)[C@@H](O)[C@@H]1O",
        "Glc": "OC[C@H]1O[C@H](O)[C@H](O)[C@@H](O)[C@@H]1O",
        "Galactose": "OC[C@H]1O[C@H](O)[C@H](O)[C@@H](O)[C@@H]1O",
        "Gal": "OC[C@H]1O[C@H](O)[C@H](O)[C@@H](O)[C@@H]1O",
        "Mannose": "OC[C@H]1O[C@H](O)[C@H](O)[C@@H](O)[C@@H]1O",
        "Man": "OC[C@H]1O[C@H](O)[C@H](O)[C@@H](O)[C@@H]1O",
        "GlcNAc": "CC(=O)N[C@@H]1C(O)O[C@H](CO)[C@@H](O)[C@@H]1O",
        "N-Acetylglucosamine": "CC(=O)N[C@@H]1C(O)O[C@H](CO)[C@@H](O)[C@@H]1O",
        "Fuc": "CC[C@H]1O[C@H](O)[C@H](O)[C@@H](O)[C@@H]1O",
        "Fucose": "CC[C@H]1O[C@H](O)[C@H](O)[C@@H](O)[C@@H]1O"
    }
    
    # Check for exact matches
    if structure in glycan_map:
        return glycan_map[structure]
    
    # For strings that might contain glycan codes, try to replace them
    # This is very simplified - real implementation would need proper parsing
    result = structure
    for key, smiles in glycan_map.items():
        if key in structure:
            result = smiles
            break
    
    return result

def calculate_descriptors(mol):
    """
    Calculate molecular descriptors for a molecule.
    
    Parameters:
    -----------
    mol : rdkit.Chem.rdchem.Mol
        RDKit molecule object
    
    Returns:
    --------
    dict
        Dictionary of calculated descriptors
    """
    if mol is None:
        return None
    
    # Ensure we have a 3D conformation for certain descriptors
    if mol.GetNumConformers() == 0:
        try:
            mol = Chem.AddHs(mol)
            AllChem.EmbedMolecule(mol)
            AllChem.UFFOptimizeMolecule(mol)
        except:
            # If 3D embedding fails, we'll just continue without it
            pass
    
    descriptors = {}
    
    # Basic properties
    descriptors["MolecularWeight"] = Descriptors.MolWt(mol)
    descriptors["HeavyAtomCount"] = Descriptors.HeavyAtomCount(mol)
    descriptors["RingCount"] = Descriptors.RingCount(mol)
    descriptors["RotatableBondCount"] = Descriptors.NumRotatableBonds(mol)
    descriptors["HBondDonorCount"] = Descriptors.NumHDonors(mol)
    descriptors["HBondAcceptorCount"] = Descriptors.NumHAcceptors(mol)
    descriptors["TPSA"] = Descriptors.TPSA(mol)
    
    # Lipinski properties
    descriptors["LogP"] = Descriptors.MolLogP(mol)
    descriptors["MolMR"] = Descriptors.MolMR(mol)
    descriptors["FractionCSP3"] = Descriptors.FractionCSP3(mol)
    
    # Topological properties
    try:
        descriptors["BertzCT"] = Descriptors.BertzCT(mol)
        descriptors["Chi0v"] = Descriptors.Chi0v(mol)
        descriptors["Chi1v"] = Descriptors.Chi1v(mol)
        descriptors["Chi2v"] = Descriptors.Chi2v(mol)
        descriptors["Chi3v"] = Descriptors.Chi3v(mol)
        descriptors["Chi4v"] = Descriptors.Chi4v(mol)
        descriptors["HallKierAlpha"] = Descriptors.HallKierAlpha(mol)
        descriptors["Kappa1"] = Descriptors.Kappa1(mol)
        descriptors["Kappa2"] = Descriptors.Kappa2(mol)
        descriptors["Kappa3"] = Descriptors.Kappa3(mol)
    except:
        # Some molecules might cause issues with certain descriptors
        pass
    
    # Surface area and volume descriptors
    try:
        descriptors["LabuteASA"] = Descriptors.LabuteASA(mol)
    except:
        pass
    
    # 3D descriptors (if 3D conformation is available)
    if mol.GetNumConformers() > 0:
        try:
            descriptors["PMI1"] = rdMolDescriptors.CalcPMI1(mol)
            descriptors["PMI2"] = rdMolDescriptors.CalcPMI2(mol)
            descriptors["PMI3"] = rdMolDescriptors.CalcPMI3(mol)
            descriptors["NPR1"] = rdMolDescriptors.CalcNPR1(mol)
            descriptors["NPR2"] = rdMolDescriptors.CalcNPR2(mol)
            descriptors["RadiusOfGyration"] = rdMolDescriptors.CalcRadiusOfGyration(mol)
            descriptors["InertialShapeFactor"] = rdMolDescriptors.CalcInertialShapeFactor(mol)
            descriptors["Eccentricity"] = rdMolDescriptors.CalcEccentricity(mol)
            descriptors["Asphericity"] = rdMolDescriptors.CalcAsphericity(mol)
            descriptors["SpherocityIndex"] = rdMolDescriptors.CalcSpherocityIndex(mol)
        except:
            # Some 3D descriptors might fail for certain molecules
            pass
    
    # VSA descriptors
    try:
        for i in range(1, 11):
            descriptors[f"SlogP_VSA{i}"] = getattr(MolSurf, f"SlogP_VSA{i}")(mol)
            
        for i in range(1, 12):
            descriptors[f"SMR_VSA{i}"] = getattr(MolSurf, f"SMR_VSA{i}")(mol)
            
        for i in range(1, 14):
            descriptors[f"PEOE_VSA{i}"] = getattr(MolSurf, f"PEOE_VSA{i}")(mol)
    except:
        # VSA descriptors might fail for some molecules
        pass
    
    return descriptors

def add_glycan_specific_features(descriptors, mol, smiles):
    """
    Add glycan-specific features to the descriptors dictionary.
    
    Parameters:
    -----------
    descriptors : dict
        Dictionary of descriptors to add to
    mol : rdkit.Chem.rdchem.Mol
        RDKit molecule object
    smiles : str
        SMILES string of the glycan
    """
    
    # Try to count features typical for glycans using SMARTS patterns
    try:
        # Ring oxygen count (important in glycans as they form the glycosidic bond)
        ring_o_pattern = Chem.MolFromSmarts('[OR1]')
        if ring_o_pattern:
            descriptors["RingOCount"] = len(mol.GetSubstructMatches(ring_o_pattern))
        
        # Pyranose ring count (6-membered sugar ring)
        pyranose_pattern = Chem.MolFromSmarts('[CR1]1[OR1][CR0][CR0][CR0][CR0]1')
        if pyranose_pattern:
            descriptors["PyranoseCount"] = len(mol.GetSubstructMatches(pyranose_pattern))
        
        # Furanose ring count (5-membered sugar ring)
        furanose_pattern = Chem.MolFromSmarts('[CR1]1[OR1][CR0][CR0][CR0]1')
        if furanose_pattern:
            descriptors["FuranoseCount"] = len(mol.GetSubstructMatches(furanose_pattern))
        
        # Branching points (carbons connected to 2 or more oxygens)
        branch_pattern = Chem.MolFromSmarts('[CR0]([OR0])([OR0])')
        if branch_pattern:
            descriptors["BranchingPoints"] = len(mol.GetSubstructMatches(branch_pattern))
        
        # Glycosidic linkages
        glycosidic_pattern = Chem.MolFromSmarts('[OR0]([CR1])([CR1])')
        if glycosidic_pattern:
            descriptors["GlycosidicLinkages"] = len(mol.GetSubstructMatches(glycosidic_pattern))
        
        # Amino groups (found in amino sugars like GlcNAc)
        amino_pattern = Chem.MolFromSmarts('[NR0]')
        if amino_pattern:
            descriptors["AminoGroups"] = len(mol.GetSubstructMatches(amino_pattern))
        
        # Acetyl groups (found in N-acetylated sugars)
        acetyl_pattern = Chem.MolFromSmarts('CC(=O)[NR0]')
        if acetyl_pattern:
            descriptors["AcetylGroups"] = len(mol.GetSubstructMatches(acetyl_pattern))
        
        # Carboxyl groups (found in sialic acids)
        carboxyl_pattern = Chem.MolFromSmarts('C(=O)[OH]')
        if carboxyl_pattern:
            descriptors["CarboxylGroups"] = len(mol.GetSubstructMatches(carboxyl_pattern))
        
        # Methyl groups (found in deoxy sugars like fucose)
        methyl_pattern = Chem.MolFromSmarts('[CR0][CH3]')
        if methyl_pattern:
            descriptors["MethylGroups"] = len(mol.GetSubstructMatches(methyl_pattern))
        
    except Exception as e:
        print(f"Error calculating glycan-specific features: {e}")
    
    # Try to determine glycan class based on features
    try:
        if descriptors.get("PyranoseCount", 0) + descriptors.get("FuranoseCount", 0) == 1:
            descriptors["GlycanClass"] = "Monosaccharide"
        elif descriptors.get("PyranoseCount", 0) + descriptors.get("FuranoseCount", 0) == 2:
            descriptors["GlycanClass"] = "Disaccharide"
        elif descriptors.get("PyranoseCount", 0) + descriptors.get("FuranoseCount", 0) > 2:
            if descriptors.get("BranchingPoints", 0) > 0:
                descriptors["GlycanClass"] = "Branched Oligosaccharide"
            else:
                descriptors["GlycanClass"] = "Linear Oligosaccharide"
        else:
            descriptors["GlycanClass"] = "Unknown"
            
        # Add structure complexity score (simple heuristic)
        complexity = (
            (descriptors.get("PyranoseCount", 0) + descriptors.get("FuranoseCount", 0)) * 2 +
            descriptors.get("BranchingPoints", 0) * 3 +
            descriptors.get("GlycosidicLinkages", 0) +
            descriptors.get("AminoGroups", 0) +
            descriptors.get("AcetylGroups", 0) * 2 +
            descriptors.get("CarboxylGroups", 0) * 2
        )
        descriptors["ComplexityScore"] = complexity
        
    except Exception as e:
        print(f"Error determining glycan class: {e}")

    return descriptors

# Example usage:
if __name__ == "__main__":
    # This is a simple example of how to use the function
    # Create a sample dataframe with glycan structures
    sample_df = pd.DataFrame({
        'GlycanID': ['Glc1', 'Gal1', 'Complex1'],
        'Structure': ['Glucose', 'Galactose', 'Glc-Gal']
    })
    
    # Process the sample dataframe
    detailed_df = detail_glycans(sample_df, output_file="detailed_glycans.csv")
    
    # Print the columns in the new dataframe
    print(f"Original columns: {sample_df.columns.tolist()}")
    print(f"New columns: {detailed_df.columns.tolist()}")
    print(f"Added {len(detailed_df.columns) - len(sample_df.columns)} new columns")

Using 'Structure' as the structure column and 'GlycanID' as the name column
Saved detailed dataset to detailed_glycans.csv
Original columns: ['GlycanID', 'Structure']
New columns: ['GlycanID', 'Structure', 'SMILES', 'MolecularWeight', 'HeavyAtomCount', 'RingCount', 'RotatableBondCount', 'HBondDonorCount', 'HBondAcceptorCount', 'TPSA', 'LogP', 'MolMR', 'FractionCSP3', 'BertzCT', 'Chi0v', 'Chi1v', 'Chi2v', 'Chi3v', 'Chi4v', 'HallKierAlpha', 'Kappa1', 'Kappa2', 'Kappa3', 'LabuteASA', 'PMI1', 'PMI2', 'PMI3', 'NPR1', 'NPR2', 'RadiusOfGyration', 'InertialShapeFactor', 'Eccentricity', 'Asphericity', 'SpherocityIndex', 'SlogP_VSA1', 'SlogP_VSA2', 'SlogP_VSA3', 'SlogP_VSA4', 'SlogP_VSA5', 'SlogP_VSA6', 'SlogP_VSA7', 'SlogP_VSA8', 'SlogP_VSA9', 'SlogP_VSA10', 'SMR_VSA1', 'SMR_VSA2', 'SMR_VSA3', 'SMR_VSA4', 'SMR_VSA5', 'SMR_VSA6', 'SMR_VSA7', 'SMR_VSA8', 'SMR_VSA9', 'SMR_VSA10', 'RingOCount', 'PyranoseCount', 'FuranoseCount', 'BranchingPoints', 'GlycosidicLinkages', 'AminoGroups', 'AcetylGroups',

In [None]:
# Read your glycan structures file
glycans = pd.read_csv('../data/Glycan-Structures-CFG611.txt', sep="\s") 

# Add detailed molecular descriptors
glycans_detailed = detail_glycans(glycans)

# Save the enhanced dataset if desired
glycans_detailed.to_csv('detailed_glycans.csv', index=False)

  glycans = pd.read_csv('../data/Glycan-Structures-CFG611.txt', sep="\s")
  glycans = pd.read_csv('../data/Glycan-Structures-CFG611.txt', sep="\s")


Using 'SMILES' as the structure column and 'Name' as the name column
