# SMILES 2 Name

* Aim: Convert SMILES to [IUPAC name](http://www.chem.uiuc.edu/GenChemReferences/nomenclature_rules.html) or Common name or Formula
* Input: df (with SMILES)
* Output: df (with a new column Name)

# 1. Read Data

In [7]:
import pandas as pd
import pubchempy as pcp
from tqdm import tqdm
from rdkit import Chem
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors

In [9]:
# Register tqdm with pandas so that `progress_apply` (used in the Run section) is available.
tqdm.pandas()

# Load the full table, then keep only a 50-row random sample to work on.
df = pd.read_csv('mcidxkegg_1811882_all.csv')
df = df.sample(50, random_state=42)  # random_state is optional but ensures reproducibility

(1811882, 13)

# 2. Functions

In [3]:
def smiles_to_iupac_name(smiles):
    """
    Look up the IUPAC name of a molecule on PubChem.

    The first compound PubChem returns for the SMILES is used. Whenever no
    name can be obtained (no hit, the hit carries no IUPAC name, or the
    lookup raises), the input SMILES is returned unchanged.

    :param smiles: A SMILES representation of a molecule.
    :return: The IUPAC name, or the original SMILES as a fallback.
    """
    try:
        compounds = pcp.get_compounds(smiles, namespace='smiles')
        # A hit may exist without an IUPAC name; treat that like "no hit"
        # so the function falls back to the SMILES instead of returning None.
        if compounds and compounds[0].iupac_name:
            return compounds[0].iupac_name
    except Exception as e:
        # Best-effort lookup: report the failure and fall back to the SMILES
        # so that a single failed request does not abort a batch run.
        print(f"Error with SMILES {smiles}: {e}")
    return smiles


def smiles_to_common_name(smiles):
    """
    Look up a common name for a molecule on PubChem.

    :param smiles: A SMILES representation of a molecule.
    :return: The first synonym of the first PubChem hit, or the original
             SMILES when there is no hit, no synonym, or the lookup fails.
    """
    try:
        hits = pcp.get_compounds(smiles, namespace='smiles')
        # `synonyms` is a list of alternate names for the compound; the first one is usually the most common name.
        names = hits[0].synonyms if hits else None
        if names:
            return names[0]
    except Exception as e:
        print(f"Error with SMILES {smiles}: {e}")
    return smiles

def smiles_to_formula(smiles):
    """
    Compute the molecular formula of a SMILES string with RDKit.

    :param smiles: A SMILES representation of a molecule.
    :return: The molecular formula string, or None when the input is not a
             string, cannot be parsed, or the formula calculation fails.
    """
    # Missing values (None, NaN) are not SMILES; skip RDKit entirely.
    if not isinstance(smiles, str):
        return None
    try:
        mol = Chem.MolFromSmiles(smiles)
        # RDKit reports an unparsable SMILES by returning None, not by raising.
        if mol is None:
            return None
        return Chem.rdMolDescriptors.CalcMolFormula(mol)
    except Exception:
        # Narrower than a bare `except:` so KeyboardInterrupt and SystemExit
        # still propagate and a long batch run can be interrupted.
        return None

In [4]:
def smiles2name(smiles):
    """
    Convert a SMILES string to its common name, IUPAC name, or molecular formula.
    If all conversions fail, return the original SMILES string.

    The PubChem lookup and the RDKit formula calculation are guarded
    separately, so a failed PubChem request still falls through to the
    formula stage instead of skipping it.

    :param smiles: A SMILES representation of a molecule.
    :return: A string representing the molecule's common name, IUPAC name, formula, or original SMILES.
    """

    # [Stage 1: Fetch compound details from PubChem]
    try:
        compounds = pcp.get_compounds(smiles, namespace='smiles')
        if compounds:
            match = compounds[0]

            # [Stage 2: Extract common name]
            if match.synonyms:
                return match.synonyms[0]

            # [Stage 3: Extract IUPAC name if common name is absent]
            if match.iupac_name:
                return match.iupac_name
    except Exception as e:
        # Best-effort: report the failed lookup and continue with Stage 4.
        print(f"Error with SMILES {smiles}: {e}")

    # [Stage 4: Compute molecular formula if both names are absent]
    try:
        mol = Chem.MolFromSmiles(smiles)
        # MolFromSmiles returns None for an unparsable SMILES.
        if mol is not None:
            formula = Chem.rdMolDescriptors.CalcMolFormula(mol)
            if formula:
                return formula
    except Exception as e:
        print(f"Error with SMILES {smiles}: {e}")

    # [Stage 5: Fallback to original SMILES if all above stages fail]
    return smiles

# 3. Run

In [5]:
# Apply smiles2name to every value of the `smiles` column (with a progress bar)
# and store the results in a new `Name` column.
df['Name'] = df['smiles'].progress_apply(smiles2name)

 42%|██████████████████                         | 21/50 [00:51<01:13,  2.55s/it]

Error with SMILES O=C[S](C)CCCCCCCCC(=NOS(=O)(=O)O)SC1OC(CO)C(O)C(O)C1O: 'PUGREST.BadRequest: error: '


100%|███████████████████████████████████████████| 50/50 [02:00<00:00,  2.41s/it]


In [6]:
# Display the sampled DataFrame, including the new `Name` column.
df

Unnamed: 0,mcid,exact_mass,possible_reaction,smiles,substrate_id,substrate_name,substrate_name_all,substrate_formula,substrate_exact_mass,substrate_mol_weight,kegg_reaction,kegg_pathway,kegg_module,Name
691101,C10173R20009,217.095023,R20,O=C(O)C1N2CCCC(O)C2C(O)C1O,C10173,Swainsonine,Swainsonine,C8H15NO3,173.1052,173.2096,True,True,False,"1,2,8-trihydroxy-1,2,3,5,6,7,8,8a-octahydroind..."
61856,C00751R04007,424.406902,R04,C(=C(C)C)CCC(=CCCC(=CCCC=C(C)CCC=C(CC)CCC=C(C)...,C00751,Squalene,Squalene;_Spinacene;_Supraene,C30H50,410.3913,410.718,True,True,False,C31H52
1717639,C21308R36001,288.077993,R36,O=C(NCCS(=O)(=O)O)CC(N)C1=CC=C(O)C=C1,C21308,(S)-beta-Tyrosine,(S)-beta-Tyrosine,C9H11NO3,181.0739,181.1885,True,True,True,C11H16N2O5S
1539547,C19937R16008,340.26136,R16,O=C(O)CCCCCCC=CC(OO)CC(=CCCCCC)CC,C19937,"(8E,10R,12Z)-10-Hydroperoxy-8,12-octadecadienoate","(8E,10R,12Z)-10-Hydroperoxy-8,12-octadecadienoate",C18H32O4,312.2301,312.4443,True,True,False,C20H36O4
306115,C04217R08009,534.251142,R08,O=P(O)(O)OP(=O)(O)OCC=C(C)CCC=C(C)CCC=C(C)CCC(...,C04217,all-trans-Pentaprenyl diphosphate,all-trans-Pentaprenyl diphosphate;_Geranylfarn...,C25H44O7P2,518.2562,518.5602,True,False,False,C25H44O8P2
1709656,C21264R60013,824.376342,R60,O=C1N=C(N)C=CN1C2OC(COC3C(O)C(OC(OC4C(O)C(OC4C...,C21264,5'''-epi-Lividomycin B,5'''-epi-Lividomycin B,C23H45N5O13,599.3014,599.6291,True,True,False,C32H56N8O17
696879,C10385R20032,792.605661,R20,O=C1C=C(C(=O)C(=C1C)C)CC=C(C)CCC=C(C)CC(C=C(C)...,C10385,Plastoquinone-9,Plastoquinone-9;_Plastoquinone A,C53H80O2,748.6158,749.2011,True,False,False,C54H80O4
1137115,C15777R20014,442.344695,R20,O=C(O)C1C=C2C3CCC(C(C)CCC(=C)C(C)C)C3(C)CCC2C4...,C15777,Episterol,Episterol,C28H46O,398.3549,398.6642,True,True,True,C29H46O3
940757,C13662R14006,1111.432769,R14,O=COC1=CC=C(C=C1)CC2NC(=O)C(N)CSSCC(NC(=O)C(NC...,C13662,Arg-vasopressin,Arg-vasopressin;_Argipressin;_8-L-Arginine vas...,C46H65N15O12S2,1083.4379,1084.2316,False,True,False,C47H65N15O13S2
504303,C06417R04004,131.094629,R04,O=C(O)C(NC)C(C)C,C06417,D-Valine,D-Valine;_(R)-2-Amino-3-methylbutyric acid,C5H11NO2,117.079,117.1463,False,False,False,N-Methyl-DL-valine


In [9]:
# import pandas as pd
# plant = pd.read_csv('mcidxplant_0rxn.csv')
# df = plant

In [10]:
# df['mcid'] = ['MCIDP' + str(i).zfill(6) for i in range(1, 218894)]
# column_order = ['mcid'] + [col for col in df.columns if col != 'mcid']
# df = df[column_order]
# df.to_csv('mcidxplant_0rxn.csv', index=False)

# csv2sql

In [13]:
import pandas as pd
import sqlite3

In [14]:
def csv_to_sql(csv_file, db_name, table_name):
    """
    Load a CSV file into a table of a SQLite database.

    An existing table with the same name is replaced, and the DataFrame
    index is not written as a column.

    :param csv_file: Path of the CSV file to read.
    :param db_name: Path of the SQLite database file (created if it doesn't exist).
    :param table_name: Name of the table to write.
    """
    # Read CSV data into a pandas DataFrame
    df = pd.read_csv(csv_file)

    # Connect to the SQLite database (or create it if it doesn't exist)
    conn = sqlite3.connect(db_name)
    try:
        # Write the DataFrame to the SQLite database
        df.to_sql(table_name, conn, if_exists='replace', index=False)
    finally:
        # Close the connection even when the write fails, so the database
        # file handle is not leaked.
        conn.close()

In [15]:
# Load '0Rxn_11157_with_smiles.csv' into table `kegg_0_rxn` of mcid2.db (replacing any existing table).
csv_to_sql('0Rxn_11157_with_smiles.csv', 'mcid2.db', 'kegg_0_rxn')

In [16]:
# Load 'mcidxkegg_1811882_all.csv' into table `kegg_1_rxn` of mcid2.db (replacing any existing table).
csv_to_sql('mcidxkegg_1811882_all.csv', 'mcid2.db', 'kegg_1_rxn')  

In [17]:
# Load 'mcidxplant_0rxn.csv' into table `plant_0_rxn` of mcid2.db (replacing any existing table).
csv_to_sql('mcidxplant_0rxn.csv', 'mcid2.db', 'plant_0_rxn')