In [1]:
from bioservices import KEGG
import time
import os
import pandas as pd
from rdkit import Chem
from tqdm import tqdm

# Read data

In [12]:
kegg_db = pd.read_csv('0Rxn_11157.csv')
kegg_db = kegg_db.head(50)
compound_ids = kegg_db['mcid'].to_list()

# Get Mol.

In [39]:
def kegg_id_to_mol(kegg_id, kegg_service):
    try:
        mol_data = kegg_service.get(kegg_id, "mol")
        return mol_data
    except Exception as e:
        print(f"Error fetching MOL for {kegg_id}: {e}")
        return None

# Initialize KEGG service
k = KEGG()



In [None]:
comp_info_dir = os.path.join(os.getcwd(), 'comp_info')
os.makedirs(comp_info_dir, exist_ok=True)

# Use tqdm to show progress bar
for cid in tqdm(compound_ids, desc="Processing compounds", unit="compound"):
    mol_data = kegg_id_to_mol(cid, k)
    if mol_data:
        # Create a directory for the compound within comp_info
        compound_dir = os.path.join(comp_info_dir, cid)
        os.makedirs(compound_dir, exist_ok=True)
        
        # Save the MOL file within the directory
        mol_path = os.path.join(compound_dir, f"{cid}.mol")
        with open(mol_path, "w") as f:
            f.write(mol_data)
    time.sleep(1)  # Introduce a delay of 1 second between requests

# Add SMILES

In [None]:
def fetch_mol_from_kegg(kegg_id):
    try:
        mol_data = k.get(kegg_id, "mol")
        return mol_data
    except Exception as e:
        print(f"Error fetching MOL for {kegg_id}: {e}")
        return None

In [None]:
kegg_db['SMILES'] = 'N/A'

In [None]:
for index, row in kegg_db.iterrows():
    mol_data = fetch_mol_from_kegg(row['mcid'])
    if mol_data:
        mol = Chem.MolFromMolBlock(mol_data)
        if mol:  # Check if the MOL data was valid and a molecule was created
            smiles = Chem.MolToSmiles(mol)
            kegg_db.at[index, 'SMILES'] = smiles

In [None]:
kegg_db

In [None]:
kegg_db.to_csv('0rxn_short_100.csv')

# Get Pic

In [3]:
import os
from rdkit import Chem
from rdkit.Chem import Draw
from PIL import Image

def mol_to_image(mol_data, save_path):
    """Convert MOL data to an image and save it."""
    mol = Chem.MolFromMolBlock(mol_data)
    if not mol:
        print(f"Failed to create molecule for data in {save_path}")
        return
    img = Draw.MolToImage(mol, size=(300, 300))
    img.save(save_path)

# Path to the 'comp_info' directory
comp_info_path = os.path.join(os.getcwd(), 'comp_info')

# Iterate over each folder in 'comp_info'
for comp_id in os.listdir(comp_info_path):
    comp_dir = os.path.join(comp_info_path, comp_id)
    if os.path.isdir(comp_dir):
        # Assume the MOL file is named as 'mcid.mol'
        mol_file_path = os.path.join(comp_dir, f"{comp_id}.mol")
        
        if os.path.exists(mol_file_path):
            with open(mol_file_path, 'r') as f:
                mol_data = f.read()
            
            # Save the image as 'mcid_pic.png'
            img_save_path = os.path.join(comp_dir, f"{comp_id}_pic.png")
            mol_to_image(mol_data, img_save_path)

Failed to create molecule for data in /Users/bowen/Desktop/web_app/database/comp_info/C14129/C14129_pic.png


RDKit ERROR: [22:07:04] Explicit valence for atom # 8 O, 3, is greater than permitted
[22:07:04] Explicit valence for atom # 8 O, 3, is greater than permitted


Failed to create molecule for data in /Users/bowen/Desktop/web_app/database/comp_info/C19600/C19600_pic.png
Failed to create molecule for data in /Users/bowen/Desktop/web_app/database/comp_info/C15778/C15778_pic.png
Failed to create molecule for data in /Users/bowen/Desktop/web_app/database/comp_info/C19751/C19751_pic.png
Failed to create molecule for data in /Users/bowen/Desktop/web_app/database/comp_info/C20797/C20797_pic.png


[22:07:37] Unrecognized radical value 0 for atom 0 on line 31



Failed to create molecule for data in /Users/bowen/Desktop/web_app/database/comp_info/C01041/C01041_pic.png
Failed to create molecule for data in /Users/bowen/Desktop/web_app/database/comp_info/C20798/C20798_pic.png
Failed to create molecule for data in /Users/bowen/Desktop/web_app/database/comp_info/C05845/C05845_pic.png


RDKit ERROR: [22:08:05] Explicit valence for atom # 11 B, 7, is greater than permitted
[22:08:05] Explicit valence for atom # 11 B, 7, is greater than permitted


Failed to create molecule for data in /Users/bowen/Desktop/web_app/database/comp_info/C13681/C13681_pic.png


RDKit ERROR: [22:08:29] Explicit valence for atom # 25 N, 4, is greater than permitted
[22:08:29] Explicit valence for atom # 25 N, 4, is greater than permitted


Failed to create molecule for data in /Users/bowen/Desktop/web_app/database/comp_info/C02202/C02202_pic.png


In [15]:
import os
from rdkit import Chem
from rdkit.Chem import rdDepictor
from rdkit.Chem.Draw import rdMolDraw2D
from tqdm import tqdm
from PIL import Image, ImageDraw, ImageFont

def create_missing_image(save_path):
    """Create an image with the text 'Missing'."""
    img = Image.new('RGB', (600, 600))
    d = ImageDraw.Draw(img)
    fnt_size = 45
    try:
        fnt = ImageFont.truetype('arial.ttf', fnt_size)
    except IOError:
        fnt = ImageFont.load_default()
    text_w, text_h = d.textsize("Missing", font=fnt)
    d.text(((600 - text_w) / 2, (600 - text_h) / 2), "Missing", font=fnt, fill=(255, 0, 0))
    img.save(save_path)

def mol_to_image(mol_data, save_path, atom_color=None, resolution=600):
    """Convert MOL data to an image and save it with custom atom colors."""
    mol = Chem.MolFromMolBlock(mol_data)
    if not mol:
        create_missing_image(save_path)
        return
    
    rdDepictor.SetPreferCoordGen(True)
    drawer = rdMolDraw2D.MolDraw2DCairo(resolution, resolution)
    
    if atom_color:
        color_map = {i.GetIdx(): atom_color for i in mol.GetAtoms()}
        drawer.DrawMolecule(mol, highlightAtoms=color_map.keys(), highlightAtomColors=color_map)
    else:
        drawer.DrawMolecule(mol)
        
    drawer.FinishDrawing()
    img_data = drawer.GetDrawingText()
    
    with open(save_path, 'wb') as f:
        f.write(img_data)

# Path to the 'comp_info' directory
comp_info_path = os.path.join(os.getcwd(), 'comp_info')

# Create a list of directories to process
dirs_to_process = [os.path.join(comp_info_path, comp_id) for comp_id in os.listdir(comp_info_path) if os.path.isdir(os.path.join(comp_info_path, comp_id))]

# Iterate over each folder in 'comp_info' with a processing bar
for comp_dir in tqdm(dirs_to_process, desc="Processing molecules"):
    comp_id = os.path.basename(comp_dir)
    # Assume the MOL file is named as 'mcid.mol'
    mol_file_path = os.path.join(comp_dir, f"{comp_id}.mol")
    
    if os.path.exists(mol_file_path):
        with open(mol_file_path, 'r') as f:
            mol_data = f.read()

        # Save the image as 'mcid_pic.png'
        img_save_path = os.path.join(comp_dir, f"{comp_id}_pic.png")
        mol_to_image(mol_data, img_save_path, atom_color=(200, 100, 100), resolution=600)
    else:
        create_missing_image(img_save_path)

Processing molecules:   5%|▉                | 607/11156 [00:10<03:03, 57.41it/s]RDKit ERROR: [22:28:02] Explicit valence for atom # 8 O, 3, is greater than permitted
[22:28:02] Explicit valence for atom # 8 O, 3, is greater than permitted
[22:29:09] Unrecognized radical value 0 for atom 0 on line 31

Processing molecules:  70%|███████████▏    | 7823/11156 [02:15<01:00, 55.49it/s]RDKit ERROR: [22:30:07] Explicit valence for atom # 11 B, 7, is greater than permitted
[22:30:07] Explicit valence for atom # 11 B, 7, is greater than permitted
Processing molecules:  97%|██████████████▌| 10794/11156 [03:09<00:06, 55.42it/s]RDKit ERROR: [22:31:01] Explicit valence for atom # 25 N, 4, is greater than permitted
[22:31:01] Explicit valence for atom # 25 N, 4, is greater than permitted
Processing molecules: 100%|███████████████| 11156/11156 [03:15<00:00, 56.98it/s]


# Get pic

In [44]:
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem.Draw import rdMolDraw2D
import os
from tqdm import tqdm
from PIL import Image, ImageDraw, ImageFont


def mol_to_svg(mol_file_path, svg_file_path):
    mol = Chem.MolFromMolFile(mol_file_path)
    if not mol:
        print("Error: Couldn't read the molecule from the provided .mol file.")
        return

    # Using a molecular drawer
    drawer = rdMolDraw2D.MolDraw2DSVG(500, 500)  # Here, 300x300 is the image size. You can adjust as needed.
    drawer.DrawMolecule(mol)
    drawer.FinishDrawing()
    svg_data = drawer.GetDrawingText().replace("svg:", "")

    with open(svg_file_path, "w") as svg_file:
        svg_file.write(svg_data)
        
def create_missing_image(svg_file_path):
    """Generate a gray SVG with the text 'Missing'."""
    width, height = 300, 300
    img = Image.new('RGBA', (width, height), (0, 0, 0, 0))  # Transparent background

    draw = ImageDraw.Draw(img)
    font = ImageFont.truetype("arial.ttf", 40)  # Use a suitable font
    text_width, text_height = draw.textsize("Missing", font=font)
    draw.text(((width - text_width) / 2, (height - text_height) / 2), "Missing", fill="gray", font=font)

    with open(svg_file_path, "wb") as f:
        img.save(f, "SVG")

def process_compounds(base_path='comp_info'):
    for folder in tqdm(os.listdir(base_path)):
        folder_path = os.path.join(base_path, folder)
        if os.path.isdir(folder_path):
            mol_file = os.path.join(folder_path, f"{folder}.mol")
            svg_file = os.path.join(folder_path, f"{folder}_pic.svg")
            try:
                if os.path.exists(mol_file):
                    mol_to_svg(mol_file, svg_file)
                else:
                    create_missing_image(svg_file)
            except Exception as e:
                print(f"Error processing {folder}: {e}")
                create_missing_image(svg_file)

process_compounds()

In [45]:
# Example
mol_to_svg('comp_info/C00002/C00002.mol', 'comp_info/C00002/C00002_pic.svg')

# Get Class

In [15]:
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors
import pandas as pd
from tqdm import tqdm
from joblib import Parallel, delayed
import pubchempy as pcp

df = pd.read_csv('mcidxkegg_1811882_all.csv')
df = df.head(100)

In [17]:


# Define SMARTS patterns for different functional groups
functional_groups_smarts = {
    'amine': '[NX3;H2,H1,H0;!$(NC=O)]',
    'phenol': 'c1cc(ccc1O)O',
    'hydroxyl': '[OX2H]',
    'carboxyl': 'C(=O)[OX1H0-,OX2H1]',
    'carbonyl': '[CX3]=[OX1]'
}

def identify_functional_groups(mol):
    groups_present = {}
    for group_name, smarts in functional_groups_smarts.items():
        substructure = Chem.MolFromSmarts(smarts)
        groups_present[group_name] = mol.HasSubstructMatch(substructure)
    return groups_present

def smiles2info(smiles):
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol:
            formula = rdMolDescriptors.CalcMolFormula(mol)
            groups = identify_functional_groups(mol)
            compounds = pcp.get_compounds(smiles, namespace='smiles')
            if compounds:
                match = compounds[0]
                common_name = match.synonyms[0] if match.synonyms else None
                iupac_name = match.iupac_name if match.iupac_name else None
            else:
                common_name, iupac_name = None, None
            return [common_name, iupac_name, formula] + list(groups.values())
    except Exception as e:
        print(f"Error with SMILES {smiles}: {e}")
        return [smiles, None, None] + [None] * len(functional_groups_smarts)

# Parallel computation with progress bar
num_cores = -1  # Use all available cores
results = Parallel(n_jobs=num_cores)(delayed(smiles2info)(smile) for smile in tqdm(df['smiles'], total=df.shape[0]))

# Unpack the results and create new columns in the DataFrame
columns = ['CommonName', 'IUPACName', 'Formula'] + list(functional_groups_smarts.keys())
for i, col in enumerate(columns):
    df[col] = [result[i] for result in results]

# Save the updated DataFrame to a new CSV file
# df.to_csv('mcidxkegg_with_name_and_functional_groups.csv', index=False)



  0%|                                                   | 0/100 [00:00<?, ?it/s][A
  8%|███▍                                       | 8/100 [00:00<00:03, 26.27it/s][A
 16%|██████▋                                   | 16/100 [00:02<00:15,  5.56it/s][A
 24%|██████████                                | 24/100 [00:04<00:15,  4.92it/s][A
 32%|█████████████▍                            | 32/100 [00:06<00:14,  4.74it/s][A
 40%|████████████████▊                         | 40/100 [00:08<00:14,  4.03it/s][A
 48%|████████████████████▏                     | 48/100 [00:10<00:12,  4.07it/s][A
 56%|███████████████████████▌                  | 56/100 [00:12<00:10,  4.01it/s][A
 64%|██████████████████████████▉               | 64/100 [00:15<00:10,  3.47it/s][A
 72%|██████████████████████████████▏           | 72/100 [00:18<00:08,  3.27it/s][A
 80%|█████████████████████████████████▌        | 80/100 [00:22<00:07,  2.83it/s][A
 88%|████████████████████████████████████▉     | 88/100 [00:24<00:03,  3.09

In [18]:
df

Unnamed: 0,mcid,exact_mass,possible_reaction,smiles,substrate_id,substrate_name,substrate_name_all,substrate_formula,substrate_exact_mass,substrate_mol_weight,...,kegg_pathway,kegg_module,CommonName,IUPACName,Formula,amine,phenol,hydroxyl,carboxyl,carbonyl
0,C00002R01001,504.980095,R01,O=P(O)(O)OP(=O)(O)OP(=O)(O)OC=C1OC(N2C=NC=3C(=...,C00002,ATP,ATP;_Adenosine 5'-triphosphate,C10H16N5O13P3,506.9957,507.181,...,True,True,,,C10H14N5O13P3,True,False,True,False,False
1,C00002R14001,534.990660,R14,O=COC1C(O)C(OC1COP(=O)(O)OP(=O)(O)OP(=O)(O)O)N...,C00002,ATP,ATP;_Adenosine 5'-triphosphate,C10H16N5O13P3,506.9957,507.181,...,True,True,O=COC1C(O)C(OC1COP(=O)(O)OP(=O)(O)OP(=O)(O)O)N...,,,,,,,
2,C00002R14002,534.990660,R14,O=CC1(O)C(O)C(OC1COP(=O)(O)OP(=O)(O)OP(=O)(O)O...,C00002,ATP,ATP;_Adenosine 5'-triphosphate,C10H16N5O13P3,506.9957,507.181,...,True,True,,,C11H16N5O14P3,True,False,True,False,True
3,C00002R14003,534.990660,R14,O=CC1=NC=2C(=NC=NC2N1C3OC(COP(=O)(O)OP(=O)(O)O...,C00002,ATP,ATP;_Adenosine 5'-triphosphate,C10H16N5O13P3,506.9957,507.181,...,True,True,,,C11H16N5O14P3,True,False,True,False,True
4,C00002R14004,534.990660,R14,O=CC1(OC(COP(=O)(O)OP(=O)(O)OP(=O)(O)O)C(O)C1O...,C00002,ATP,ATP;_Adenosine 5'-triphosphate,C10H16N5O13P3,506.9957,507.181,...,True,True,,,C11H16N5O14P3,True,False,True,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,C00002R64002,749.014899,R64,O=P(O)(O)OCC1OC(OC2C(O)C(OC2N3C=NC=4C(=NC=NC43...,C00002,ATP,ATP;_Adenosine 5'-triphosphate,C10H16N5O13P3,506.9957,507.181,...,True,True,,,C16H27N5O21P4,True,False,True,False,False
96,C00002R65001,257.909556,R65,O=P(O)(O)OP(=O)(O)OP(=O)(O)O,C00002,ATP,ATP;_Adenosine 5'-triphosphate,C10H16N5O13P3,506.9957,507.181,...,True,True,Triphosphoric acid,diphosphono hydrogen phosphate,H5O10P3,False,False,True,False,False
97,C00002R66001,756.081935,R66,O=P(O)(O)OP(=O)(O)OP(=O)(O)OCC1OC(N2C=NC=3C(=N...,C00002,ATP,ATP;_Adenosine 5'-triphosphate,C10H16N5O13P3,506.9957,507.181,...,True,True,,,C20H27N10O16P3,True,False,True,False,False
98,C00002R66002,756.081935,R66,O=P(O)(O)OP(=O)(O)OP(=O)(O)OCC1OC(N2C=NC=3C(=N...,C00002,ATP,ATP;_Adenosine 5'-triphosphate,C10H16N5O13P3,506.9957,507.181,...,True,True,,,C20H27N10O16P3,True,False,True,False,False


Error with SMILES O=COC1C(O)C(OC1COP(=O)(O)OP(=O)(O)OP(=O)(O)O)N2C=NC=3C(=NC=NC32)N: 'PUGREST.ServerBusy'
