In [None]:
import pandas as pd
import numpy as np
from rdkit import Chem as Chem
from rdkit.Chem.MolStandardize import rdMolStandardize
from rdkit.Chem import rdMolDescriptors
import CBRdb

In [None]:
# Read the zipped CSV at ../data/atlas_data_R.csv.zip into a DataFrame.
data = pd.read_csv("../data/atlas_data_R.csv.zip")

# Report the column index, then preview the first five rows.
print("data columns", data.columns, flush=True)
print(data.head(n=5), flush=True)

In [None]:
# Keep only the rows whose kegg_id is present (non-NaN).
# `notna()` is the mask that matches this intent; `isna()` would instead keep
# the rows that are *missing* a kegg_id.
data_tmp = data[data["kegg_id"].notna()]

# Preview the first rows of the filtered frame.
print(data_tmp.head(), flush=True)

In [None]:
# Read the compound table from the zipped CSV and report its columns.
data_c = pd.read_csv("../data/kegg_data_C.csv.zip")
print("data columns", data_c.columns, flush=True)

# Order the rows by ascending molecular weight so the lightest entries come first.
data_c = data_c.sort_values("molecular_weight", ascending=True)

# Show the id, SMILES and formula columns for the 50 lightest entries.
print(data_c.loc[:, ["compound_id", "smiles", "formula"]].head(50), flush=True)

In [None]:
# Start from a SMILES string that carries "[H:0]" placeholder atoms.
smi = "[H:0]/C=C/[H:0]"
print(f"input: {smi}", flush=True)

# Swap every "[H:0]" placeholder for the "*" wildcard atom.
smi = "*".join(smi.split("[H:0]"))
print(f"replaced: {smi}", flush=True)

# Parse the SMILES and pass the molecule through CBRdb's standardisation,
# then kekulize it in place before writing it back out.
mol = CBRdb.standardize_mol(Chem.MolFromSmiles(smi))
Chem.Kekulize(mol)

smi_out = Chem.MolToSmiles(mol)
print(f"output: {smi_out}", flush=True)

# Report the molecular formula computed by RDKit for the processed molecule.
print(f"formula: {rdMolDescriptors.CalcMolFormula(mol)}", flush=True)

In [None]:
# Reference form of the equation, used only by the disabled assert at the end.
sol = "1 C00001 + 2 C00002 <=> 3 C00008 + 4 C00009"
# Input variant: left-hand terms in a different order, one with a negative coefficient.
eq = "-2 C00002 + 1 C00001 <=> 3 C00008 + 4 C00009"

# Split the equation into two objects (presumably reactant and product
# dictionaries - confirm against CBRdb.eq_to_dict).
r, p = CBRdb.eq_to_dict(eq)

# Show both sides; the original printed `r` twice and never displayed `p`.
print(r, flush=True)
print(p, flush=True)

# Rebuild the equation string from the two parts and show the result.
eq_new = CBRdb.dicts_to_eq(r, p)
print(eq_new, flush=True)

# NOTE(review): round-trip check was left disabled in the original; enable it
# once dicts_to_eq is confirmed to normalise term order and coefficient sign.
# assert sol == eq_new