In [11]:

import rdkit
from rdkit import Chem
from rdkit.Chem import rdSubstructLibrary
import pandas as pd
print(rdkit.__version__)


def build_chembl_library(chemreps_path, chunksize=100_000):
    """
    Build an RDKit SubstructLibrary for all molecules in chembl_XX_chemreps.txt.

    Returns
    -------
    lib : rdSubstructLibrary.SubstructLibrary
        Library containing all molecules (with fingerprints).
    meta : list[tuple[str, str]]
        List mapping internal index -> (chembl_id, canonical_smiles).
    """
    lib = rdSubstructLibrary.SubstructLibrary()

    meta = []  # index -> (chembl_id, smiles)
    i = 0

    for chunk in pd.read_csv(chemreps_path, sep="\t", chunksize=chunksize):
        print(f"chunk {i}")
        i+=1
        if "chembl_id" not in chunk.columns or "canonical_smiles" not in chunk.columns:
            raise KeyError("Expected 'chembl_id' and 'canonical_smiles' in chemreps file")


        for j, row in chunk.iterrows():
            #print(f"smi {j}")
            
            smi = row["canonical_smiles"]
            if not isinstance(smi, str):
                continue

            mol = Chem.MolFromSmiles(smi)
            if mol is None:
                continue

            lib.AddMol(mol)
            meta.append((row["chembl_id"], smi))


        break

    return lib, meta

def search_library_for_patterns(lib, meta, patterns, num_threads=8):
    """
    Use SubstructLibrary to find all molecules that match any ncAA pattern.

    Parameters
    ----------
    lib : rdSubstructLibrary.SubstructLibrary
        Library built by build_chembl_library.
    meta : list[tuple[str, str]]
        index -> (chembl_id, smiles).
    patterns : list[(str, rdkit.Chem.Mol)]
        (ncaa_name, RDKit Mol pattern from SMARTS).
    num_threads : int
        Number of threads for RDKit's internal matching.

    Returns
    -------
    hits_df : pd.DataFrame
        Columns: chembl_id, canonical_smiles, matched_ncaa (comma-joined names).
    """
    # index -> set of ncAA names that matched
    idx_to_ncaa = {}

    for name, pat in patterns:
        print(name)
        if pat is None:
            continue

        # SubstructLibrary uses FP prefilter + substructure search
        mol_indices = lib.GetMatches(pat, numThreads=num_threads)
        print(len(mol_indices))
        for idx in mol_indices:
            print(idx)
            if idx not in idx_to_ncaa:
                idx_to_ncaa[idx] = set()
            idx_to_ncaa[idx].add(name)

    # Convert into a flat DataFrame
    rows = []
    for idx, ncaa_names in idx_to_ncaa.items():
        chembl_id, smi = meta[idx]
        rows.append(
            {
                "chembl_id": chembl_id,
                "canonical_smiles": smi,
                "matched_ncaa": ",".join(sorted(ncaa_names)),
            }
        )

    if not rows:
        return pd.DataFrame(columns=["chembl_id", "canonical_smiles", "matched_ncaa"])

    return pd.DataFrame(rows)



2025.09.3


In [12]:
lib, meta = build_chembl_library('uaa_data/chembl_36_chemreps.txt', chunksize=100_000)

print(lib)

chunk 0


[10:38:16] Explicit valence for atom # 13 P, 7, is greater than permitted
[10:38:20] Explicit valence for atom # 29 P, 7, is greater than permitted
[10:38:22] Explicit valence for atom # 91 P, 7, is greater than permitted


<rdkit.Chem.rdSubstructLibrary.SubstructLibrary object at 0x15c303740>


In [None]:
# 2) Search it with your patterns (list of (name, RDKit MolFromSmarts))
hits_df = search_library_for_patterns(
    lib,
    meta,
    patterns,
    num_threads=8,   # or os.cpu_count()
)