In [5]:
import gzip
from pathlib import Path

import dask.dataframe as dd
import numpy as np
import pandas as pd
import pyarrow as pa
from dask.distributed import Client
from dask.distributed import LocalCluster
from rdkit import Chem
from rdkit import RDLogger
from rdkit.Chem import rdMolTransforms

from confscale.metadata import get_etkdg_info

In [None]:
# Number of worker processes started by the local Dask cluster
N_WORKERS = 18

# Start the local cluster and attach a client to it; everything Dask
# computes further down in the notebook is scheduled through this client
cluster = LocalCluster(n_workers=N_WORKERS)
client = Client(cluster)

# Keep the client as the last expression so the notebook renders its summary
client

Perhaps you already have a cluster running?
Hosting the HTTP server on port 32947 instead


0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:32947/status,

0,1
Dashboard: http://127.0.0.1:32947/status,Workers: 18
Total threads: 36,Total memory: 31.03 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:37663,Workers: 0
Dashboard: http://127.0.0.1:32947/status,Total threads: 0
Started: Just now,Total memory: 0 B

0,1
Comm: tcp://127.0.0.1:46641,Total threads: 2
Dashboard: http://127.0.0.1:34097/status,Memory: 1.72 GiB
Nanny: tcp://127.0.0.1:35233,
Local directory: /tmp/dask-scratch-space/worker-4cymuih3,Local directory: /tmp/dask-scratch-space/worker-4cymuih3

0,1
Comm: tcp://127.0.0.1:44711,Total threads: 2
Dashboard: http://127.0.0.1:37357/status,Memory: 1.72 GiB
Nanny: tcp://127.0.0.1:34135,
Local directory: /tmp/dask-scratch-space/worker-kel528r2,Local directory: /tmp/dask-scratch-space/worker-kel528r2

0,1
Comm: tcp://127.0.0.1:34757,Total threads: 2
Dashboard: http://127.0.0.1:45665/status,Memory: 1.72 GiB
Nanny: tcp://127.0.0.1:45027,
Local directory: /tmp/dask-scratch-space/worker-3ly_o0_3,Local directory: /tmp/dask-scratch-space/worker-3ly_o0_3

0,1
Comm: tcp://127.0.0.1:33991,Total threads: 2
Dashboard: http://127.0.0.1:39121/status,Memory: 1.72 GiB
Nanny: tcp://127.0.0.1:36635,
Local directory: /tmp/dask-scratch-space/worker-bxgtoz_m,Local directory: /tmp/dask-scratch-space/worker-bxgtoz_m

0,1
Comm: tcp://127.0.0.1:35057,Total threads: 2
Dashboard: http://127.0.0.1:40895/status,Memory: 1.72 GiB
Nanny: tcp://127.0.0.1:39913,
Local directory: /tmp/dask-scratch-space/worker-3vtyxnje,Local directory: /tmp/dask-scratch-space/worker-3vtyxnje

0,1
Comm: tcp://127.0.0.1:40109,Total threads: 2
Dashboard: http://127.0.0.1:34683/status,Memory: 1.72 GiB
Nanny: tcp://127.0.0.1:41985,
Local directory: /tmp/dask-scratch-space/worker-kliuzs_x,Local directory: /tmp/dask-scratch-space/worker-kliuzs_x

0,1
Comm: tcp://127.0.0.1:37109,Total threads: 2
Dashboard: http://127.0.0.1:38143/status,Memory: 1.72 GiB
Nanny: tcp://127.0.0.1:37751,
Local directory: /tmp/dask-scratch-space/worker-rba8k60t,Local directory: /tmp/dask-scratch-space/worker-rba8k60t

0,1
Comm: tcp://127.0.0.1:43285,Total threads: 2
Dashboard: http://127.0.0.1:33625/status,Memory: 1.72 GiB
Nanny: tcp://127.0.0.1:38511,
Local directory: /tmp/dask-scratch-space/worker-03o7f_jz,Local directory: /tmp/dask-scratch-space/worker-03o7f_jz

0,1
Comm: tcp://127.0.0.1:35097,Total threads: 2
Dashboard: http://127.0.0.1:44195/status,Memory: 1.72 GiB
Nanny: tcp://127.0.0.1:46293,
Local directory: /tmp/dask-scratch-space/worker-oq8zj89_,Local directory: /tmp/dask-scratch-space/worker-oq8zj89_

0,1
Comm: tcp://127.0.0.1:42945,Total threads: 2
Dashboard: http://127.0.0.1:36855/status,Memory: 1.72 GiB
Nanny: tcp://127.0.0.1:42701,
Local directory: /tmp/dask-scratch-space/worker-qkiesqh4,Local directory: /tmp/dask-scratch-space/worker-qkiesqh4

0,1
Comm: tcp://127.0.0.1:43809,Total threads: 2
Dashboard: http://127.0.0.1:37533/status,Memory: 1.72 GiB
Nanny: tcp://127.0.0.1:39437,
Local directory: /tmp/dask-scratch-space/worker-oy5zw9mz,Local directory: /tmp/dask-scratch-space/worker-oy5zw9mz

0,1
Comm: tcp://127.0.0.1:46393,Total threads: 2
Dashboard: http://127.0.0.1:45925/status,Memory: 1.72 GiB
Nanny: tcp://127.0.0.1:35849,
Local directory: /tmp/dask-scratch-space/worker-q9a34l1t,Local directory: /tmp/dask-scratch-space/worker-q9a34l1t

0,1
Comm: tcp://127.0.0.1:41379,Total threads: 2
Dashboard: http://127.0.0.1:35109/status,Memory: 1.72 GiB
Nanny: tcp://127.0.0.1:34647,
Local directory: /tmp/dask-scratch-space/worker-6ih9ti0t,Local directory: /tmp/dask-scratch-space/worker-6ih9ti0t

0,1
Comm: tcp://127.0.0.1:33407,Total threads: 2
Dashboard: http://127.0.0.1:37567/status,Memory: 1.72 GiB
Nanny: tcp://127.0.0.1:33727,
Local directory: /tmp/dask-scratch-space/worker-kpjmxo40,Local directory: /tmp/dask-scratch-space/worker-kpjmxo40

0,1
Comm: tcp://127.0.0.1:43813,Total threads: 2
Dashboard: http://127.0.0.1:42427/status,Memory: 1.72 GiB
Nanny: tcp://127.0.0.1:33329,
Local directory: /tmp/dask-scratch-space/worker-m6llxy13,Local directory: /tmp/dask-scratch-space/worker-m6llxy13

0,1
Comm: tcp://127.0.0.1:36129,Total threads: 2
Dashboard: http://127.0.0.1:34295/status,Memory: 1.72 GiB
Nanny: tcp://127.0.0.1:37851,
Local directory: /tmp/dask-scratch-space/worker-bfqstbpu,Local directory: /tmp/dask-scratch-space/worker-bfqstbpu

0,1
Comm: tcp://127.0.0.1:41319,Total threads: 2
Dashboard: http://127.0.0.1:43439/status,Memory: 1.72 GiB
Nanny: tcp://127.0.0.1:40065,
Local directory: /tmp/dask-scratch-space/worker-abaohkkq,Local directory: /tmp/dask-scratch-space/worker-abaohkkq

0,1
Comm: tcp://127.0.0.1:39519,Total threads: 2
Dashboard: http://127.0.0.1:33119/status,Memory: 1.72 GiB
Nanny: tcp://127.0.0.1:36653,
Local directory: /tmp/dask-scratch-space/worker-_kyx8_gp,Local directory: /tmp/dask-scratch-space/worker-_kyx8_gp




In [7]:
def compressedsdf2mols(filepath: str | Path) -> list[Chem.Mol]:
    """
    Load every parsable molecule from a gzip-compressed SDF file.

    Parameters
    ----------
    filepath : str or Path
       Location of the compressed SDF file (.sdf.gz)

    Returns
    -------
    list[Chem.Mol]
       RDKit molecules in the order they are yielded by the supplier;
       records the supplier yields as None are left out

    Notes
    -----
    The RDKit logger level is set to CRITICAL as a side effect. The level is
    not restored afterwards, so the setting outlives the call.
    """
    # Only let critical RDKit messages through while the file is read
    RDLogger.logger().setLevel(RDLogger.CRITICAL)

    parsed = []
    with gzip.open(filepath) as handle:
        # The supplier streams records from the open handle, so it has to be
        # consumed completely before the file is closed
        for entry in Chem.ForwardSDMolSupplier(handle):
            if entry is not None:
                parsed.append(entry)
    return parsed

In [8]:
def process_molecule(mol: Chem.Mol) -> list[dict]:
    """
    Process a molecule to extract dihedral angle information.

    Parameters
    ----------
    mol : Chem.Mol
        RDKit molecule object with 3D coordinates.

    Returns
    -------
    list[dict]
        One dictionary per torsion returned by ``get_etkdg_info``, with keys:
        - identifier (str or None): value of the ``CSD_Entry_Name`` property,
          or None when the molecule does not carry that property
        - smarts (str): the ``"smarts"`` entry of the torsion info
        - angle (np.float32): dihedral angle in degrees; negative values are
          shifted by +360 so the result lies in [0, 360]

    Notes
    -----
    If ``get_etkdg_info`` raises, the error message is printed and an empty
    list is returned. The molecule's conformer is only accessed when at least
    one torsion was reported.
    """
    # init
    results = []

    # get torsion angle infos
    try:
        infos = get_etkdg_info(mol)
    except Exception as e:
        print(f"Error getting ETKDG info: {e}")
        return results

    # nothing to measure: return early without touching the conformer
    if not infos:
        return results

    # get CSD identifier
    identifier = mol.GetProp("CSD_Entry_Name") if mol.HasProp("CSD_Entry_Name") else None

    # the conformer does not change between torsions, so fetch it once
    conformer = mol.GetConformer()

    # Kept as float32 so the shift below never upcasts: with NumPy versions
    # that predate NEP 50, `np.float32 + 360` (Python int) yields float64,
    # which made the dtype of an angle depend on its sign.
    full_turn = np.float32(360)

    # iterate on the infos
    for angle_info in infos:
        angle = np.float32(rdMolTransforms.GetDihedralDeg(conformer, *angle_info["atomIndices"]))
        if angle < 0:
            angle = np.float32(angle + full_turn)
        results.append(
            {
                "identifier": identifier,
                "smarts": angle_info["smarts"],
                "angle": angle,
            }
        )

    return results


def partition_wrapper(partition: pd.DataFrame) -> pd.DataFrame:
    """
    Process a partition of a Dask DataFrame containing paths to SDF files.

    Parameters
    ----------
    partition : pd.DataFrame
        DataFrame partition with a 'filepath' column containing paths to compressed SDF files.

    Returns
    -------
    pd.DataFrame
        DataFrame with exactly the columns:
        - identifier: identifier reported by process_molecule (may be None)
        - smarts: SMARTS pattern reported by process_molecule
        - angle (float32): dihedral angle in degrees
        The three columns are present even when no angle was extracted, so the
        result always has the same layout.

    Notes
    -----
    Every row of the partition is processed, so the function works for
    partitions holding zero, one or several file paths. Each file is read with
    compressedsdf2mols and every molecule is passed to process_molecule.
    It's designed to be used with Dask's map_partitions.
    """
    columns = ["identifier", "smarts", "angle"]

    # collect the angle records of every molecule of every file in the partition
    results = []
    for sdf_gz_path in partition["filepath"]:
        for mol in compressedsdf2mols(sdf_gz_path):
            results.extend(process_molecule(mol))

    # Passing the column list keeps the layout stable when `results` is empty;
    # a bare pd.DataFrame([]) would have no columns at all.
    df = pd.DataFrame(results, columns=columns)

    # enforce the documented dtype irrespective of what pandas inferred
    return df.astype({"angle": "float32"})

In [4]:
# Directory that holds the batched, gzip-compressed SDF input files
datapath = Path("../data/processed/csd_sdf_batches/")

# Arrow schema applied when the dihedral-angle table is written to Parquet:
# three typed columns plus free-text metadata describing the dataset
schema = pa.schema(
    [
        pa.field("identifier", pa.string()),
        pa.field("smarts", pa.string()),
        pa.field("angle", pa.float32()),
    ],
    metadata=dict(
        description="Dataset containing molecular dihedral angle information",
        identifier_description="CSD identifier of the molecule",
        smarts_description="SMARTS patterns identifying rotatable bonds",
        angle_description="Dihedral angle in degrees (0-360)",
        source="CSD dataset processed with RDKit",
        creator="confscale pipeline",
    ),
)

In [9]:
# One row per compressed SDF batch. Path.glob yields entries in a
# filesystem-dependent order, so the paths are sorted to keep the mapping
# between input files and Dask partitions identical from run to run.
df = pd.DataFrame(sorted(datapath.glob("*.sdf.gz")), columns=["filepath"])

In [10]:
# Column names and dtypes Dask should expect back from partition_wrapper
output_meta = {"identifier": "str", "smarts": "str", "angle": "float32"}

# One partition per row, i.e. one SDF batch file per task
ddf = dd.from_pandas(df, npartitions=len(df)).map_partitions(partition_wrapper, meta=output_meta)

# Writing the dataset is what triggers the computation on the cluster.
# NOTE(review): part files already present in the target directory from an
# earlier run are not cleared here - confirm whether overwrite=True is wanted.
ddf.to_parquet("../data/processed/csd_dihedral_angles", schema=schema)

In [11]:
# Gather the part files written by Dask into one in-memory frame.
# The paths are sorted so the row order is reproducible, and ignore_index=True
# replaces the stacked per-file indexes (which repeat the same labels) with a
# single clean RangeIndex.
results_df = pd.concat(
    [pd.read_parquet(path) for path in sorted(Path("../data/processed/csd_dihedral_angles").glob("*.parquet"))],
    ignore_index=True,
)

In [12]:
# Persist the combined table as a single Parquet file inside `datapath`.
# NOTE(review): the file name reads "dehydrals" - presumably "dihedrals" was
# meant; confirm nothing reads this path before renaming it.
# NOTE(review): `datapath` is also the directory of the SDF inputs - confirm
# that the output is meant to live next to them.
results_df.to_parquet(datapath.joinpath("csd_dehydrals.parquet"))