In [1]:
from posebusters.posebusters import PoseBusters
from rdkit import Chem
import os
import pandas as pd

In [None]:

def _read_docking_scores(sdf_path: str, docking_prop: str) -> list:
    """
    Reads one docking score per record of `sdf_path` from the SDF
    property `docking_prop`.

    Returns a list with one entry per record yielded by the supplier:
    the score as a float, or None when the supplier yields None for the
    record, the record lacks the property, or the property value cannot
    be converted to a float.
    """
    scores = []
    for mol in Chem.SDMolSupplier(sdf_path, removeHs=False):
        score = None
        if mol is not None and mol.HasProp(docking_prop):
            try:
                score = float(mol.GetProp(docking_prop))
            except ValueError:
                # A malformed score is recorded as missing so that it does
                # not discard the PoseBusters results for this protein.
                score = None
        scores.append(score)
    return scores


def run_posebusters_for_approach(
    method_name: str,
    base_outdir: str,
    data_dir: str,
    rank_file: str = "rank1.sdf",
    docking_prop: str = "Score",  # or "Docking_Score"
    docking: bool = False,
) -> pd.DataFrame:
    """
    Scans `base_outdir` for subdirectories (each protein),
    loads the predicted SDF (default is rank1.sdf),
    loads the 'true' ligand and protein from `data_dir`,
    runs PoseBusters, and returns a DataFrame with the PoseBusters
    report columns plus `protein`, `method` and (optionally)
    `docking_score`.

    Parameters:
        method_name: Label stored in the `method` column and used as the
            prefix of every log line.
        base_outdir: Directory whose subdirectories are named after the
            proteins and contain the predicted pose file.
        data_dir: Directory holding `<protein>/<protein>_ligand.sdf` and
            `<protein>/<protein>_protein.pdb` reference files.
        rank_file: File name of the predicted pose inside each protein
            subdirectory.
        docking_prop: SDF property holding the docking score.
        docking: When True, read `docking_prop` from the predicted SDF
            and store it in a `docking_score` column.

    Returns:
        The concatenated per-protein results, or an empty DataFrame when
        `base_outdir` does not exist or no protein could be processed.

    Side effects:
        Writes `bust_results_<protein>.csv` into each processed protein
        subdirectory and prints progress/skip/error messages.
    """
    # A missing output directory is treated like the other skip cases so
    # that one absent method does not abort a multi-method run.
    if not os.path.isdir(base_outdir):
        print(f"[{method_name}] [SKIP] Output directory not found: {base_outdir}")
        return pd.DataFrame()

    pb = PoseBusters(config="redock", top_n=None)
    all_results = []

    # Loop over each protein subdirectory in base_outdir.
    # os.listdir returns entries in arbitrary order; sort for reproducible output.
    for protein_name in sorted(os.listdir(base_outdir)):
        protein_dir = os.path.join(base_outdir, protein_name)
        if not os.path.isdir(protein_dir):
            continue

        # Predicted pose file
        # Could also do: `answers_sdf = os.path.join(protein_dir, f"answers_{protein_name}.sdf")`
        answers_sdf = os.path.join(protein_dir, rank_file)
        if not os.path.isfile(answers_sdf):
            print(f"[{method_name}] [SKIP] No {rank_file} found for {protein_name}")
            continue

        # True ligand and PDB paths
        true_ligand = os.path.join(data_dir, protein_name, f"{protein_name}_ligand.sdf")
        protein_pdb = os.path.join(data_dir, protein_name, f"{protein_name}_protein.pdb")

        if not os.path.isfile(true_ligand):
            print(f"[{method_name}] [SKIP] True ligand not found: {true_ligand}")
            continue
        if not os.path.isfile(protein_pdb):
            print(f"[{method_name}] [SKIP] Protein PDB not found: {protein_pdb}")
            continue

        # Run PoseBusters; any failure is logged and the protein is skipped
        # so the remaining proteins are still processed.
        try:
            print(f"\n[{method_name}] [INFO] Running PoseBusters for protein={protein_name}")
            df_pb = pb.bust(
                mol_pred=answers_sdf,
                mol_true=true_ligand,
                mol_cond=protein_pdb,
                full_report=True
            )
            df_pb["protein"] = protein_name
            df_pb["method"] = method_name  # key column!

            # Optionally parse docking score
            if docking:
                docking_scores = _read_docking_scores(answers_sdf, docking_prop)
                # Scores are attached positionally, so they are only stored
                # when there is exactly one score per PoseBusters row.
                if len(docking_scores) != len(df_pb):
                    print(f"[{method_name}] [WARN] Mismatch: {len(docking_scores)} scores vs {len(df_pb)} PoseBusters rows")
                else:
                    df_pb["docking_score"] = docking_scores

            # Save per-protein CSV next to the predicted pose
            out_csv = os.path.join(protein_dir, f"bust_results_{protein_name}.csv")
            df_pb.to_csv(out_csv, index=False)
            print(f"[{method_name}] [INFO] Saved {out_csv}")

            all_results.append(df_pb)

        except Exception as e:
            print(f"[{method_name}] [ERROR] PoseBusters failed for {protein_name}: {e}")

    # Combine results for this method (empty frame if nothing was processed)
    if all_results:
        return pd.concat(all_results, ignore_index=True)
    return pd.DataFrame()

In [None]:
# Root of the PoseBench checkout. Set the POSEBENCH_ROOT environment variable
# to run this notebook on another machine; the default is the original
# hard-coded location.
posebench_root = os.environ.get(
    "POSEBENCH_ROOT", "/Users/aoxu/projects/DrugDiscovery/PoseBench"
)

# The folder containing the per-method output trees (ICM, Diffdock, chai-lab);
# the per-protein subdirectories like 5S8I_2LY, 5SD5_HWI, etc. live below those.
base_outdir = f"{posebench_root}/forks"

# The folder containing the real (crystal) ligand and protein PDB:
#   <posebench_root>/data/posebusters_benchmark_set/<protein>/
data_dir = f"{posebench_root}/data/posebusters_benchmark_set"

# Experiment tag used to build the method-specific output folder names.
exp_name = "posebusters_benchmark_outputs_1"

In [None]:
# Example: we have two or three methods.
# Each dict's keys match the keyword arguments of
# run_posebusters_for_approach, so it can be passed with ** unpacking.
methods_info = [
    {
        "method_name": "icm",
        "base_outdir": f"{base_outdir}/ICM/inference/icm_manual_{exp_name}",
        "data_dir": data_dir,
        "rank_file": "rank1.sdf",
        "docking_prop": "Score",  # or "Docking_Score"
        "docking": True
    },
    {
        "method_name": "diffdock",
        "base_outdir": f"{base_outdir}/Diffdock",
        "data_dir": data_dir,
        "rank_file": "rank1.sdf",
        "docking_prop": "Score",  # If DiffDock uses same property name
        "docking": True
    },
    {
        "method_name": "chai-1",
        "base_outdir": f"{base_outdir}/chai-lab/chai-lab_{exp_name}",
        "data_dir": data_dir,
        "rank_file": "pred.model_idx_0_ligand_aligned.sdf",
        "docking_prop": "minimizedAffinity",
        "docking": True
    },
    # Add more approaches as needed
]

# Run PoseBusters once per method and keep the per-method frames.
all_methods_dfs = []

for info in methods_info:
    df_method = run_posebusters_for_approach(**info)
    all_methods_dfs.append(df_method)

# Now combine them. Methods that produced no rows are left out, and an empty
# frame is used when nothing was produced, because pd.concat raises
# ValueError when given an empty list.
non_empty_dfs = [df for df in all_methods_dfs if not df.empty]
df_all = pd.concat(non_empty_dfs, ignore_index=True) if non_empty_dfs else pd.DataFrame()
print(df_all.shape)

# Save a single aggregated CSV. The path is relative to the notebook's
# working directory; change it to store the file elsewhere.
aggregated_csv = "all_methods_bust_results.csv"
df_all.to_csv(aggregated_csv, index=False)

# Last expression of the cell, so the preview is actually rendered.
df_all.head()