In [8]:
import plinder as pl
import numpy as np
import pandas as pd
import os
from rdkit import Chem
from rdkit.Chem import AllChem, Descriptors
import glob


In [2]:
BASE_DIR = '/mnt/katritch_lab2/aoxu/2024-06/v2/'

In [4]:
df = pd.read_parquet(os.path.join(BASE_DIR, "index/annotation_table.parquet"))
print(df.columns)
print(df.shape)

# print(df.describe())
# print(df.info())

print(df.head())

Index(['entry_pdb_id', 'entry_release_date', 'entry_oligomeric_state',
       'entry_determination_method', 'entry_keywords', 'entry_pH',
       'entry_resolution', 'entry_validation_resolution',
       'entry_validation_rfree', 'entry_validation_r',
       ...
       'system_ligand_has_fragment', 'system_ligand_has_oligo',
       'system_ligand_has_artifact', 'system_ligand_has_other',
       'system_ligand_has_covalent', 'system_ligand_has_invalid',
       'system_ligand_has_ion', 'system_protein_chains_total_length',
       'system_unique_ccd_codes', 'system_proper_unique_ccd_codes'],
      dtype='object', length=743)
(1357906, 743)
  entry_pdb_id entry_release_date entry_oligomeric_state  \
0         3grt         1997-02-12                dimeric   
1         3grt         1997-02-12                dimeric   
2         3grt         1997-02-12                dimeric   
3         3grt         1997-02-12                dimeric   
4         1grx         1993-10-01              monomeric

In [3]:
splits_df = pd.read_parquet(os.path.join(BASE_DIR, "splits/split.parquet"))
print(splits_df.columns)
print(splits_df.shape)
display(splits_df.head())

Index(['system_id', 'uniqueness', 'split', 'cluster', 'cluster_for_val_split',
       'system_pass_validation_criteria', 'system_pass_statistics_criteria',
       'system_proper_num_ligand_chains', 'system_proper_pocket_num_residues',
       'system_proper_num_interactions',
       'system_proper_ligand_max_molecular_weight',
       'system_has_binding_affinity', 'system_has_apo_or_pred'],
      dtype='object')
(409726, 13)


Unnamed: 0,system_id,uniqueness,split,cluster,cluster_for_val_split,system_pass_validation_criteria,system_pass_statistics_criteria,system_proper_num_ligand_chains,system_proper_pocket_num_residues,system_proper_num_interactions,system_proper_ligand_max_molecular_weight,system_has_binding_affinity,system_has_apo_or_pred
0,101m__1__1.A__1.C_1.D,101m__A__C_D_c188899,train,c14,c0,True,True,1,27,20,616.177293,False,False
1,102m__1__1.A__1.C,102m__A__C_c237197,train,c14,c0,True,True,1,26,20,616.177293,False,True
2,103m__1__1.A__1.C_1.D,103m__A__C_D_c252759,train,c14,c0,False,True,1,26,16,616.177293,False,False
3,104m__1__1.A__1.C_1.D,104m__A__C_D_c274687,train,c14,c0,False,True,1,27,21,616.177293,False,False
4,105m__1__1.A__1.C_1.D,105m__A__C_D_c221688,train,c14,c0,False,True,1,28,20,616.177293,False,False


In [4]:
splits_df.groupby('split').size()

split
removed     98718
test         1036
train      309140
val           832
dtype: int64

In [5]:
inputs_df = splits_df[splits_df['split'] == 'test']

In [14]:
inputs_df['protein_path'] = inputs_df['system_id'].apply(lambda x: os.path.join(BASE_DIR, "systems", x, "receptor.pdb"))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  inputs_df['protein_path'] = inputs_df['system_id'].apply(lambda x: os.path.join(BASE_DIR, "systems", x, "receptor.pdb"))


## Extract ligand smile and ligand path, get the list of ions if any

In [9]:
def get_ligand_and_ions(ligand_dir: str):
    """
    Find ligand SDF files and differentiate between the true ligand and ions.
    
    Returns a tuple of (ligand_smiles, ligand_path, ion_paths)
    """
    full_path = os.path.join(BASE_DIR, "systems", ligand_dir)
    print(f"Searching for SDF files in: {full_path}")
    
    ligand_files = glob.glob(os.path.join(full_path, "ligand_files", "*.sdf"))
    print(f"Found {len(ligand_files)} SDF files: {ligand_files}")
    
    if len(ligand_files) == 0:
        print(f"No ligand files found for {ligand_dir}")
        return None, None, []
    
    # If only one file, assume it's the ligand
    if len(ligand_files) == 1:
        smiles = extract_smi(ligand_files[0])
        return smiles, ligand_files[0], []
    
    # Multiple files found - need to determine which is the ligand and which are ions
    print(f"Multiple ligand files found for {ligand_dir}: {ligand_files}")
    
    # Process all files to extract molecules and properties
    molecules = []
    for file_path in ligand_files:
        mol = extract_mol(file_path)
        if mol:
            # Calculate molecular properties to help identify ligands vs ions
            mw = Descriptors.MolWt(mol)
            num_atoms = mol.GetNumAtoms()
            smiles = Chem.MolToSmiles(mol)
            charge = Chem.GetFormalCharge(mol)
            
            molecules.append({
                'path': file_path,
                'mol': mol,
                'mw': mw,
                'num_atoms': num_atoms,
                'smiles': smiles,
                'charge': charge
            })
    
    if not molecules:
        print(f"No valid molecules found in any of the files for {ligand_dir}")
        return None, None, []
    
    # Sort molecules by complexity (typically the most complex molecule is the ligand)
    # We use molecular weight and atom count as proxy for complexity
    molecules.sort(key=lambda x: (x['mw'], x['num_atoms']), reverse=True)
    
    # The largest molecule is likely the ligand
    ligand = molecules[0]
    ions = molecules[1:] if len(molecules) > 1 else []
    
    # Get paths for ions
    ion_paths = [ion['path'] for ion in ions]
    
    print(f"Identified likely ligand: {os.path.basename(ligand['path'])} (MW: {ligand['mw']:.2f}, Atoms: {ligand['num_atoms']})")
    if ions:
        print(f"Identified likely ions: {[os.path.basename(ion['path']) for ion in ions]}")
    
    return ligand['smiles'], ligand['path'], ion_paths

def extract_mol(ligand_path: str):
    """Extract the first molecule from an SDF file."""
    try:
        print(f"Processing SDF file: {ligand_path}")
        supplier = Chem.SDMolSupplier(ligand_path)
        
        if supplier is None or len(supplier) == 0 or supplier[0] is None:
            print(f"No valid molecules found in {ligand_path}")
            return None
        
        return supplier[0]
    except Exception as e:
        print(f"Error extracting molecule from {ligand_path}: {str(e)}")
        return None

def extract_smi(ligand_path: str):
    """Extract SMILES from an SDF file."""
    mol = extract_mol(ligand_path)
    if mol is None:
        return None
    return Chem.MolToSmiles(mol)

# Apply the function to get ligand and ion information
results = inputs_df['system_id'].apply(lambda x: get_ligand_and_ions(x))

# Split the results into separate columns
inputs_df['ligand_description'] = results.apply(lambda x: x[0])
inputs_df['ligand_path'] = results.apply(lambda x: x[1])
inputs_df['ion_paths'] = results.apply(lambda x: x[2])

# Debug information
none_count = inputs_df['ligand_description'].isna().sum()
print(f"Number of None values in ligand_description: {none_count} out of {len(inputs_df)}")
print(f"Systems with ions: {sum(inputs_df['ion_paths'].apply(lambda x: len(x) > 0))}")

Searching for SDF files in: /mnt/katritch_lab2/aoxu/2024-06/v2/systems/1afb__1__1.A__1.D_1.F
Found 2 SDF files: ['/mnt/katritch_lab2/aoxu/2024-06/v2/systems/1afb__1__1.A__1.D_1.F/ligand_files/1.D.sdf', '/mnt/katritch_lab2/aoxu/2024-06/v2/systems/1afb__1__1.A__1.D_1.F/ligand_files/1.F.sdf']
Multiple ligand files found for 1afb__1__1.A__1.D_1.F: ['/mnt/katritch_lab2/aoxu/2024-06/v2/systems/1afb__1__1.A__1.D_1.F/ligand_files/1.D.sdf', '/mnt/katritch_lab2/aoxu/2024-06/v2/systems/1afb__1__1.A__1.D_1.F/ligand_files/1.F.sdf']
Processing SDF file: /mnt/katritch_lab2/aoxu/2024-06/v2/systems/1afb__1__1.A__1.D_1.F/ligand_files/1.D.sdf
Processing SDF file: /mnt/katritch_lab2/aoxu/2024-06/v2/systems/1afb__1__1.A__1.D_1.F/ligand_files/1.F.sdf
Identified likely ligand: 1.D.sdf (MW: 221.21, Atoms: 15)
Identified likely ions: ['1.F.sdf']
Searching for SDF files in: /mnt/katritch_lab2/aoxu/2024-06/v2/systems/1b5d__1__1.A_1.B__1.D
Found 1 SDF files: ['/mnt/katritch_lab2/aoxu/2024-06/v2/systems/1b5d__1__1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  inputs_df['ligand_description'] = results.apply(lambda x: x[0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  inputs_df['ligand_path'] = results.apply(lambda x: x[1])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  inputs_df['ion_paths'] = results.apply(lambda x: x[2])


In [11]:
inputs_df

Unnamed: 0,system_id,uniqueness,split,cluster,cluster_for_val_split,system_pass_validation_criteria,system_pass_statistics_criteria,system_proper_num_ligand_chains,system_proper_pocket_num_residues,system_proper_num_interactions,system_proper_ligand_max_molecular_weight,system_has_binding_affinity,system_has_apo_or_pred,protein_path,ligand_description,ligand_path,ion_paths
710,1afb__1__1.A__1.D_1.F,1afb__A__D_F_c61933,test,c190,c0,True,True,1,11,5,221.089937,False,False,/mnt/katritch_lab2/aoxu/2024-06/v2/system/1afb...,CC(=O)N[C@@H]1[C@@H](O)[C@@H](O)[C@@H](CO)O[C@...,/mnt/katritch_lab2/aoxu/2024-06/v2/systems/1af...,[/mnt/katritch_lab2/aoxu/2024-06/v2/systems/1a...
1468,1b5d__1__1.A_1.B__1.D,1b5d__A_B__D_c33816,test,c118,c40,True,True,1,24,12,307.056936,False,False,/mnt/katritch_lab2/aoxu/2024-06/v2/system/1b5d...,Nc1ccn([C@H]2C[C@H](O)[C@@H](COP(=O)(O)O)O2)c(...,/mnt/katritch_lab2/aoxu/2024-06/v2/systems/1b5...,[]
1704,1bcj__1__1.B__1.I_1.K,1bcj__B__I_K_c259630,test,c190,c0,True,True,1,11,4,221.089937,False,False,/mnt/katritch_lab2/aoxu/2024-06/v2/system/1bcj...,CC(=O)N[C@@H]1[C@@H](O)[C@@H](O)[C@@H](CO)O[C@...,/mnt/katritch_lab2/aoxu/2024-06/v2/systems/1bc...,[/mnt/katritch_lab2/aoxu/2024-06/v2/systems/1b...
3235,1ci0__1__1.A_1.B__1.D,1ci0__A_B__D_c254020,test,c1048,c713,True,True,1,32,23,456.104615,False,False,/mnt/katritch_lab2/aoxu/2024-06/v2/system/1ci0...,Cc1cc2nc3c(=O)[nH]c(=O)nc-3n(C[C@H](O)[C@H](O)...,/mnt/katritch_lab2/aoxu/2024-06/v2/systems/1ci...,[]
4701,1d7c__1__1.A_1.B__1.E_1.K,1d7c__A_B__E_K_c323398,test,c1891,c45,True,True,1,30,20,616.177293,False,False,/mnt/katritch_lab2/aoxu/2024-06/v2/system/1d7c...,C=CC1=C(C)C2=Cc3c(C)c(CCC(=O)O)c4n3[Fe@SP2]35<...,/mnt/katritch_lab2/aoxu/2024-06/v2/systems/1d7...,[/mnt/katritch_lab2/aoxu/2024-06/v2/systems/1d...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
403431,8tjc__1__1.B__1.N,8tjc__B__N_c165059,test,c1427,c1191,True,True,1,17,7,390.205576,False,True,/mnt/katritch_lab2/aoxu/2024-06/v2/system/8tjc...,C[C@@H](n1c(=O)[nH]c2ccc(C3=NC(=O)[C@](C)(c4cc...,/mnt/katritch_lab2/aoxu/2024-06/v2/systems/8tj...,[]
404124,8ttj__4__1.D__1.CA,8ttj__D__CA_c223289,test,c1180,c959,True,True,1,24,9,238.050905,False,True,/mnt/katritch_lab2/aoxu/2024-06/v2/system/8ttj...,N[C@@H](Cc1c[nH]c2cc(Cl)ccc12)C(=O)O,/mnt/katritch_lab2/aoxu/2024-06/v2/systems/8tt...,[]
405850,8uuf__1__1.A__1.B,8uuf__A__B_c294524,test,c659,c456,True,True,1,20,12,486.274324,False,False,/mnt/katritch_lab2/aoxu/2024-06/v2/system/8uuf...,Cc1ccc(OCCN(C)C)cc1C(=O)N[C@H](C)c1cc(-c2cnn(C...,/mnt/katritch_lab2/aoxu/2024-06/v2/systems/8uu...,[]
407360,8w6p__1__1.A_1.B__1.E,8w6p__A_B__E_c296023,test,c1183,c921,True,True,1,18,6,408.174395,False,False,/mnt/katritch_lab2/aoxu/2024-06/v2/system/8w6p...,CC(=O)N[C@H]1[C@H](O[C@H]2[C@H](O)[C@@H](NC(C)...,/mnt/katritch_lab2/aoxu/2024-06/v2/systems/8w6...,[]


In [15]:
inputs_df['complex_name'] = inputs_df['system_id']
output_file_path = '../forks/gnina/inference/gnina_plinder_benchmark_inputs.csv'

inputs_df[["complex_name", "protein_path", "ligand_description", "ligand_path"]].to_csv(output_file_path, index=False)

print(f"Data saved to {output_file_path}")

Data saved to ../forks/gnina/inference/gnina_plinder_benchmark_inputs.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  inputs_df['complex_name'] = inputs_df['system_id']


In [1]:
import pandas as pd
tmp = pd.read_csv("../forks/DiffDock/inference/diffdock_pocket_only_plinder_benchmark_inputs_subset.csv")

In [3]:
gnina = pd.read_csv("/home/aoxu/projects/PoseBench/forks/GNINA/inference/gnina_plinder_benchmark_inputs_subset.csv")

In [3]:
tmp.head()

Unnamed: 0,complex_name,protein_path,ligand_description,ligand_path,protein_sequence
0,1afb__1__1.A__1.D_1.F,/mnt/katritch_lab2/aoxu/data/plinder/cropped_b...,CC(=O)N[C@@H]1[C@@H](O)[C@@H](O)[C@@H](CO)O[C@...,/mnt/katritch_lab2/aoxu/2024-06/v2/systems/1af...,
1,1b5d__1__1.A_1.B__1.D,/mnt/katritch_lab2/aoxu/data/plinder/cropped_b...,Nc1ccn([C@H]2C[C@H](O)[C@@H](COP(=O)(O)O)O2)c(...,/mnt/katritch_lab2/aoxu/2024-06/v2/systems/1b5...,
2,1bcj__1__1.B__1.I_1.K,/mnt/katritch_lab2/aoxu/data/plinder/cropped_b...,CC(=O)N[C@@H]1[C@@H](O)[C@@H](O)[C@@H](CO)O[C@...,/mnt/katritch_lab2/aoxu/2024-06/v2/systems/1bc...,
3,1ci0__1__1.A_1.B__1.D,/mnt/katritch_lab2/aoxu/data/plinder/cropped_b...,Cc1cc2nc3c(=O)[nH]c(=O)nc-3n(C[C@H](O)[C@H](O)...,/mnt/katritch_lab2/aoxu/2024-06/v2/systems/1ci...,
4,1d7c__1__1.A_1.B__1.E_1.K,/mnt/katritch_lab2/aoxu/data/plinder/cropped_b...,C=CC1=C(C)C2=Cc3c(C)c(CCC(=O)O)c4n3[Fe@SP2]35<...,/mnt/katritch_lab2/aoxu/2024-06/v2/systems/1d7...,
