In [15]:
from rdkit import Chem

# Read SDF file and convert to SMILES
suppl = Chem.SDMolSupplier('4dli_ligand.sdf')
smiles_list = []

for mol in suppl:
    if mol is not None:  # Check if molecule was successfully parsed
        smiles = Chem.MolToSmiles(mol)
        smiles_list.append(smiles)

# Print all SMILES
for smiles in smiles_list:
    print(smiles)

OSError: File error: Bad input file 4dli_ligand.sdf

In [2]:
from rdkit import Chem
import pandas as pd
import os
import glob


def sdf_to_smiles(file_path):
    """
    Convert an SDF file to a list of SMILES strings.
    
    Args:
        file_path (str): Path to the SDF file
        
    Returns:
        list: List of SMILES strings
    """
    suppl = Chem.SDMolSupplier(file_path)
    return [Chem.MolToSmiles(mol) for mol in suppl if mol is not None]


def process_pdb_folders(csv_file_path, base_directory, output_csv):
    """
    Process PDB folders based on PDB codes from CSV file and extract SMILES strings.
    
    Args:
        csv_file_path (str): Path to CSV file containing PDB codes
        base_directory (str): Path to directory containing PDB folders
        output_csv (str): Path for output CSV file
        
    Returns:
        pandas.DataFrame: DataFrame containing results
    """
    # Read the CSV file to get PDB codes
    df_pdb = pd.read_csv(csv_file_path)
    pdb_codes = df_pdb['PDB_Code'].tolist()
    
    data = []
    processed_count = 0
    missing_folders = []
    
    print(f"Processing {len(pdb_codes)} PDB codes...")
    
    for pdb_code in pdb_codes:
        pdb_folder_path = os.path.join(base_directory, pdb_code)
        
        # Check if folder exists
        if not os.path.exists(pdb_folder_path):
            missing_folders.append(pdb_code)
            continue
            
        # Find all SDF files in the PDB folder
        sdf_files = glob.glob(os.path.join(pdb_folder_path, "*.sdf"))
        
        if not sdf_files:
            print(f"Warning: No SDF files found in {pdb_folder_path}")
            continue
            
        for sdf_file in sdf_files:
            try:
                smiles_list = sdf_to_smiles(sdf_file)
                
                # Add each SMILES as a separate row
                for i, smiles in enumerate(smiles_list):
                    data.append({
                        'PDB_Code': pdb_code,
                        'SDF_File': os.path.basename(sdf_file),
                        'Molecule_Index': i,
                        'SMILES': smiles
                    })
                    
                processed_count += 1
                
            except Exception as e:
                print(f"Error processing {sdf_file}: {e}")
    
    # Create DataFrame and save to CSV
    df_results = pd.DataFrame(data)
    
    if not df_results.empty:
        df_results.to_csv(output_csv, index=False)
        print(f"\nResults saved to {output_csv}")
        print(f"Total SMILES extracted: {len(df_results)}")
        print(f"Processed folders: {processed_count}")
    else:
        print("No SMILES strings were extracted!")
    
    if missing_folders:
        print(f"\nMissing folders for PDB codes: {missing_folders[:10]}...")  # Show first 10
        if len(missing_folders) > 10:
            print(f"... and {len(missing_folders) - 10} more")
    
    return df_results

df1 = process_pdb_folders(
        csv_file_path='pdbind_data_demo.csv',
        base_directory='/Users/bodenretherford/Desktop/demo',  # Update this path
        output_csv='pdb_smiles_detailed.csv'
    )
print(df1.head())


Processing 285 PDB codes...


[14:19:55] Explicit valence for atom # 3 N, 4, is greater than permitted
[14:19:55] ERROR: Could not sanitize molecule ending on line 53
[14:19:55] ERROR: Explicit valence for atom # 3 N, 4, is greater than permitted
[14:19:55] Explicit valence for atom # 6 N, 4, is greater than permitted
[14:19:55] ERROR: Could not sanitize molecule ending on line 143
[14:19:55] ERROR: Explicit valence for atom # 6 N, 4, is greater than permitted
[14:19:55] Explicit valence for atom # 11 N, 4, is greater than permitted
[14:19:55] ERROR: Could not sanitize molecule ending on line 104
[14:19:55] ERROR: Explicit valence for atom # 11 N, 4, is greater than permitted
[14:19:55] Explicit valence for atom # 7 N, 4, is greater than permitted
[14:19:55] ERROR: Could not sanitize molecule ending on line 111
[14:19:55] ERROR: Explicit valence for atom # 7 N, 4, is greater than permitted
[14:19:55] Explicit valence for atom # 1 O, 3, is greater than permitted
[14:19:55] ERROR: Could not sanitize molecule ending o


Results saved to pdb_smiles_detailed.csv
Total SMILES extracted: 265
Processed folders: 285
  PDB_Code         SDF_File  Molecule_Index  \
0     4tmn  4tmn_ligand.sdf               0   
1     5tmn  5tmn_ligand.sdf               0   
2     1ydr  1ydr_ligand.sdf               0   
3     1ydt  1ydt_ligand.sdf               0   
4     1bcu  1bcu_ligand.sdf               0   

                                              SMILES  
0  CC(C)C[C@H](N[P@](=O)([O-])[C@H](Cc1ccccc1)NC(...  
1  CC(C)C[C@H](NC(=O)[C@H](CC(C)C)N[P@@](=O)([O-]...  
2          C[C@H]1C[NH2+]CCN1S(=O)(=O)c1cccc2cnccc12  
3  O=S(=O)(NCC[NH2+]C/C=C/c1ccc(Br)cc1)c1cccc2cnc...  
4                       Nc1ccc2cc3ccc(N)cc3[nH+]c2c1  


[14:19:55] Explicit valence for atom # 11 N, 4, is greater than permitted
[14:19:55] ERROR: Could not sanitize molecule ending on line 107
[14:19:55] ERROR: Explicit valence for atom # 11 N, 4, is greater than permitted
[14:19:55] Explicit valence for atom # 5 N, 4, is greater than permitted
[14:19:55] ERROR: Could not sanitize molecule ending on line 104
[14:19:55] ERROR: Explicit valence for atom # 5 N, 4, is greater than permitted
[14:19:55] Explicit valence for atom # 12 N, 4, is greater than permitted
[14:19:55] ERROR: Could not sanitize molecule ending on line 110
[14:19:55] ERROR: Explicit valence for atom # 12 N, 4, is greater than permitted
[14:19:55] Explicit valence for atom # 13 N, 4, is greater than permitted
[14:19:55] ERROR: Could not sanitize molecule ending on line 107
[14:19:55] ERROR: Explicit valence for atom # 13 N, 4, is greater than permitted
[14:19:55] Explicit valence for atom # 10 N, 4, is greater than permitted
[14:19:55] ERROR: Could not sanitize molecule en

In [3]:
from rdkit import Chem
import pandas as pd
import os
import glob

def mol2_to_smiles(file_path):
    """
    Convert a MOL2 file to a list of SMILES strings.
    
    Args:
        file_path (str): Path to the MOL2 file
        
    Returns:
        list: List of SMILES strings
    """
    # MOL2 files can contain multiple molecules, but RDKit's Mol2MolSupplier
    # handles single molecules. For multiple molecules in one file, we need
    # to handle them differently.
    try:
        # Try to read as a single molecule first
        mol = Chem.MolFromMol2File(file_path)
        if mol is not None:
            return [Chem.MolToSmiles(mol)]
        else:
            # If single molecule read fails, try using Mol2MolSupplier for multi-molecule files
            suppl = Chem.Mol2MolSupplier(file_path)
            return [Chem.MolToSmiles(mol) for mol in suppl if mol is not None]
    except Exception as e:
        print(f"Error reading MOL2 file {file_path}: {e}")
        return []

def process_pdb_folders(csv_file_path, base_directory, output_csv):
    """
    Process PDB folders based on PDB codes from CSV file and extract SMILES strings from MOL2 files.
    
    Args:
        csv_file_path (str): Path to CSV file containing PDB codes
        base_directory (str): Path to directory containing PDB folders
        output_csv (str): Path for output CSV file
        
    Returns:
        pandas.DataFrame: DataFrame containing results
    """
    # Read the CSV file to get PDB codes
    df_pdb = pd.read_csv(csv_file_path)
    pdb_codes = df_pdb['PDB_Code'].tolist()
    
    data = []
    processed_count = 0
    missing_folders = []
    
    print(f"Processing {len(pdb_codes)} PDB codes...")
    
    for pdb_code in pdb_codes:
        pdb_folder_path = os.path.join(base_directory, pdb_code)
        
        # Check if folder exists
        if not os.path.exists(pdb_folder_path):
            missing_folders.append(pdb_code)
            continue
            
        # Find all MOL2 files in the PDB folder
        mol2_files = glob.glob(os.path.join(pdb_folder_path, "*.mol2"))
        
        if not mol2_files:
            print(f"Warning: No MOL2 files found in {pdb_folder_path}")
            continue
            
        for mol2_file in mol2_files:
            try:
                smiles_list = mol2_to_smiles(mol2_file)
                
                # Add each SMILES as a separate row
                for i, smiles in enumerate(smiles_list):
                    data.append({
                        'PDB_Code': pdb_code,
                        'MOL2_File': os.path.basename(mol2_file),
                        'Molecule_Index': i,
                        'SMILES': smiles
                    })
                    
                processed_count += 1
                
            except Exception as e:
                print(f"Error processing {mol2_file}: {e}")
    
    # Create DataFrame and save to CSV
    df_results = pd.DataFrame(data)
    
    if not df_results.empty:
        df_results.to_csv(output_csv, index=False)
        print(f"\nResults saved to {output_csv}")
        print(f"Total SMILES extracted: {len(df_results)}")
        print(f"Processed folders: {processed_count}")
    else:
        print("No SMILES strings were extracted!")
    
    if missing_folders:
        print(f"\nMissing folders for PDB codes: {missing_folders[:10]}...")  # Show first 10
        if len(missing_folders) > 10:
            print(f"... and {len(missing_folders) - 10} more")
    
    return df_results

# Usage example
df1 = process_pdb_folders(
    csv_file_path='pdbind_data_demo.csv',
    base_directory='/Users/bodenretherford/Desktop/demo',  # Update this path
    output_csv='pdb_smiles_mol2.csv'
)
print(df1.head())

Processing 285 PDB codes...
Error reading MOL2 file /Users/bodenretherford/Desktop/demo/2zcq/2zcq_ligand.mol2: module 'rdkit.Chem' has no attribute 'Mol2MolSupplier'
Error reading MOL2 file /Users/bodenretherford/Desktop/demo/2zcr/2zcr_ligand.mol2: module 'rdkit.Chem' has no attribute 'Mol2MolSupplier'
Error reading MOL2 file /Users/bodenretherford/Desktop/demo/4dli/4dli_ligand.mol2: module 'rdkit.Chem' has no attribute 'Mol2MolSupplier'

Results saved to pdb_smiles_mol2.csv
Total SMILES extracted: 282
Processed folders: 285
  PDB_Code         MOL2_File  Molecule_Index  \
0     4tmn  4tmn_ligand.mol2               0   
1     5tmn  5tmn_ligand.mol2               0   
2     1ydr  1ydr_ligand.mol2               0   
3     1ydt  1ydt_ligand.mol2               0   
4     1bcu  1bcu_ligand.mol2               0   

                                              SMILES  
0  CC(C)C[C@H](N[P@@](=O)([O-])[C@H](Cc1ccccc1)N[...  
1  CC(C)C[C@H](NC(=O)[C@H](CC(C)C)N[P@](=O)([O-])...  
2          C[C@

[08:52:19] Can't kekulize mol.  Unkekulized atoms: 1 3 7 12 14 15 16 18 20


In [11]:
mol = pd.read_csv('pdb_smiles_mol2.csv')
bind = pd.read_csv('pdbind_data_demo.csv')


print(mol.head())
print(bind.head())

merged_df = pd.merge(bind, mol, on='PDB_Code', how='left')


print(merged_df.head())
merged_df.to_csv('full.csv', index=False)

  PDB_Code         MOL2_File  Molecule_Index  \
0     4tmn  4tmn_ligand.mol2               0   
1     5tmn  5tmn_ligand.mol2               0   
2     1ydr  1ydr_ligand.mol2               0   
3     1ydt  1ydt_ligand.mol2               0   
4     1bcu  1bcu_ligand.mol2               0   

                                              SMILES  
0  CC(C)C[C@H](N[P@@](=O)([O-])[C@H](Cc1ccccc1)N[...  
1  CC(C)C[C@H](NC(=O)[C@H](CC(C)C)N[P@](=O)([O-])...  
2          C[C@H]1C[NH2+]CCN1S(=O)(=O)c1cccc2cnccc12  
3  O=S(=O)(NCC[NH2+]C/C=C/c1ccc(Br)cc1)c1cccc2cnc...  
4                       Nc1ccc2cc3ccc(N)cc3[nH+]c2c1  
  PDB_Code Binding_Type  Binding_Value Unit Ligand_Name  Binding_Value_nM  \
0     4tmn           Ki          0.068   nM         0PK             0.068   
1     5tmn           Ki          9.100   nM         0PJ             9.100   
2     1ydr           Ki          3.000   uM         IQP          3000.000   
3     1ydt           Ki         48.000   nM         IQB            48.000