In [9]:
import csv
import os

import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors, rdMolDescriptors
from tqdm import tqdm



Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


RDF (Radial Distribution Function): RDF descriptors describe how interatomic distances are distributed in the 3D structure of a molecule, i.e. how likely it is to find pairs of atoms at a given distance from each other, optionally weighted by atomic properties. They provide insights into molecular shape and the spatial arrangement of atoms.

In [None]:
# Compute RDKit RDF descriptors for the first molecule of every SDF file in the
# training set and stream them to a CSV, one column per descriptor component.

# Folder containing the input SDF files
folder_path = "/home/antonio98/Desktop/Projeto_em_Bioquímica/Training_set"

# CSV file that receives the calculated RDF descriptors
output_file = "/home/antonio98/Desktop/Projeto_em_Bioquímica/rdf_descriptors.csv"

# os.listdir returns entries in arbitrary order; sort so rows are reproducible.
sdf_files = sorted(name for name in os.listdir(folder_path) if name.endswith(".sdf"))

# Files whose first record could not be read; reported once the loop finishes.
skipped_files = []

# newline='' lets the csv module control line endings itself.
with open(output_file, 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    header_written = False

    for file_name in tqdm(sdf_files, desc="Calculating RDF"):
        file_path = os.path.join(folder_path, file_name)

        # Only the first record is used (one molecule per SDF file is assumed).
        # NOTE(review): SDMolSupplier strips explicit hydrogens by default
        # (removeHs=True) - confirm that is intended for 3D descriptors.
        suppl = Chem.SDMolSupplier(file_path)
        mol = suppl[0] if len(suppl) > 0 else None
        if mol is None:
            # RDKit yields None for records it cannot parse; skip the file
            # instead of crashing partway through the run.
            skipped_files.append(file_name)
            continue

        # CalcRDF returns a vector of values, not a single number.
        rdf_descriptors = rdMolDescriptors.CalcRDF(mol)

        # Identifier = text after the last "_" up to the first "." that follows.
        file_number = file_name.split("_")[-1].split(".")[0]

        # The vector length is only known once the first molecule has been
        # processed, so the header is emitted lazily.
        if not header_written:
            writer.writerow(["File"] + [f"RDF_{i}" for i in range(1, len(rdf_descriptors) + 1)])
            header_written = True

        # One CSV field per component; writing the list object itself would
        # embed its repr ("[a, b, ...]") and break the column layout.
        writer.writerow([file_number, *rdf_descriptors])

    if not header_written:
        # Nothing was processed: fall back to the original two-column header.
        writer.writerow(["File", "RDF"])

if skipped_files:
    print(f"Skipped {len(skipped_files)} unreadable SDF file(s), e.g. {skipped_files[:5]}")

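Morse (3D-MoRSE, 3D Molecule Representation of Structures based on Electron diffraction): 3D-MoRSE descriptors encode the 3D structure of a molecule with a scattering function adapted from electron-diffraction studies, summing weighted contributions over all atom pairs as a function of their interatomic distances. Despite the similar name, they are not based on the Morse potential.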

In [2]:
# Compute RDKit 3D-MoRSE descriptors for the first molecule of every SDF file in
# the training set and stream them to a CSV, one column per descriptor component.

# Folder containing the input SDF files
folder_path = "/home/antonio98/Desktop/Projeto_em_Bioquímica/Training_set"

# CSV file that receives the calculated Morse descriptors
output_file = "/home/antonio98/Desktop/Projeto_em_Bioquímica/morse_descriptors.csv"

# os.listdir returns entries in arbitrary order; sort so rows are reproducible.
sdf_files = sorted(name for name in os.listdir(folder_path) if name.endswith(".sdf"))

# Files whose first record could not be read; reported once the loop finishes.
skipped_files = []

# newline='' lets the csv module control line endings itself.
with open(output_file, 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    header_written = False

    for file_name in tqdm(sdf_files, desc="Calculating Morse"):
        file_path = os.path.join(folder_path, file_name)

        # Only the first record is used (one molecule per SDF file is assumed).
        # NOTE(review): SDMolSupplier strips explicit hydrogens by default
        # (removeHs=True) - confirm that is intended for 3D descriptors.
        suppl = Chem.SDMolSupplier(file_path)
        mol = suppl[0] if len(suppl) > 0 else None
        if mol is None:
            # RDKit yields None for records it cannot parse; skip the file
            # instead of crashing partway through the run.
            skipped_files.append(file_name)
            continue

        # CalcMORSE returns a vector of values, not a single number.
        morse_descriptors = rdMolDescriptors.CalcMORSE(mol)

        # Identifier = text after the last "_" up to the first "." that follows.
        file_number = file_name.split("_")[-1].split(".")[0]

        # The vector length is only known once the first molecule has been
        # processed, so the header is emitted lazily.
        if not header_written:
            writer.writerow(["File"] + [f"Morse_{i}" for i in range(1, len(morse_descriptors) + 1)])
            header_written = True

        # One CSV field per component; writing the list object itself would
        # embed its repr ("[a, b, ...]") and break the column layout.
        writer.writerow([file_number, *morse_descriptors])

    if not header_written:
        # Nothing was processed: fall back to the original two-column header.
        writer.writerow(["File", "Morse"])

if skipped_files:
    print(f"Skipped {len(skipped_files)} unreadable SDF file(s), e.g. {skipped_files[:5]}")

Calculating Morse: 100%|██████████| 136594/136594 [05:40<00:00, 401.04it/s]


WHIM (Weighted Holistic Invariant Molecular): WHIM descriptors are holistic descriptors that encode information about the 3D structure, shape, and symmetry of a molecule. They are derived from the eigenvalues of a weighted matrix representing the molecule's geometry.

In [None]:
# Compute RDKit WHIM descriptors for the first molecule of every SDF file in the
# training set and stream them to a CSV, one column per descriptor component.

# Folder containing the input SDF files
folder_path = "/home/antonio98/Desktop/Projeto_em_Bioquímica/Training_set"

# CSV file that receives the calculated WHIM descriptors
output_file = "/home/antonio98/Desktop/Projeto_em_Bioquímica/whim_descriptors.csv"

# os.listdir returns entries in arbitrary order; sort so rows are reproducible.
sdf_files = sorted(name for name in os.listdir(folder_path) if name.endswith(".sdf"))

# Files whose first record could not be read; reported once the loop finishes.
skipped_files = []

# newline='' lets the csv module control line endings itself.
with open(output_file, 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    header_written = False

    for file_name in tqdm(sdf_files, desc="Calculating WHIM"):
        file_path = os.path.join(folder_path, file_name)

        # Only the first record is used (one molecule per SDF file is assumed).
        # NOTE(review): SDMolSupplier strips explicit hydrogens by default
        # (removeHs=True) - confirm that is intended for 3D descriptors.
        suppl = Chem.SDMolSupplier(file_path)
        mol = suppl[0] if len(suppl) > 0 else None
        if mol is None:
            # RDKit yields None for records it cannot parse; skip the file
            # instead of crashing partway through the run.
            skipped_files.append(file_name)
            continue

        # CalcWHIM returns a vector of values, not a single number.
        whim_descriptors = rdMolDescriptors.CalcWHIM(mol)

        # Identifier = text after the last "_" up to the first "." that follows.
        file_number = file_name.split("_")[-1].split(".")[0]

        # The vector length is only known once the first molecule has been
        # processed, so the header is emitted lazily.
        if not header_written:
            writer.writerow(["File"] + [f"WHIM_{i}" for i in range(1, len(whim_descriptors) + 1)])
            header_written = True

        # One CSV field per component; writing the list object itself would
        # embed its repr ("[a, b, ...]") and break the column layout.
        writer.writerow([file_number, *whim_descriptors])

    if not header_written:
        # Nothing was processed: fall back to the original two-column header.
        writer.writerow(["File", "WHIM"])

if skipped_files:
    print(f"Skipped {len(skipped_files)} unreadable SDF file(s), e.g. {skipped_files[:5]}")

Autocorr3D (3D Autocorrelation): Autocorr3D descriptors quantify the spatial arrangement of atoms or molecular properties in three dimensions. They provide information about spatial patterns and local environments within a molecule.

In [3]:
# Compute RDKit 3D autocorrelation descriptors for the first molecule of every
# SDF file in the training set and stream them to a CSV, one column per component.

# Folder containing the input SDF files
folder_path = "/home/antonio98/Desktop/Projeto_em_Bioquímica/Training_set"

# CSV file that receives the calculated Autocorr3D descriptors
output_file = "/home/antonio98/Desktop/Projeto_em_Bioquímica/autocorr3d_descriptors.csv"

# os.listdir returns entries in arbitrary order; sort so rows are reproducible.
sdf_files = sorted(name for name in os.listdir(folder_path) if name.endswith(".sdf"))

# Files whose first record could not be read; reported once the loop finishes.
skipped_files = []

# newline='' lets the csv module control line endings itself.
with open(output_file, 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    header_written = False

    for file_name in tqdm(sdf_files, desc="Calculating Autocorr3D"):
        file_path = os.path.join(folder_path, file_name)

        # Only the first record is used (one molecule per SDF file is assumed).
        # NOTE(review): SDMolSupplier strips explicit hydrogens by default
        # (removeHs=True) - confirm that is intended for 3D descriptors.
        suppl = Chem.SDMolSupplier(file_path)
        mol = suppl[0] if len(suppl) > 0 else None
        if mol is None:
            # RDKit yields None for records it cannot parse; skip the file
            # instead of crashing partway through the run.
            skipped_files.append(file_name)
            continue

        # CalcAUTOCORR3D returns a vector of values, not a single number.
        autocorr3d_descriptors = rdMolDescriptors.CalcAUTOCORR3D(mol)

        # Identifier = text after the last "_" up to the first "." that follows.
        file_number = file_name.split("_")[-1].split(".")[0]

        # The vector length is only known once the first molecule has been
        # processed, so the header is emitted lazily.
        if not header_written:
            writer.writerow(
                ["File"] + [f"Autocorr3D_{i}" for i in range(1, len(autocorr3d_descriptors) + 1)]
            )
            header_written = True

        # One CSV field per component; writing the list object itself would
        # embed its repr ("[a, b, ...]") and break the column layout.
        writer.writerow([file_number, *autocorr3d_descriptors])

    if not header_written:
        # Nothing was processed: fall back to the original two-column header.
        writer.writerow(["File", "Autocorr3D"])

if skipped_files:
    print(f"Skipped {len(skipped_files)} unreadable SDF file(s), e.g. {skipped_files[:5]}")

Calculating Autocorr3D: 100%|██████████| 136594/136594 [04:41<00:00, 485.95it/s]


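GETAWAY (GEometry, Topology, and Atom-Weights AssemblY): GETAWAY descriptors capture geometric and topological information about a molecule's structure. They are derived from the molecular influence (leverage) matrix built from the atomic coordinates, combined with atomic weightings in autocorrelation-style sums, and provide insights into molecular shape and connectivity.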

In [5]:
# Compute RDKit GETAWAY descriptors for the first molecule of every SDF file in
# the training set and stream them to a CSV, one column per descriptor component.

# Folder containing the input SDF files
folder_path = "/home/antonio98/Desktop/Projeto_em_Bioquímica/Training_set"

# CSV file that receives the calculated GETAWAY descriptors
output_file = "/home/antonio98/Desktop/Projeto_em_Bioquímica/getaway_descriptors.csv"

# os.listdir returns entries in arbitrary order; sort so rows are reproducible.
sdf_files = sorted(name for name in os.listdir(folder_path) if name.endswith(".sdf"))

# Files whose first record could not be read; reported once the loop finishes.
skipped_files = []

# newline='' lets the csv module control line endings itself.
with open(output_file, 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    header_written = False

    for file_name in tqdm(sdf_files, desc="Calculating GETAWAY"):
        file_path = os.path.join(folder_path, file_name)

        # Only the first record is used (one molecule per SDF file is assumed).
        # NOTE(review): SDMolSupplier strips explicit hydrogens by default
        # (removeHs=True) - confirm that is intended for 3D descriptors.
        suppl = Chem.SDMolSupplier(file_path)
        mol = suppl[0] if len(suppl) > 0 else None
        if mol is None:
            # RDKit yields None for records it cannot parse; skip the file
            # instead of crashing partway through the run.
            skipped_files.append(file_name)
            continue

        # CalcGETAWAY returns a vector of values, not a single number.
        getaway_descriptors = rdMolDescriptors.CalcGETAWAY(mol)

        # Identifier = text after the last "_" up to the first "." that follows.
        file_number = file_name.split("_")[-1].split(".")[0]

        # The vector length is only known once the first molecule has been
        # processed, so the header is emitted lazily.
        if not header_written:
            writer.writerow(
                ["File"] + [f"GETAWAY_{i}" for i in range(1, len(getaway_descriptors) + 1)]
            )
            header_written = True

        # One CSV field per component; writing the list object itself would
        # embed its repr ("[a, b, ...]") and break the column layout.
        writer.writerow([file_number, *getaway_descriptors])

    if not header_written:
        # Nothing was processed: fall back to the original two-column header.
        writer.writerow(["File", "GETAWAY"])

if skipped_files:
    print(f"Skipped {len(skipped_files)} unreadable SDF file(s), e.g. {skipped_files[:5]}")


Calculating GETAWAY: 100%|██████████| 136594/136594 [11:56<00:00, 190.70it/s]


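PEOE (Partial Equalization of Orbital Electronegativity): PEOE is the Gasteiger scheme for assigning partial atomic charges by iteratively equalizing orbital electronegativity across bonded atoms. The PEOE_VSA descriptors computed here sum each atom's approximate van der Waals surface area into bins defined by its PEOE partial charge, so they capture how charge is distributed over the molecular surface.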

In [6]:
# Compute RDKit PEOE_VSA descriptors for the first molecule of every SDF file in
# the training set and stream them to a CSV, one column per surface-area bin.

# Folder containing the input SDF files
folder_path = "/home/antonio98/Desktop/Projeto_em_Bioquímica/Training_set"

# CSV file that receives the calculated PEOE descriptors
output_file = "/home/antonio98/Desktop/Projeto_em_Bioquímica/peoe_descriptors.csv"

# os.listdir returns entries in arbitrary order; sort so rows are reproducible.
sdf_files = sorted(name for name in os.listdir(folder_path) if name.endswith(".sdf"))

# Files whose first record could not be read; reported once the loop finishes.
skipped_files = []

# newline='' lets the csv module control line endings itself.
with open(output_file, 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    header_written = False

    for file_name in tqdm(sdf_files, desc="Calculating PEOE"):
        file_path = os.path.join(folder_path, file_name)

        # Only the first record is used (one molecule per SDF file is assumed).
        suppl = Chem.SDMolSupplier(file_path)
        mol = suppl[0] if len(suppl) > 0 else None
        if mol is None:
            # RDKit yields None for records it cannot parse; skip the file
            # instead of crashing partway through the run.
            skipped_files.append(file_name)
            continue

        # PEOE_VSA_ returns one value per bin, not a single number.
        peoe_descriptors = rdMolDescriptors.PEOE_VSA_(mol)

        # Identifier = text after the last "_" up to the first "." that follows.
        file_number = file_name.split("_")[-1].split(".")[0]

        # The number of bins is read from the first result, so the header is
        # emitted lazily.
        if not header_written:
            writer.writerow(["File"] + [f"PEOE_VSA{i}" for i in range(1, len(peoe_descriptors) + 1)])
            header_written = True

        # One CSV field per bin; writing the list object itself would embed its
        # repr ("[a, b, ...]") and break the column layout.
        writer.writerow([file_number, *peoe_descriptors])

    if not header_written:
        # Nothing was processed: fall back to the original two-column header.
        writer.writerow(["File", "PEOE"])

if skipped_files:
    print(f"Skipped {len(skipped_files)} unreadable SDF file(s), e.g. {skipped_files[:5]}")

Calculating PEOE: 100%|██████████| 136594/136594 [08:37<00:00, 263.97it/s]


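SMR (Molar Refractivity, as used in SMR_VSA): The SMR_VSA descriptors computed here sum each atom's approximate van der Waals surface area into bins defined by its contribution to the molar refractivity of the molecule. They provide information about molecular size, polarizability, and surface properties.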

In [7]:
# Compute RDKit SMR_VSA descriptors for the first molecule of every SDF file in
# the training set and stream them to a CSV, one column per surface-area bin.

# Folder containing the input SDF files
folder_path = "/home/antonio98/Desktop/Projeto_em_Bioquímica/Training_set"

# CSV file that receives the calculated SMR descriptors
output_file = "/home/antonio98/Desktop/Projeto_em_Bioquímica/smr_descriptors.csv"

# os.listdir returns entries in arbitrary order; sort so rows are reproducible.
sdf_files = sorted(name for name in os.listdir(folder_path) if name.endswith(".sdf"))

# Files whose first record could not be read; reported once the loop finishes.
skipped_files = []

# newline='' lets the csv module control line endings itself.
with open(output_file, 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    header_written = False

    for file_name in tqdm(sdf_files, desc="Calculating SMR"):
        file_path = os.path.join(folder_path, file_name)

        # Only the first record is used (one molecule per SDF file is assumed).
        suppl = Chem.SDMolSupplier(file_path)
        mol = suppl[0] if len(suppl) > 0 else None
        if mol is None:
            # RDKit yields None for records it cannot parse; skip the file
            # instead of crashing partway through the run.
            skipped_files.append(file_name)
            continue

        # SMR_VSA_ returns one value per bin, not a single number.
        smr_descriptors = rdMolDescriptors.SMR_VSA_(mol)

        # Identifier = text after the last "_" up to the first "." that follows.
        file_number = file_name.split("_")[-1].split(".")[0]

        # The number of bins is read from the first result, so the header is
        # emitted lazily.
        if not header_written:
            writer.writerow(["File"] + [f"SMR_VSA{i}" for i in range(1, len(smr_descriptors) + 1)])
            header_written = True

        # One CSV field per bin; writing the list object itself would embed its
        # repr ("[a, b, ...]") and break the column layout.
        writer.writerow([file_number, *smr_descriptors])

    if not header_written:
        # Nothing was processed: fall back to the original two-column header.
        writer.writerow(["File", "SMR"])

if skipped_files:
    print(f"Skipped {len(skipped_files)} unreadable SDF file(s), e.g. {skipped_files[:5]}")

Calculating SMR: 100%|██████████| 136594/136594 [14:09<00:00, 160.85it/s]


MW (Molecular Weight): MW descriptors simply represent the molecular weight of a molecule, which is the sum of the atomic weights of all atoms in the molecule.

In [10]:
# Compute the molecular weight of the first molecule of every SDF file in the
# training set and write one "File,MW" row per molecule.

# Folder containing the input SDF files
folder_path = "/home/antonio98/Desktop/Projeto_em_Bioquímica/Training_set"

# CSV file that receives the calculated MW descriptors
output_file = "/home/antonio98/Desktop/Projeto_em_Bioquímica/MW_descriptors.csv"

# os.listdir returns entries in arbitrary order; sort so rows are reproducible.
sdf_files = sorted(name for name in os.listdir(folder_path) if name.endswith(".sdf"))

# Files whose first record could not be read; reported once the loop finishes.
skipped_files = []

with open(output_file, 'w') as f:
    # Header row with the column names
    f.write("File,MW\n")

    for file_name in tqdm(sdf_files, desc="Calculating MW"):
        file_path = os.path.join(folder_path, file_name)

        # Only the first record is used (one molecule per SDF file is assumed).
        suppl = Chem.SDMolSupplier(file_path)
        mol = suppl[0] if len(suppl) > 0 else None
        if mol is None:
            # RDKit yields None for records it cannot parse; skip the file
            # instead of crashing partway through the run.
            skipped_files.append(file_name)
            continue

        # MolWt returns a single float, so a plain two-field row is sufficient.
        mw_descriptor = Descriptors.MolWt(mol)

        # Identifier = text after the last "_" up to the first "." that follows.
        file_number = file_name.split("_")[-1].split(".")[0]

        f.write(f"{file_number},{mw_descriptor}\n")

if skipped_files:
    print(f"Skipped {len(skipped_files)} unreadable SDF file(s), e.g. {skipped_files[:5]}")

Calculating MW: 100%|██████████| 136594/136594 [06:27<00:00, 352.68it/s]
