Description

This script performs the following tasks:

    Reads a CSV file containing a column named "smiles".
    Defines a function to calculate various molecular properties using RDKit.
    Iterates over the DataFrame with a progress bar to calculate properties for each SMILES string.
    Adds new columns to the DataFrame to store the calculated properties.
    Saves the updated DataFrame to a new CSV file.


Instructions for Use

    Ensure you have the required libraries installed:

```
pip install pandas rdkit tqdm
```

Update the input_file_path variable with the path to your input CSV file.

Run the script. The script will read the input CSV file, process the SMILES strings to calculate molecular properties, and save the results to a new CSV file specified by output_file_path.


In [None]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors, Crippen, QED
from tqdm import tqdm

# SMARTS patterns for the functional-group counts, keyed by output column name.
# Kept as plain strings here; each one is compiled once, on first use.
_FUNCTIONAL_GROUP_SMARTS = {
    # Any hydroxyl oxygen, so carboxylic-acid and phenol -OH are counted too.
    'Alcohol': '[OH]',
    # Carbonyl carbon bearing exactly one H and bonded to a carbon. The bare
    # carbonyl pattern '[CX3]=[OX1]' would also count ketones, acids, esters
    # and amides.
    'Aldehyde': '[CX3H1](=O)[#6]',
    'Carboxylic Acid': '[CX3](=O)[OX2H1]',
    # Primary/secondary amine nitrogen that is not part of an amide.
    'Amine': '[NX3;H2,H1;!$(NC=O)]',
    'Thiol': '[SX2H1]',
}
_COMPILED_SMARTS = {}


def _count_matches(mol, smarts):
    """Return the number of substructure matches of *smarts* in *mol*."""
    pattern = _COMPILED_SMARTS.get(smarts)
    if pattern is None:
        # Compile lazily and cache: the same few patterns are reused for
        # every molecule in the input file.
        pattern = _COMPILED_SMARTS[smarts] = Chem.MolFromSmarts(smarts)
    return len(mol.GetSubstructMatches(pattern))


# Function to calculate molecular properties from SMILES
def calculate_properties(smiles):
    """Calculate RDKit descriptors and functional-group counts for a SMILES.

    Args:
        smiles: SMILES string describing the molecule.

    Returns:
        A dict mapping output column names to values, or None when *smiles*
        is not a non-empty string or RDKit cannot parse it.
    """
    # Empty CSV cells arrive as NaN (a float); reject anything that is not a
    # non-blank string before handing it to RDKit.
    if not isinstance(smiles, str) or not smiles.strip():
        return None

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None

    properties = {
        'SlogP': Descriptors.MolLogP(mol),
        'TPSA': Descriptors.TPSA(mol),
        'Exact_MW': Descriptors.ExactMolWt(mol),
        'Num_Rotatable_Bonds': Descriptors.NumRotatableBonds(mol),
        'Num_HBD': Descriptors.NumHDonors(mol),
        'Num_HBA': Descriptors.NumHAcceptors(mol),
        'Num_Hetero_Atoms': Descriptors.NumHeteroatoms(mol),
        'Num_Heavy_Atoms': Descriptors.HeavyAtomCount(mol),
        'Num_Atoms': mol.GetNumAtoms(),
        'Num_StereoCenters': len(
            Chem.FindMolChiralCenters(mol, includeUnassigned=True)
        ),
        'Num_Aromatic_Rings': Descriptors.NumAromaticRings(mol),
        'Num_Saturated_Rings': Descriptors.NumSaturatedRings(mol),
        'Num_Aliphatic_Rings': Descriptors.NumAliphaticRings(mol),
        'InChIKey': Chem.inchi.MolToInchiKey(mol),
        'MolecularFormula': Chem.rdMolDescriptors.CalcMolFormula(mol),
        # GetNumAtoms() takes no element argument (its first parameter is the
        # onlyHeavy flag), so carbons are counted by atomic number instead.
        'carbon_count': sum(
            1 for atom in mol.GetAtoms() if atom.GetAtomicNum() == 6
        ),
        'CX_LogP': Crippen.MolLogP(mol),
        # NOTE(review): Crippen.MolMR is the Crippen molar refractivity, not a
        # LogD value. The column name is kept so the output schema does not
        # change; consider renaming it.
        'CX_LogD': Crippen.MolMR(mol),
        # Same value as 'Num_Heavy_Atoms'; kept for output compatibility.
        'Heavy_Atoms': Descriptors.HeavyAtomCount(mol),
        'qed_score': QED.qed(mol),
    }
    # Functional-group counts go last, in the declared order, so the output
    # columns keep their original order.
    for column, smarts in _FUNCTIONAL_GROUP_SMARTS.items():
        properties[column] = _count_matches(mol, smarts)
    return properties

# Read CSV file with a column named "smiles"
input_file_path = 'input.csv'  # Update with your input file path
df = pd.read_csv(input_file_path)

# Fail early, with a message naming the file, instead of a bare KeyError
# raised from inside the processing loop.
if 'smiles' not in df.columns:
    raise KeyError(f"Input file '{input_file_path}' has no 'smiles' column.")

# Compute the properties for every row with a progress bar. Results are
# collected first and attached to the DataFrame in one step afterwards, which
# avoids growing the frame one cell at a time.
records = []
skipped = 0
for smiles in tqdm(df['smiles'], total=len(df)):
    properties = None
    if isinstance(smiles, str):  # Empty cells are read as NaN, not str
        properties = calculate_properties(smiles)
    if properties is None:
        skipped += 1
        properties = {}  # Leaves every property column empty for this row
    records.append(properties)

# One record per input row, aligned on the original index.
properties_df = pd.DataFrame(records, index=df.index)
for column in properties_df.columns:
    if column in df.columns:
        # The input already has this column: overwrite only the rows for
        # which a value was computed and keep the existing value elsewhere.
        df[column] = properties_df[column].combine_first(df[column])
    else:
        df[column] = properties_df[column]

if skipped:
    print(f"Skipped {skipped} row(s) with a missing or unparsable SMILES string.")

# Save the updated DataFrame to a new CSV file
output_file_path = "output.csv"  # Update with your desired output file path
df.to_csv(output_file_path, index=False)

print(f"Updated data with molecular properties saved to '{output_file_path}'.")
