<p style="font-family: Monospace; font-size: 20px; color: black;">
    This script processes a CSV file containing SMILES strings, normalizes them into their canonical form using a cheminformatics library (e.g., RDKit), removes invalid or erroneous records, and saves the cleaned and standardized results to a new CSV file for further use in molecular analysis or machine learning tasks.
</p>


<p style="font-family: Monospace; font-size: 20px; color: black;">
    1.1 SMILES Standardization
</p>

In [10]:
import pandas as pd
from rdkit import Chem

# Configuration: source file, destination file, and the column that holds the SMILES
input_file = "input_smiles.csv"  # Path to the input CSV file
output_file = "canonical_smiles_output.csv"  # Path to the output CSV file
smiles_column = "SMILES"  # Name of the column containing SMILES strings

df = pd.read_csv(input_file)

# Stop early when the expected column is absent
if smiles_column not in df.columns:
    raise ValueError(f"Column '{smiles_column}' not found in the CSV file.")


def _canonicalize(smile):
    """Return the canonical SMILES for *smile*, or None when conversion fails.

    A message is printed both when RDKit returns no molecule for the entry
    and when the conversion raises.
    """
    try:
        mol = Chem.MolFromSmiles(smile)
        if mol is None:
            print(f"Invalid SMILES: {smile}")
            return None
        return Chem.MolToSmiles(mol, canonical=True)
    except Exception as e:
        print(f"Error processing SMILES: {smile}, Error: {e}")
        return None


# One result per input row; None marks entries that could not be converted
canonical_smiles_list = [_canonicalize(smile) for smile in df[smiles_column]]
df["Canonical_SMILES"] = canonical_smiles_list

# Write the table, including the new column, to the output file
df.to_csv(output_file, index=False)

print(f"Canonical SMILES have been saved to: {output_file}")


规范化后的 SMILES:
O=C(/C=C1/CCc2ccccc2N1)c1ccccc1
O=C(/C=C1/CCc2ccccc2N1)c1ccccc1
O=C(/C=C1/CCc2ccccc2N1)c1ccccc1


<p style="font-family: Monospace; font-size: 20px; color: black;">
    1.2 Data Cleaning for Identical Reactant-Product Records
</p>

In [11]:
import pandas as pd

# Read the dataset
df = pd.read_csv('aho_dataset.csv')

# The two columns that are compared row by row
reactants = df['Reactant SMILES']
products = df['Product SMILES']

# Rows whose reactant and product strings are equal go to their own file
same_records = df[reactants.eq(products)]
same_records.to_csv('same_reactant_product_records.csv', index=False)

# All remaining rows form the filtered dataset
df_filtered = df[reactants.ne(products)]
df_filtered.to_csv('filtered_aho_dataset.csv', index=False)

print("Records where Reactant SMILES and Product SMILES are identical have been saved to 'same_reactant_product_records.csv'.")
print("Records where Reactant SMILES and Product SMILES are identical have been removed and saved to 'filtered_aho_dataset.csv'.")


Records where Reactant SMILES and Product SMILES are identical have been saved to 'same_reactant_product_records.csv'.


<p style="font-family: Monospace; font-size: 20px; color: black;">
    1.3 Data Cleaning for Records with Lost Product Stereochemical Information
</p>

In [None]:
# Identify chirality
import pandas as pd  # imported here so this cell does not rely on an earlier cell having run
from rdkit import Chem

# Table whose SMILES columns will be annotated with chiral-center information
file_path = 'train_output_file.csv'  # Replace with your CSV file path
df = pd.read_csv(file_path)

# Function to identify chiral centers in a molecule
def identify_chiral_centers(smiles):
    """Return the chiral centers of a SMILES string as one comma-joined string.

    Each entry is ``str()`` of an item returned by
    ``Chem.FindMolChiralCenters(mol, includeUnassigned=True)``, so centers
    without an assigned configuration are included as well. A molecule with
    no chiral centers yields an empty string.

    Args:
        smiles: SMILES string of the molecule.

    Returns:
        The chiral centers joined with ','.

    Raises:
        ValueError: If RDKit cannot parse ``smiles``.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles reports a parse failure by returning None; raise a
        # clear error instead of handing None to the next RDKit call.
        raise ValueError(f"Invalid SMILES: {smiles!r}")
    chiral_centers = Chem.FindMolChiralCenters(mol, includeUnassigned=True)
    return ','.join(map(str, chiral_centers))

# Chiral-center annotation for the 'Target_pred' column of one row
def process_and_apply(row):
    """Return the chiral-center string for ``row['Target_pred']``.

    Any exception raised while reading the column or analysing the value
    is swallowed and reported as the string 'False'.
    """
    try:
        return identify_chiral_centers(row['Target_pred'])
    except Exception:
        # Best effort: a row that cannot be processed is marked, not fatal
        return 'False'

# Chiral-center annotation for the 'Product SMILES' column of one row
def TARGET_and_apply(row):
    """Return the chiral-center string for ``row['Product SMILES']``.

    Any exception raised while reading the column or analysing the value
    is swallowed and reported as the string 'False'.
    """
    try:
        return identify_chiral_centers(row['Product SMILES'])
    except Exception:
        # Best effort: a row that cannot be processed is marked, not fatal
        return 'False'

# Annotate every row: 'Pred' comes from the 'Target_pred' column,
# 'TARGET' from the 'Product SMILES' column
df['Pred'] = df.apply(process_and_apply, axis=1)
df['TARGET'] = df.apply(TARGET_and_apply, axis=1)

# Write the annotated table to disk (this is the same path the cell loaded from,
# so the input file is overwritten)
output_file_path = 'train_output_file.csv'  # Replace with your desired output file path
df.to_csv(output_file_path, index=False)

print(f"Processing completed. Results have been saved to {output_file_path}")

In [None]:
# Remove missing values
import pandas as pd

# Open the CSV file
file_path = 'train_output_file.csv'  # Replace with your CSV file path
df = pd.read_csv(file_path)

# Drop rows where the 'TARGET' column is empty
df = df.dropna(subset=['TARGET'])

# Drop rows whose 'TARGET' value contains a literal "?".
# regex=False matches the character directly; the earlier '\?' pattern was a
# non-raw string with an invalid escape sequence, which newer Python versions
# warn about. astype(str) keeps the .str accessor usable when the column was
# not read as strings (e.g. when every value was empty).
# NOTE(review): rows whose 'TARGET' is the string 'False' are kept by both
# filters — confirm that this is intended.
has_question_mark = df['TARGET'].astype(str).str.contains('?', regex=False)
df = df[~has_question_mark]

# Save the results to a new CSV file
output_file_path = 'train_last.csv'  # Replace with your desired output file path
df.to_csv(output_file_path, index=False)

print(f"Processing completed. Results have been saved to {output_file_path}")

<p style="font-family: Monospace; font-size: 20px; color: black;">
    1.4 Keep Only Reactants with at Most One C=C Double Bond
</p>

In [None]:
# Keep only one C=C double bond
import pandas as pd
from rdkit import Chem

# Open the CSV file that this step filters.
# NOTE(review): this reads 'train_output_file.csv', not the 'train_last.csv'
# written by the "Remove missing values" cell — confirm which file is intended.
file_path = 'train_output_file.csv'  # Replace with your CSV file path
df = pd.read_csv(file_path)

# Define a function to count the number of C=C double bonds
def count_c_c_double_bonds(smiles):
    """Count the carbon-carbon double bonds in a SMILES string.

    Only bonds whose RDKit bond type is DOUBLE and whose two atoms both have
    atomic number 6 are counted.
    NOTE(review): bonds that RDKit types as aromatic are therefore presumably
    not counted — confirm this matches the intended filter.

    Args:
        smiles: SMILES string; values that are not strings (for example the
            NaN pandas produces for an empty CSV cell) are accepted.

    Returns:
        The number of C=C double bonds, or 0 when ``smiles`` is not a string
        or cannot be parsed.
    """
    if not isinstance(smiles, str):
        # A missing value would make MolFromSmiles raise and abort the whole
        # column-wise apply; treat it like an unparsable SMILES instead.
        return 0
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return 0  # Return 0 if the SMILES is invalid
    return sum(
        1
        for bond in mol.GetBonds()
        if bond.GetBondType() == Chem.rdchem.BondType.DOUBLE
        and bond.GetBeginAtom().GetAtomicNum() == 6
        and bond.GetEndAtom().GetAtomicNum() == 6
    )

# Annotate each record with the C=C double-bond count of its reactant
df['C_C_Double_Bonds'] = df['Reactant SMILES'].apply(count_c_c_double_bonds)

# Keep records with at most one C=C double bond, then discard the helper column
at_most_one = df['C_C_Double_Bonds'] <= 1
df_filtered = df[at_most_one].drop(columns=['C_C_Double_Bonds'])

# Write the filtered table to disk
output_file_path = 'train_output_file.csv'  # Replace with your desired output file path
df_filtered.to_csv(output_file_path, index=False)

print(f"Processing completed. Results have been saved to {output_file_path}")