In [1]:
import pandas as pd


# Load the curated, merged odor dataset.
df = pd.read_csv('data/curated dataset/curated_GS_LF_merged_4983.csv')

# Binary label: 0 only when the descriptor string is exactly 'odorless',
# 1 for everything else (multi-descriptor strings and missing values included).
df['frag_likeliness'] = [0 if label == 'odorless' else 1 for label in df['descriptors']]

# Keep just the structure, the raw descriptors and the label, exposing the
# structure column under the shorter name 'SMILES'.
df_filtered = df[['nonStereoSMILES', 'descriptors', 'frag_likeliness']].rename(
    columns={'nonStereoSMILES': 'SMILES'}
)

# Persist the filtered table; it is the input of the property-calculation step.
df_filtered.to_csv(
    'data/curated dataset/filtered_curated_GS_LF_merged_4983.csv',
    index=False,
)

# Quick sanity check of the first rows.
print(df_filtered.head())

             SMILES                                        descriptors  \
0           CC(O)CN                                              fishy   
1     CCC(=O)C(=O)O             fatty;lactonic;sweet;caramellic;creamy   
2  O=C(O)CCc1ccccc1     rose;floral;fatty;sweet;musk;cinnamon;balsamic   
3     OCc1ccc(O)cc1  medicinal;phenolic;fruity;nutty;bitter;sweet;a...   
4    O=Cc1ccc(O)cc1  phenolic;woody;nutty;vanilla;hay;metallic;swee...   

   frag_likeliness  
0                1  
1                1  
2                1  
3                1  
4                1  


In [2]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.Chem import AllChem
from rdkit.Chem import Crippen
from rdkit.Chem.EState import EState_VSA
from rdkit.Chem import rdMolDescriptors
import pandas as pd
from rdkit import Chem
from rdkit.Chem import QED, Crippen, Descriptors, rdMolDescriptors, AllChem
from rdkit.Chem.Descriptors import MolWt
def calculate_properties(smiles):
    """Compute a dictionary of RDKit descriptors for one SMILES string.

    Parameters
    ----------
    smiles : str
        SMILES representation of the molecule.

    Returns
    -------
    dict or None
        Mapping of descriptor name to value, or ``None`` when ``smiles`` is
        not a non-empty string (e.g. a NaN from a missing CSV cell), cannot
        be parsed by RDKit, or parses to a molecule with no atoms.

    Notes
    -----
    * Several keys are intentional duplicates kept so that existing output
      columns do not disappear ('Rotatable Bonds' / 'Rotatable Bond Count',
      'Number of Aromatic Rings' / 'Aromatic Ring Count',
      'Molar Refractivity' / 'Wildman-Crippen MR').
    * 'Natural Product Likeness Score' is still a placeholder that repeats
      the QED value; it is NOT a real NP-likeness score.
    * 'Num Macrocycles' counts rings with more than 8 atoms.
    """
    # Missing values read from a CSV arrive as float NaN, and an empty string
    # parses to a zero-atom molecule; neither can yield meaningful descriptors.
    if not isinstance(smiles, str) or not smiles.strip():
        return None

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None

    # GetNumAtoms() counts the atoms explicitly present in the graph
    # (implicit hydrogens are not included). It is used as a denominator
    # below, so a zero-atom molecule must be rejected here.
    num_atoms = mol.GetNumAtoms()
    if num_atoms == 0:
        return None

    # Values that feed more than one entry are computed once.
    qed_score = QED.qed(mol)
    mol_wt = Descriptors.MolWt(mol)
    rotatable_bonds = Descriptors.NumRotatableBonds(mol)
    aromatic_rings = rdMolDescriptors.CalcNumAromaticRings(mol)
    fraction_csp3 = rdMolDescriptors.CalcFractionCSP3(mol)
    atom_rings = mol.GetRingInfo().AtomRings()

    # Fraction of atoms whose hybridization is sp2. The original code stored
    # the sp3 carbon fraction under this key, duplicating 'Fsp3'.
    sp2_atoms = sum(
        1 for atom in mol.GetAtoms()
        if atom.GetHybridization() == Chem.HybridizationType.SP2
    )
    fraction_sp2 = sp2_atoms / num_atoms

    # Heteroatoms (anything but H and C) that sit in a ring and are not
    # aromatic. Excluding aromatic atoms stops this count from overlapping
    # with 'Aromatic Heteroatom Count'.
    aliphatic_ring_heteroatoms = sum(
        1 for atom in mol.GetAtoms()
        if atom.IsInRing()
        and not atom.GetIsAromatic()
        and atom.GetAtomicNum() not in [1, 6]
    )
    aromatic_heteroatoms = sum(
        1 for atom in mol.GetAtoms()
        if atom.GetIsAromatic() and atom.GetAtomicNum() not in [1, 6]
    )

    properties = {
        'QED': qed_score,
        'logP': Crippen.MolLogP(mol),
        'Molecular Weight': mol_wt,
        'Number of Heavy Atoms': Descriptors.HeavyAtomCount(mol),
        'TPSA': Descriptors.TPSA(mol),
        'Rotatable Bonds': rotatable_bonds,
        'H-bond Donors': Descriptors.NumHDonors(mol),
        'H-bond Acceptors': Descriptors.NumHAcceptors(mol),
        'Ring Count': Descriptors.RingCount(mol),
        'Formal Charge': Chem.GetFormalCharge(mol),
        'Fraction of Sp2 Hybridized Atoms': fraction_sp2,
        'Number of Aromatic Rings': aromatic_rings,
        'Molar Refractivity': Descriptors.MolMR(mol),
        'Rotatable Bond Count': rotatable_bonds,
        'Number of Heteroatoms': rdMolDescriptors.CalcNumHeteroatoms(mol),
        'Ipc': Descriptors.Ipc(mol),
        'Kappa1': Descriptors.Kappa1(mol),
        'Kappa2': Descriptors.Kappa2(mol),
        'Kappa3': Descriptors.Kappa3(mol),
        'LabuteASA': Descriptors.LabuteASA(mol),
        'PEOE_VSA1': Descriptors.PEOE_VSA1(mol),
        'PEOE_VSA2': Descriptors.PEOE_VSA2(mol),
        'PEOE_VSA3': Descriptors.PEOE_VSA3(mol),
        'SMR_VSA1': Descriptors.SMR_VSA1(mol),
        'SMR_VSA2': Descriptors.SMR_VSA2(mol),
        'SMR_VSA3': Descriptors.SMR_VSA3(mol),
        'SlogP_VSA1': Descriptors.SlogP_VSA1(mol),
        'SlogP_VSA2': Descriptors.SlogP_VSA2(mol),
        'SlogP_VSA3': Descriptors.SlogP_VSA3(mol),
        # Number of set bits in the radius-2 Morgan fingerprints
        # (feature-based and atom-based variants).
        'FCFP4 Count': len(AllChem.GetMorganFingerprintAsBitVect(mol, 2, useFeatures=True).GetOnBits()),
        'ECFP4 Count': len(AllChem.GetMorganFingerprintAsBitVect(mol, 2, useFeatures=False).GetOnBits()),
        'Num Bridgehead Atoms': rdMolDescriptors.CalcNumBridgeheadAtoms(mol),
        'Num Spiro Atoms': rdMolDescriptors.CalcNumSpiroAtoms(mol),
        'Num Macrocycles': sum(1 for ring in atom_rings if len(ring) > 8),
        'Fsp3': fraction_csp3,
        'Ring Atom Count': sum(1 for atom in mol.GetAtoms() if atom.IsInRing()),
        'Ring Bond Count': sum(1 for bond in mol.GetBonds() if bond.IsInRing()),
        'Aliphatic Ring Count': rdMolDescriptors.CalcNumAliphaticRings(mol),
        'Aliphatic Heteroatom Count': aliphatic_ring_heteroatoms,
        'Aromatic Heteroatom Count': aromatic_heteroatoms,
        'Saturated Ring Count': rdMolDescriptors.CalcNumSaturatedRings(mol),
        'Aromatic Ring Count': aromatic_rings,
        'Hetero-Aliphatic Ring Count': rdMolDescriptors.CalcNumAliphaticHeterocycles(mol),
        'Hetero-Aromatic Ring Count': rdMolDescriptors.CalcNumAromaticHeterocycles(mol),
        'Hetero-Saturated Ring Count': rdMolDescriptors.CalcNumSaturatedHeterocycles(mol),
        'Largest Ring Size': max(len(ring) for ring in atom_rings) if atom_rings else 0,
        'Wildman-Crippen MR': Crippen.MolMR(mol),
        # Molecular weight divided by the number of graph atoms.
        'Average Molecular Weight': mol_wt / num_atoms,
        'Exact Molecular Weight': Descriptors.ExactMolWt(mol),
        'Atom Pair Fingerprint Count': len(AllChem.GetAtomPairFingerprint(mol).GetNonzeroElements()),
        # FIXME: placeholder - this repeats QED, it is not an NP-likeness score.
        'Natural Product Likeness Score': qed_score,
    }

    return properties


# Input: the filtered table (SMILES, descriptors, frag_likeliness) written by
# the previous step.
input_file = 'data/curated dataset/filtered_curated_GS_LF_merged_4983.csv'

df = pd.read_csv(input_file)


smiles_column = 'SMILES'

if smiles_column not in df.columns:
    # Raise instead of calling exit(): inside a notebook exit() acts on the
    # kernel itself rather than reporting the problem in the cell output.
    raise KeyError(
        f"'{smiles_column}' column not found in the CSV file. "
        f"Available columns: {df.columns.tolist()}"
    )

data = []
skipped = 0
# Count rows by position so the progress message stays correct even when the
# frame does not carry a default 0..n-1 index.
for position, (_, row) in enumerate(df.iterrows(), start=1):
    smiles = row[smiles_column]
    properties = calculate_properties(smiles)
    if properties:
        # Carry the original columns along with the computed descriptors.
        for col in df.columns:
            properties[col] = row[col]
        data.append(properties)
    else:
        skipped += 1

    if position % 100 == 0:
        print(f"Processed {position} molecules...")

if skipped:
    print(f"Skipped {skipped} rows with missing or unparsable SMILES.")

if not data:
    # Without any record the result frame would have no columns and the
    # reordering below would fail with an opaque error.
    raise ValueError(f"No valid molecules could be parsed from {input_file}")

result_df = pd.DataFrame(data)
# Put the SMILES column first; every other column keeps its relative order.
columns = [smiles_column] + [col for col in result_df.columns if col != smiles_column]
result_df = result_df[columns]

output_file = 'data/curated dataset/curated_dataset_molecular_properties.xlsx'
result_df.to_excel(output_file, index=False)
print(f"Properties calculated and saved to {output_file}")

Processed 100 molecules...
Processed 200 molecules...
Processed 300 molecules...
Processed 400 molecules...
Processed 500 molecules...
Processed 600 molecules...
Processed 700 molecules...
Processed 800 molecules...
Processed 900 molecules...
Processed 1000 molecules...
Processed 1100 molecules...
Processed 1200 molecules...
Processed 1300 molecules...
Processed 1400 molecules...
Processed 1500 molecules...
Processed 1600 molecules...
Processed 1700 molecules...
Processed 1800 molecules...
Processed 1900 molecules...
Processed 2000 molecules...
Processed 2100 molecules...
Processed 2200 molecules...
Processed 2300 molecules...
Processed 2400 molecules...
Processed 2500 molecules...
Processed 2600 molecules...
Processed 2700 molecules...
Processed 2800 molecules...
Processed 2900 molecules...
Processed 3000 molecules...
Processed 3100 molecules...
Processed 3200 molecules...
Processed 3300 molecules...
Processed 3400 molecules...
Processed 3500 molecules...
Processed 3600 molecules...
P