In [24]:
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

# Function to convert SMARTS to RDKit molecule object
def smarts_to_molecule(smarts):
    mol = Chem.MolFromSmarts(smarts)
    if mol is None:
        raise ValueError(f"Invalid SMARTS string: {smarts}")
    return mol

# Function to calculate molecular descriptors
def calculate_descriptors(mol):
    descriptors = {
        "MolWt": Descriptors.MolWt(mol),
        "TPSA": Descriptors.TPSA(mol),
        "NumAtoms": Descriptors.NumAtoms(mol),
        "NumBonds": Descriptors.NumBonds(mol),
        "HeavyAtomCount": Descriptors.HeavyAtomCount(mol),
        "ExactMolWt": Descriptors.ExactMolWt(mol),
        "MolLogP": Descriptors.MolLogP(mol),
        "MaxPartialCharge": Descriptors.MaxPartialCharge(mol),
        "MinPartialCharge": Descriptors.MinPartialCharge(mol),
    }
    return descriptors

# Sample dataset (replace this with a dataset of known molecules and their CCS)
data = {
    'SMARTS': ['C1=CC=CC=C1', 'CCO', 'CC(C)C(=O)O', 'C1CCCC1'],  # Example SMARTS strings
    'CCS': [250, 150, 200, 180]  # Example CCS values (in Å²)
}

# Create a DataFrame from the example dataset
df = pd.DataFrame(data)

# Extract descriptors for all molecules in the dataset
descriptor_list = []
for smarts in df['SMARTS']:
    mol = smarts_to_molecule(smarts)
    descriptors = calculate_descriptors(mol)
    descriptor_list.append(descriptors)

# Convert descriptor list to DataFrame
descriptor_df = pd.DataFrame(descriptor_list)

# Prepare feature matrix X and target vector y
X = descriptor_df.values
y = df['CCS'].values

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Random Forest model to predict CCS
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict CCS for the test set
y_pred = model.predict(X_test)

# Evaluate model performance
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error: {mae}")

# Function to estimate CCS for a new molecule from SMARTS
def estimate_ccs(smarts):
    mol = smarts_to_molecule(smarts)
    descriptors = calculate_descriptors(mol)
    descriptor_values = np.array(list(descriptors.values())).reshape(1, -1)
    predicted_ccs = model.predict(descriptor_values)
    return predicted_ccs[0]

# Example usage
new_smarts = 'CCO'  # Example SMARTS string for ethanol
estimated_ccs = estimate_ccs(new_smarts)
print(f"Estimated CCS for {new_smarts}: {estimated_ccs:.2f} Å²")



[15:41:20] 

****
Pre-condition Violation
getNumImplicitHs() called without preceding call to calcImplicitValence()
Violation occurred on line 289 in file C:\rdkit\build\temp.win-amd64-cpython-311\Release\rdkit\Code\GraphMol\Atom.cpp
Failed Expression: d_implicitValence > -1
****



RuntimeError: Pre-condition Violation
	getNumImplicitHs() called without preceding call to calcImplicitValence()
	Violation occurred on line 289 in file Code\GraphMol\Atom.cpp
	Failed Expression: d_implicitValence > -1
	RDKIT: 2024.03.6
	BOOST: 1_85
