In [1]:
from rdkit import Chem
from rdkit.Chem import Descriptors
import pandas as pd
import numpy as np

In [2]:
# Example data loading
# The CSV must provide a 'SMILES' column (molecule structures) and a
# 'Solubility' column (regression target); both are read by later cells.
DATA_PATH = 'synthetic_data1.csv'
REQUIRED_COLUMNS = ('SMILES', 'Solubility')

data = pd.read_csv(DATA_PATH)

# Fail fast with a readable message instead of a bare KeyError several cells later.
missing_columns = [col for col in REQUIRED_COLUMNS if col not in data.columns]
if missing_columns:
    raise KeyError(f"{DATA_PATH} is missing required column(s): {missing_columns}")

In [3]:
# Preprocess and compute descriptors
def compute_descriptors(smiles):
    """Return [MolWt, MolLogP, NumHAcceptors, NumHDonors] for a SMILES string.

    Parameters
    ----------
    smiles : str
        SMILES representation of the molecule.

    Returns
    -------
    list
        Four descriptor values in the order above, or four NaNs when
        ``smiles`` is not a string (e.g. a missing CSV cell that pandas read
        as NaN/None) or when RDKit cannot parse it.
    """
    nan_row = [np.nan] * 4

    # RDKit's SMILES parser only accepts strings; a missing value would raise
    # and abort the whole column-wise apply, so treat it as "no molecule".
    if not isinstance(smiles, str):
        return nan_row

    mol = Chem.MolFromSmiles(smiles)
    # A failed parse/sanitization (see the "Explicit valence" warnings RDKit
    # logs) is reported by returning None rather than raising.
    if mol is None:
        return nan_row

    return [
        Descriptors.MolWt(mol),
        Descriptors.MolLogP(mol),
        Descriptors.NumHAcceptors(mol),
        Descriptors.NumHDonors(mol),
    ]

DESCRIPTOR_COLUMNS = ['MW', 'LogP', 'HAcceptors', 'HDonors']

# Build the descriptor table in one shot: a plain list of rows is much cheaper
# than constructing a pd.Series per molecule inside .apply(). The index is
# taken from `data` so rows stay aligned, and dtype=float matches what the
# per-row Series produced (ints and NaNs upcast to float64).
descriptor_rows = [compute_descriptors(smiles) for smiles in data['SMILES']]
data[DESCRIPTOR_COLUMNS] = pd.DataFrame(
    descriptor_rows, index=data.index, columns=DESCRIPTOR_COLUMNS, dtype=float
)

# Drops every row containing a NaN in *any* column; this includes molecules
# whose SMILES could not be parsed, because their descriptors are NaN.
data.dropna(inplace=True)

[12:53:11] Explicit valence for atom # 1 Br, 2, is greater than permitted
[12:53:11] Explicit valence for atom # 3 Br, 2, is greater than permitted
[12:53:11] Explicit valence for atom # 1 Br, 2, is greater than permitted
[12:53:11] Explicit valence for atom # 2 Cl, 2, is greater than permitted
[12:53:11] Explicit valence for atom # 4 Br, 2, is greater than permitted
[12:53:11] Explicit valence for atom # 1 Cl, 2, is greater than permitted
[12:53:11] Explicit valence for atom # 1 Br, 2, is greater than permitted
[12:53:11] Explicit valence for atom # 3 Br, 2, is greater than permitted
[12:53:11] Explicit valence for atom # 4 Cl, 2, is greater than permitted
[12:53:11] Explicit valence for atom # 2 Cl, 2, is greater than permitted
[12:53:11] Explicit valence for atom # 2 Cl, 2, is greater than permitted
[12:53:11] Explicit valence for atom # 4 Cl, 2, is greater than permitted
[12:53:11] Explicit valence for atom # 2 Cl, 2, is greater than permitted
[12:53:11] Explicit valence for atom #

In [4]:
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split

FEATURE_COLUMNS = ['MW', 'LogP', 'HAcceptors', 'HDonors']
TARGET_COLUMN = 'Solubility'
SEED = 42

X = data[FEATURE_COLUMNS]
y = data[TARGET_COLUMN]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

# Seed the model as well as the split, so results stay reproducible if
# subsampling hyperparameters are enabled later.
model = XGBRegressor(random_state=SEED)
model.fit(X_train, y_train)

# Held-out predictions: report an error metric so the fit quality is visible
# instead of computing the predictions and discarding them.
predictions = model.predict(X_test)
test_rmse = float(np.sqrt(np.mean((y_test.to_numpy() - predictions) ** 2)))
print(f"Test RMSE: {test_rmse:.3f}")

# Predictions for every molecule (training rows included); used by the
# interactive viewer below, not as an estimate of model accuracy.
data['Predicted_Solubility'] = model.predict(X)

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [5]:
from ipywidgets import interact, fixed
from rdkit.Chem import Draw

def visualize_molecule(smiles):
    """Render a SMILES string as an image.

    The string is parsed with ``Chem.MolFromSmiles`` and the parsed result is
    passed directly to ``Draw.MolToImage``; whatever that call returns is
    handed back to the caller.
    """
    return Draw.MolToImage(Chem.MolFromSmiles(smiles))

@interact
def show_molecules_above_threshold(threshold=(data['Predicted_Solubility'].min(), data['Predicted_Solubility'].max())):
    """Display every molecule whose predicted solubility exceeds ``threshold``.

    The (min, max) tuple default is what ``interact`` turns into the
    ``threshold`` slider, bounded by the range of predicted values in ``data``.
    Each matching SMILES is drawn with ``visualize_molecule`` and shown inline.
    """
    is_above = data['Predicted_Solubility'] > threshold
    for candidate in data.loc[is_above, 'SMILES']:
        display(visualize_molecule(candidate))


interactive(children=(FloatSlider(value=94.81521892547607, description='threshold', max=174.6139373779297, min…