In [3]:
# Install RDKit from conda-forge via pip
!pip install rdkit py3Dmol pandas scikit-learn tqdm torch torchvision torchaudio torch-geometric kagglehub


Collecting rdkit
  Downloading rdkit-2025.3.6-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (4.1 kB)
Collecting py3Dmol
  Downloading py3dmol-2.5.2-py2.py3-none-any.whl.metadata (2.1 kB)
Collecting torch-geometric
  Using cached torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
Downloading rdkit-2025.3.6-cp312-cp312-manylinux_2_28_x86_64.whl (36.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m36.1/36.1 MB[0m [31m60.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading py3dmol-2.5.2-py2.py3-none-any.whl (7.2 kB)
Downloading torch_geometric-2.6.1-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m77.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: py3Dmol, rdkit, torch-geometric
Successfully installed py3Dmol-2.5.2 rdkit-2025.3.6 torch-geometric-2.6.1


In [5]:
import kagglehub, os

# Download QM40 dataset
path = kagglehub.dataset_download("nikitamanaenkov/qm40-molecular-qm-dataset")
print("Dataset path:", path)
os.listdir(path)  # should show main.csv, xyz.csv, bond.csv


Downloading from https://www.kaggle.com/api/v1/datasets/download/nikitamanaenkov/qm40-molecular-qm-dataset?dataset_version_number=1...


100%|██████████| 319M/319M [00:02<00:00, 143MB/s]

Extracting files...





Dataset path: /root/.cache/kagglehub/datasets/nikitamanaenkov/qm40-molecular-qm-dataset/versions/1


['main.csv', 'bond.csv', 'xyz.csv']

In [6]:
import pandas as pd

main_fn = os.path.join(path, "main.csv")
xyz_fn  = os.path.join(path, "xyz.csv")
bond_fn = os.path.join(path, "bond.csv")

df_main = pd.read_csv(main_fn)
df_xyz  = pd.read_csv(xyz_fn)
df_bond = pd.read_csv(bond_fn)

print("main.csv columns:", df_main.columns.tolist())
print("xyz.csv columns:", df_xyz.columns.tolist())
print("bond.csv columns:", df_bond.columns.tolist())


main.csv columns: ['Zinc_id', 'smile', 'Internal_E(0K)', 'HOMO', 'LUMO', 'HL_gap', 'Polarizability', 'spatial extent', 'dipol_mom', 'ZPE', 'rot1', 'rot2', 'rot3', 'Inter_E(298)', 'Enthalpy', 'Free_E', 'CV', 'Entropy']
xyz.csv columns: ['Zinc_id', 'smile', 'atom', 'init_x', 'init_y', 'init_z', 'final_x', 'final_y', 'final_z', 'charge']
bond.csv columns: ['Zinc_id', 'smile', 'atom1', 'atom2', 'bond', 'tag', 'lmod']


In [7]:
periodic = {'H':1,'C':6,'N':7,'O':8,'F':9,'Si':14,'P':15,'S':16,'Cl':17,'Ti':22,'Fe':26,'Cu':29,'Zn':30}
def element_to_Z(e):
    return periodic.get(e.capitalize(), 0)


In [9]:
import torch
from torch_geometric.data import Data
from tqdm import tqdm

MAX_MOLS = 500  # reduce for quick run

mol_ids = df_main['Zinc_id'].unique().tolist()[:MAX_MOLS]
id_to_target = df_main.set_index('Zinc_id')['dipol_mom'].to_dict()  # change property if needed

data_list = []
skipped = 0
for mol_id in tqdm(mol_ids):
    coords = df_xyz[df_xyz['Zinc_id']==mol_id]
    bonds = df_bond[df_bond['Zinc_id']==mol_id]

    # atoms
    coords = coords.sort_values('atom')
    elements = coords['atom'].tolist()
    Z = [element_to_Z(e) for e in elements]
    if any(z==0 for z in Z):
        skipped += 1
        continue
    x = torch.tensor([[z] for z in Z], dtype=torch.float)

    # edges
    if not bonds.empty:
        idx_map = {ai: i for i, ai in enumerate(coords['atom'].tolist())}
        rows, cols, edge_attr = [], [], []
        for _, r in bonds.iterrows():
            a1, a2 = int(r['atom1']), int(r['atom2'])
            if a1 in idx_map and a2 in idx_map:
                i, j = idx_map[a1], idx_map[a2]
                rows += [i,j]; cols += [j,i]
                bo = float(r['bond']) if 'bond' in r.index else 1.0
                edge_attr += [[bo],[bo]]
        edge_index = torch.tensor([rows, cols], dtype=torch.long)
        edge_attr  = torch.tensor(edge_attr, dtype=torch.float)
    else:
        n = x.size(0)
        rows = [i for i in range(n) for j in range(n) if i!=j]
        cols = [j for i in range(n) for j in range(n) if i!=j]
        edge_index = torch.tensor([rows, cols], dtype=torch.long)
        edge_attr = None

    if mol_id not in id_to_target:
        skipped += 1
        continue
    y = torch.tensor([id_to_target[mol_id]], dtype=torch.float)

    data = Data(x=x, edge_index=edge_index, edge_attr=edge_attr, y=y)
    data.mol_id = mol_id
    data.atom_elements = elements
    data.xyz = coords[['final_x','final_y','final_z']].to_numpy()
    data_list.append(data)

print("Graphs built:", len(data_list), "Skipped:", skipped)

100%|██████████| 500/500 [09:40<00:00,  1.16s/it]

Graphs built: 500 Skipped: 0





In [10]:
from torch_geometric.data import DataLoader
from sklearn.model_selection import train_test_split

train_idx, val_idx = train_test_split(list(range(len(data_list))), test_size=0.2, random_state=42)
train_list = [data_list[i] for i in train_idx]
val_list   = [data_list[i] for i in val_idx]

train_loader = DataLoader(train_list, batch_size=32, shuffle=True)
val_loader   = DataLoader(val_list, batch_size=32)
print("Train/Val sizes:", len(train_list), len(val_list))


Train/Val sizes: 400 100




In [11]:
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GINConv, global_add_pool

class SimpleGNN(nn.Module):
    def __init__(self, in_dim, hidden=64):
        super().__init__()
        nn1 = nn.Sequential(nn.Linear(in_dim, hidden), nn.ReLU(), nn.Linear(hidden, hidden))
        self.conv1 = GINConv(nn1)
        nn2 = nn.Sequential(nn.Linear(hidden, hidden), nn.ReLU(), nn.Linear(hidden, hidden))
        self.conv2 = GINConv(nn2)
        self.lin1 = nn.Linear(hidden, hidden//2)
        self.lin2 = nn.Linear(hidden//2, 1)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        batch = data.batch if hasattr(data, 'batch') else torch.zeros(x.size(0), dtype=torch.long, device=x.device)
        x = F.relu(self.conv1(x, edge_index))
        x = F.relu(self.conv2(x, edge_index))
        x = global_add_pool(x, batch)
        x = F.relu(self.lin1(x))
        return self.lin2(x).view(-1)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = SimpleGNN(in_dim=data_list[0].x.shape[1]).to(device)
opt = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5)
loss_fn = nn.L1Loss()
model


SimpleGNN(
  (conv1): GINConv(nn=Sequential(
    (0): Linear(in_features=1, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=64, bias=True)
  ))
  (conv2): GINConv(nn=Sequential(
    (0): Linear(in_features=64, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=64, bias=True)
  ))
  (lin1): Linear(in_features=64, out_features=32, bias=True)
  (lin2): Linear(in_features=32, out_features=1, bias=True)
)

In [13]:
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GINConv, global_add_pool

def evaluate(loader):
    model.eval()
    preds, trues = [], []
    with torch.no_grad():
        for batch in loader:
            batch = batch.to(device)
            out = model(batch)
            preds.append(out.cpu())
            trues.append(batch.y.view(-1).cpu())
    preds = torch.cat(preds).numpy()
    trues = torch.cat(trues).numpy()
    return np.mean(np.abs(preds - trues))  # MAE

n_epochs = 10
for epoch in range(1, n_epochs+1):
    model.train()
    total_loss = 0
    for batch in train_loader:
        batch = batch.to(device)
        opt.zero_grad()
        out = model(batch)
        loss = loss_fn(out, batch.y.view(-1))
        loss.backward()
        opt.step()
        total_loss += loss.item() * batch.num_graphs
    train_mae = evaluate(train_loader)
    val_mae   = evaluate(val_loader)
    print(f"Epoch {epoch} | Loss: {total_loss/len(train_list):.4f} | Train MAE: {train_mae:.4f} | Val MAE: {val_mae:.4f}")

Epoch 1 | Loss: 1.1541 | Train MAE: 1.1515 | Val MAE: 1.2990
Epoch 2 | Loss: 1.1370 | Train MAE: 1.1212 | Val MAE: 1.2740
Epoch 3 | Loss: 1.1462 | Train MAE: 1.1379 | Val MAE: 1.2919
Epoch 4 | Loss: 1.1359 | Train MAE: 1.1191 | Val MAE: 1.2741
Epoch 5 | Loss: 1.1334 | Train MAE: 1.1371 | Val MAE: 1.2880
Epoch 6 | Loss: 1.1473 | Train MAE: 1.1347 | Val MAE: 1.2872
Epoch 7 | Loss: 1.1303 | Train MAE: 1.1161 | Val MAE: 1.2722
Epoch 8 | Loss: 1.1289 | Train MAE: 1.1226 | Val MAE: 1.2738
Epoch 9 | Loss: 1.1209 | Train MAE: 1.1708 | Val MAE: 1.3225
Epoch 10 | Loss: 1.1528 | Train MAE: 1.1318 | Val MAE: 1.2881


In [14]:
import py3Dmol
samples = [data_list[i] for i in np.random.choice(len(data_list), size=min(5,len(data_list)), replace=False)]

for d in samples:
    d_batch = d.to(device)
    with torch.no_grad():
        pred = model(d_batch.unsqueeze(0) if hasattr(d_batch, 'unsqueeze') else d_batch).item()
    print(f"mol_id: {d.mol_id} | atoms: {'-'.join(d.atom_elements)} | predicted dipole_moment: {pred:.4f}")

    xyz_lines = [f"{el} {x:.6f} {y:.6f} {z:.6f}" for el, (x,y,z) in zip(d.atom_elements, d.xyz)]
    xyz_block = f"{len(d.atom_elements)}\n\n" + "\n".join(xyz_lines)
    view = py3Dmol.view(width=350, height=250)
    view.addModel(xyz_block, 'xyz')
    view.setStyle({'stick':{}})
    view.zoomTo()
    display(view)


mol_id: ZINC000346309993 | atoms: C-C-C-C-C-C-C-C-C-C-C-C-C-H-H-H-H-H-H-H-H-H-H-H-H-H-H-H-H-H-H-H-H-H-N-N-N-N-N-S | predicted dipole_moment: 3.8306


<py3Dmol.view at 0x7f010fac9d00>

mol_id: ZINC000633145467 | atoms: C-C-C-C-C-C-C-C-C-C-C-F-F-H-H-H-H-H-H-H-H-H-H-H-H-H-H-H-N-N-N-O-S | predicted dipole_moment: 3.5525


<py3Dmol.view at 0x7f010f996b10>

mol_id: ZINC000634040467 | atoms: C-C-C-C-C-C-C-C-C-C-C-C-H-H-H-H-H-H-H-H-H-H-H-H-H-H-H-H-H-H-N-N-O-O-S | predicted dipole_moment: 3.4451


<py3Dmol.view at 0x7f010fac9d00>

mol_id: ZINC000348595730 | atoms: C-C-C-C-C-C-C-C-C-C-C-C-C-C-H-H-H-H-H-H-H-H-H-H-H-H-H-H-H-H-H-H-H-H-H-H-H-H-N-N-O-O | predicted dipole_moment: 3.6447


<py3Dmol.view at 0x7f01fecdef60>

mol_id: ZINC000633835723 | atoms: C-C-C-C-C-C-C-C-C-C-C-C-Cl-H-H-H-H-H-H-H-H-H-H-H-H-H-H-H-H-H-N-N-O-O | predicted dipole_moment: 3.4214


<py3Dmol.view at 0x7f010f35ad80>

In [15]:
!pip install selfies


Collecting selfies
  Downloading selfies-2.2.0-py3-none-any.whl.metadata (14 kB)
Downloading selfies-2.2.0-py3-none-any.whl (36 kB)
Installing collected packages: selfies
Successfully installed selfies-2.2.0


In [19]:
import selfies as sf
import pandas as pd
from tqdm import tqdm
import os

# Load molecules from your dataset (main.csv)
main_fn = os.path.join(path, "main.csv")
df_main = pd.read_csv(main_fn)

# Convert SMILES → SELFIES
selfies_list = []

print("Columns in df_main:", df_main.columns.tolist()) # Add this line to check columns

if 'smile' in df_main.columns:
    for smi in tqdm(df_main['smile'][:5000]):  # limit for quick training
        try:
            sfs = sf.encoder(smi)
            if sfs:
                selfies_list.append(sfs)
        except Exception as e:
            print(f"Could not encode SMILES '{smi}': {e}")
            continue
else:
    print("Error: 'smile' column not found in df_main.")

print("Number of SELFIES:", len(selfies_list))

Columns in df_main: ['Zinc_id', 'smile', 'Internal_E(0K)', 'HOMO', 'LUMO', 'HL_gap', 'Polarizability', 'spatial extent', 'dipol_mom', 'ZPE', 'rot1', 'rot2', 'rot3', 'Inter_E(298)', 'Enthalpy', 'Free_E', 'CV', 'Entropy']


100%|██████████| 5000/5000 [00:01<00:00, 4864.68it/s]

Number of SELFIES: 5000





In [22]:
import torch
import selfies as sf

# Build vocabulary
all_tokens = set()
for s in selfies_list:
    all_tokens.update(sf.split_selfies(s))

token2idx = {t:i+1 for i,t in enumerate(sorted(all_tokens))}
token2idx['<PAD>'] = 0
idx2token = {i:t for t,i in token2idx.items()}

# Convert SELFIES → indices
max_len = max(len(list(sf.split_selfies(s))) for s in selfies_list)
def selfies_to_tensor(s):
    tokens = sf.split_selfies(s)
    ids = [token2idx[t] for t in tokens]
    ids += [0]*(max_len - len(ids))
    return torch.tensor(ids, dtype=torch.long)

selfies_tensor = torch.stack([selfies_to_tensor(s) for s in selfies_list])
print("Tensor shape:", selfies_tensor.shape)

Tensor shape: torch.Size([5000, 39])


In [23]:
import torch.nn as nn

class SELFIES_RNN(nn.Module):
    def __init__(self, vocab_size, emb_size=128, hidden_size=256, num_layers=2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size, padding_idx=0)
        self.rnn = nn.LSTM(emb_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, hidden=None):
        emb = self.embedding(x)
        out, hidden = self.rnn(emb, hidden)
        out = self.fc(out)
        return out, hidden

vocab_size = len(token2idx)
rnn_model = SELFIES_RNN(vocab_size).to("cuda" if torch.cuda.is_available() else "cpu")


In [24]:
from torch.utils.data import DataLoader, TensorDataset
import torch.nn.functional as F

device = "cuda" if torch.cuda.is_available() else "cpu"
dataset = TensorDataset(selfies_tensor[:, :-1], selfies_tensor[:, 1:])
loader = DataLoader(dataset, batch_size=64, shuffle=True)
opt = torch.optim.Adam(rnn_model.parameters(), lr=1e-3)
n_epochs = 5  # increase for better molecules

for epoch in range(n_epochs):
    total_loss = 0
    for x, y in loader:
        x, y = x.to(device), y.to(device)
        opt.zero_grad()
        out, _ = rnn_model(x)
        loss = F.cross_entropy(out.view(-1, vocab_size), y.view(-1))
        loss.backward()
        opt.step()
        total_loss += loss.item() * x.size(0)
    print(f"Epoch {epoch+1}, Loss: {total_loss/len(loader.dataset):.4f}")


Epoch 1, Loss: 1.8896
Epoch 2, Loss: 1.2572
Epoch 3, Loss: 1.0320
Epoch 4, Loss: 0.9367
Epoch 5, Loss: 0.8750


In [25]:
import random

def sample_selfies(model, max_len=20, temperature=1.0):
    model.eval()
    x = torch.tensor([[0]], device=device)  # start token <PAD>
    hidden = None
    tokens = []
    for _ in range(max_len):
        out, hidden = model(x, hidden)
        probs = F.softmax(out[:, -1, :]/temperature, dim=-1)
        idx = torch.multinomial(probs, num_samples=1).item()
        if idx == 0: break
        tokens.append(idx2token[idx])
        x = torch.tensor([[idx]], device=device)
    try:
        smi = sf.decoder(''.join(tokens))
        return smi
    except:
        return None

generated_smiles = [sample_selfies(rnn_model) for _ in range(20)]
generated_smiles = [s for s in generated_smiles if s]
print("Generated SMILES:", generated_smiles)


Generated SMILES: ['[NH1+1]CCOC=O', '[NH1+1]#C', 'S\\Cl', 'O1[NH1][C@H1]1NC=C[C@@H1]CO', '[C@@H1][C@@H1](N)C=C1[C@@](Cl)C=CC1NC=N', 'CCNCC[C@@H1](C)N[C@@H1]C', 'Cl', '[NH1+1][S@]CC(C)CCNC=CC=CC', 'CC=NC=C(O)C(CC=NOCC)=C', 'COC=O', 'O=[C@@H1]CCC1=CN[C@](C)=C1CO', 'N[C@]=C=O', 'O=C(COC)C1CC1C=N', 'C[C@H1]C[C@@H1]N[C@@H1]C1[C@@H1](C)OC1C[C@H1]', '[N-1]Cl', 'O=O', '[N@@H1+1]SCC(=O)N[C@@H1]S[C@H1](C)C[C@H1](O)C', 'S[N+1][NH1+1]=CNC(=O)[C@H1](C)C=NC=C', '[C@H1]']


In [28]:
candidate_graphs = [smiles_to_graph(s) for s in generated_smiles]
candidate_graphs = [g for g in candidate_graphs if g is not None]


In [27]:
from rdkit import Chem
from torch_geometric.data import Data
import torch

def smiles_to_graph(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None

    # atoms
    atoms = [atom.GetAtomicNum() for atom in mol.GetAtoms()]
    x = torch.tensor([[z] for z in atoms], dtype=torch.float)

    # bonds
    row, col, edge_type = [], [], []
    for bond in mol.GetBonds():
        start, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
        row += [start, end]
        col += [end, start]
        edge_type += [bond.GetBondTypeAsDouble()] * 2

    edge_index = torch.tensor([row, col], dtype=torch.long)
    edge_attr = torch.tensor(edge_type, dtype=torch.float).unsqueeze(1)

    data = Data(x=x, edge_index=edge_index, edge_attr=edge_attr)
    return data

In [29]:
model.eval()  # your trained GNN
predictions = []
for g in candidate_graphs:
    g = g.to(device)
    with torch.no_grad():
        pred = model(g.unsqueeze(0) if hasattr(g, 'unsqueeze') else g).item()
    predictions.append(pred)

for s, p in zip(generated_smiles, predictions):
    print(f"SMILES: {s} | Predicted property: {p:.4f}")


SMILES: [NH1+1]CCOC=O | Predicted property: 5.9374
SMILES: [NH1+1]#C | Predicted property: 1.0612
SMILES: S\Cl | Predicted property: 2.6301
SMILES: O1[NH1][C@H1]1NC=C[C@@H1]CO | Predicted property: 10.9060
SMILES: [C@@H1][C@@H1](N)C=C1[C@@](Cl)C=CC1NC=N | Predicted property: 16.6439
SMILES: CCNCC[C@@H1](C)N[C@@H1]C | Predicted property: 10.2053
SMILES: Cl | Predicted property: 0.3573
SMILES: [NH1+1][S@]CC(C)CCNC=CC=CC | Predicted property: 14.9064
SMILES: CC=NC=C(O)C(CC=NOCC)=C | Predicted property: 15.3165
SMILES: COC=O | Predicted property: 3.6464
SMILES: O=[C@@H1]CCC1=CN[C@](C)=C1CO | Predicted property: 14.2145
SMILES: N[C@]=C=O | Predicted property: 3.4307
SMILES: O=C(COC)C1CC1C=N | Predicted property: 12.1197
SMILES: C[C@H1]C[C@@H1]N[C@@H1]C1[C@@H1](C)OC1C[C@H1] | Predicted property: 15.3108
SMILES: [N-1]Cl | Predicted property: 1.9244
SMILES: O=O | Predicted property: 1.2967
SMILES: [N@@H1+1]SCC(=O)N[C@@H1]S[C@H1](C)C[C@H1](O)C | Predicted property: 18.9041
SMILES: S[N+1][NH1+1]

In [30]:
from rdkit import Chem

def is_chemically_valid(smiles):
    try:
        mol = Chem.MolFromSmiles(smiles)
        return mol is not None
    except:
        return False

valid_smiles = [s for s in generated_smiles if is_chemically_valid(s)]
print("Chemically valid molecules:", valid_smiles)


Chemically valid molecules: ['[NH1+1]CCOC=O', '[NH1+1]#C', 'S\\Cl', 'O1[NH1][C@H1]1NC=C[C@@H1]CO', '[C@@H1][C@@H1](N)C=C1[C@@](Cl)C=CC1NC=N', 'CCNCC[C@@H1](C)N[C@@H1]C', 'Cl', '[NH1+1][S@]CC(C)CCNC=CC=CC', 'CC=NC=C(O)C(CC=NOCC)=C', 'COC=O', 'O=[C@@H1]CCC1=CN[C@](C)=C1CO', 'N[C@]=C=O', 'O=C(COC)C1CC1C=N', 'C[C@H1]C[C@@H1]N[C@@H1]C1[C@@H1](C)OC1C[C@H1]', '[N-1]Cl', 'O=O', '[N@@H1+1]SCC(=O)N[C@@H1]S[C@H1](C)C[C@H1](O)C', 'S[N+1][NH1+1]=CNC(=O)[C@H1](C)C=NC=C', '[C@H1]']


In [31]:
from rdkit import Chem
from rdkit.Chem import AllChem
import py3Dmol

def visualize_3d(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        print(f"Invalid SMILES: {smiles}")
        return

    # Add hydrogens and compute 3D coordinates
    mol = Chem.AddHs(mol)
    AllChem.EmbedMolecule(mol, randomSeed=42)
    AllChem.UFFOptimizeMolecule(mol)

    # Convert to MolBlock for Py3Dmol
    mol_block = Chem.MolToMolBlock(mol)

    # Render in 3D
    view = py3Dmol.view(width=400, height=400)
    view.addModel(mol_block, 'mol')
    view.setStyle({'stick': {}})
    view.setBackgroundColor('0xeeeeee')
    view.zoomTo()
    return view.show()

# Visualize all valid molecules
for smi in valid_smiles[:5]:  # visualize first 5 for speed
    visualize_3d(smi)


In [33]:
from rdkit import Chem
from rdkit.Chem import AllChem, DataStructs

# Convert dataset SMILES to RDKit molecules and fingerprints
dataset_mols = [Chem.MolFromSmiles(s) for s in df_main['smile'] if Chem.MolFromSmiles(s) is not None]
dataset_fps = [AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048) for mol in dataset_mols]

# Convert generated SMILES to fingerprints
gen_mols = [Chem.MolFromSmiles(s) for s in valid_smiles]
gen_fps = [AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048) for mol in gen_mols]

# Compute maximum similarity to any dataset molecule
novel_molecules = []
for smi, fp in zip(valid_smiles, gen_fps):
    similarities = [DataStructs.TanimotoSimilarity(fp, dfp) for dfp in dataset_fps]
    max_sim = max(similarities)
    if max_sim < 0.7:  # threshold for novelty
        novel_molecules.append((smi, max_sim))

print("Truly novel molecules (similarity < 0.7):")
for smi, sim in novel_molecules:
    print(f"SMILES: {smi} | Max similarity: {sim:.3f}")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m


Truly novel molecules (similarity < 0.7):
SMILES: [NH1+1]CCOC=O | Max similarity: 0.194
SMILES: [NH1+1]#C | Max similarity: 0.080
SMILES: S\Cl | Max similarity: 0.100
SMILES: O1[NH1][C@H1]1NC=C[C@@H1]CO | Max similarity: 0.191
SMILES: [C@@H1][C@@H1](N)C=C1[C@@](Cl)C=CC1NC=N | Max similarity: 0.192
SMILES: CCNCC[C@@H1](C)N[C@@H1]C | Max similarity: 0.239
SMILES: Cl | Max similarity: 0.067
SMILES: [NH1+1][S@]CC(C)CCNC=CC=CC | Max similarity: 0.245
SMILES: CC=NC=C(O)C(CC=NOCC)=C | Max similarity: 0.298
SMILES: COC=O | Max similarity: 0.190
SMILES: O=[C@@H1]CCC1=CN[C@](C)=C1CO | Max similarity: 0.239
SMILES: N[C@]=C=O | Max similarity: 0.143
SMILES: O=C(COC)C1CC1C=N | Max similarity: 0.350
SMILES: C[C@H1]C[C@@H1]N[C@@H1]C1[C@@H1](C)OC1C[C@H1] | Max similarity: 0.188
SMILES: [N-1]Cl | Max similarity: 0.100
SMILES: O=O | Max similarity: 0.091
SMILES: [N@@H1+1]SCC(=O)N[C@@H1]S[C@H1](C)C[C@H1](O)C | Max similarity: 0.341
SMILES: S[N+1][NH1+1]=CNC(=O)[C@H1](C)C=NC=C | Max similarity: 0.265
SMIL

In [34]:
import requests

def check_pubchem(smiles):
    """
    Check if a molecule exists in PubChem by SMILES.
    Returns CID if found, else None.
    """
    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/{smiles}/cids/JSON"
    try:
        response = requests.get(url, timeout=10)
        data = response.json()
        cids = data.get("IdentifierList", {}).get("CID", [])
        if cids:
            return cids[0]  # return first CID
        else:
            return None
    except Exception as e:
        print(f"Error checking {smiles}: {e}")
        return None

# Example usage on generated molecules
results = []
for smi in valid_smiles:
    cid = check_pubchem(smi)
    if cid:
        results.append((smi, cid, "Exists in PubChem"))
    else:
        results.append((smi, None, "Novel molecule"))

# Display results
for smi, cid, status in results:
    print(f"SMILES: {smi} | CID: {cid} | Status: {status}")


Error checking [NH1+1]#C: Expecting value: line 1 column 1 (char 0)
SMILES: [NH1+1]CCOC=O | CID: 152756574 | Status: Exists in PubChem
SMILES: [NH1+1]#C | CID: None | Status: Novel molecule
SMILES: S\Cl | CID: 14228601 | Status: Exists in PubChem
SMILES: O1[NH1][C@H1]1NC=C[C@@H1]CO | CID: None | Status: Novel molecule
SMILES: [C@@H1][C@@H1](N)C=C1[C@@](Cl)C=CC1NC=N | CID: None | Status: Novel molecule
SMILES: CCNCC[C@@H1](C)N[C@@H1]C | CID: 124301341 | Status: Exists in PubChem
SMILES: Cl | CID: 313 | Status: Exists in PubChem
SMILES: [NH1+1][S@]CC(C)CCNC=CC=CC | CID: None | Status: Novel molecule
SMILES: CC=NC=C(O)C(CC=NOCC)=C | CID: None | Status: Novel molecule
SMILES: COC=O | CID: 7865 | Status: Exists in PubChem
SMILES: O=[C@@H1]CCC1=CN[C@](C)=C1CO | CID: None | Status: Novel molecule
SMILES: N[C@]=C=O | CID: 5287405 | Status: Exists in PubChem
SMILES: O=C(COC)C1CC1C=N | CID: None | Status: Novel molecule
SMILES: C[C@H1]C[C@@H1]N[C@@H1]C1[C@@H1](C)OC1C[C@H1] | CID: None | Status: 