In [None]:
pip install rdkit pandas seaborn tqdm mols2grid

Collecting rdkit
  Downloading rdkit-2025.9.3-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (4.2 kB)
Collecting mols2grid
  Downloading mols2grid-2.1.0-py3-none-any.whl.metadata (15 kB)
Collecting jedi>=0.16 (from ipython>=4.0.0->ipywidgets>=7.6.0->anywidget->mols2grid)
  Downloading jedi-0.19.2-py2.py3-none-any.whl.metadata (22 kB)
Downloading rdkit-2025.9.3-cp312-cp312-manylinux_2_28_x86_64.whl (36.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m36.4/36.4 MB[0m [31m28.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading mols2grid-2.1.0-py3-none-any.whl (83 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m83.1/83.1 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading jedi-0.19.2-py2.py3-none-any.whl (1.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m64.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit, jedi, mols2grid
Successfully installed jedi-0.19.2 mols2grid-2.1.0 rdkit

In [None]:
from rdkit import Chem
from rdkit.ML.Cluster import Butina
from rdkit.Chem import rdFingerprintGenerator
from rdkit.Chem import DataStructs

In [None]:
def butina_cluster(mol_list, cutoff=0.35, radius=3, fp_size=2048):
  """Assign a Butina cluster number to every molecule in ``mol_list``.

  Each molecule is converted to a Morgan bit-vector fingerprint, the pairwise
  Tanimoto distances (1 - similarity) are collected, and the distances are
  handed to ``Butina.ClusterData``.

  Args:
    mol_list: Iterable of RDKit molecules (for example the values of a
      dataframe column). It is materialised into a list once.
    cutoff: Distance threshold forwarded to ``Butina.ClusterData``.
    radius: Radius of the Morgan fingerprint.
    fp_size: Number of bits in each fingerprint.

  Returns:
    A list with one integer per input molecule, in input order. Clusters are
    numbered from 1 in the order ``Butina.ClusterData`` returns them. An empty
    input yields an empty list.

  Raises:
    ValueError: If any entry of ``mol_list`` is ``None`` (e.g. a SMILES string
      that could not be parsed upstream).
  """
  mols = list(mol_list)

  # Fail early with a readable message instead of an opaque error from the
  # fingerprint generator when a molecule is missing.
  missing = [pos for pos, mol in enumerate(mols) if mol is None]
  if missing:
    raise ValueError(
      f"mol_list contains None at position(s) {missing}; "
      "remove or fix molecules that failed to parse before clustering"
    )

  num_fps = len(mols)
  if num_fps == 0:
    return []

  generator = rdFingerprintGenerator.GetMorganGenerator(radius=radius, fpSize=fp_size)
  fp_list = [generator.GetFingerprint(mol) for mol in mols]

  # Flattened lower-triangle distance matrix: for molecule i, the distances to
  # molecules 0..i-1, appended in order of increasing i.
  distances = []
  for i in range(1, num_fps):
    distances.extend(
      DataStructs.BulkTanimotoSimilarity(fp_list[i], fp_list[:i], returnDistance=True)
    )

  mol_clusters = Butina.ClusterData(distances, num_fps, cutoff, isDistData=True)

  # Translate "cluster -> member indices" into "molecule index -> cluster number".
  cluster_id_list = [0] * num_fps
  for idx, cluster in enumerate(mol_clusters, 1):
    for member in cluster:
      cluster_id_list[member] = idx

  return cluster_id_list


In [None]:
import pandas as pd

# Location of the example dataset that the rest of the notebook works on.
DATA_URL = "https://raw.githubusercontent.com/PatWalters/practical_cheminformatics_tutorials/main/data/dude_erk2_mk01.csv"

df = pd.read_csv(DATA_URL)
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0.1,Unnamed: 0,SMILES,ID,is_active
0,0,Cn1ccnc1Sc2ccc(cc2Cl)Nc3c4cc(c(cc4ncc3C#N)OCCC...,168691,1
1,1,C[C@@]12[C@@H]([C@@H](CC(O1)n3c4ccccc4c5c3c6n2...,86358,1
2,2,Cc1cnc(nc1c2cc([nH]c2)C(=O)N[C@H](CO)c3cccc(c3...,575087,1
3,3,Cc1cnc(nc1c2cc([nH]c2)C(=O)N[C@H](CO)c3cccc(c3...,575065,1
4,4,Cc1cnc(nc1c2cc([nH]c2)C(=O)N[C@H](CO)c3cccc(c3...,575047,1


In [None]:
import mols2grid
# Show the freshly loaded dataframe as a molecule grid (no clustering or LogP columns exist yet).
mols2grid.display(df)

<mols2grid.widget.MolGridWidget object at 0x7b24c32c7290>

In [None]:
# Parse every SMILES string into an RDKit molecule and keep it next to the source row.
df["structure"] = df["SMILES"].map(Chem.MolFromSmiles)

**Cluster the molecules in the dataframe**

In [None]:
# butina_cluster returns one cluster number per molecule, in row order.
cluster_ids = butina_cluster(df["structure"].values)
df["cluster"] = cluster_ids

**View the dataframe with the new `cluster` column**

In [None]:
# Fields shown for each molecule in the grid.
cluster_grid_fields = ['img', 'ID', 'cluster']
mols2grid.display(df, subset=cluster_grid_fields)

<mols2grid.widget.MolGridWidget object at 0x7b24c2a4f740>

Select the molecule from each cluster with the lowest LogP.
- Calculate the LogP for each molecule.
- Put these values into a new column called "LogP".

In [None]:
from rdkit.Chem import Crippen
# Crippen LogP estimate for every molecule, stored alongside its structure.
df["LogP"] = df["structure"].map(Crippen.MolLogP)

In [None]:
# Fields shown for each molecule; LogP is rendered with two decimals.
logp_grid_fields = ['img', 'ID', 'cluster', 'LogP']
mols2grid.display(
    df,
    subset=logp_grid_fields,
    transform={'LogP': lambda value: f"{value:.2f}"},
)

<mols2grid.widget.MolGridWidget object at 0x7b24c2a4fcb0>

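Let's sort the dataframe by cluster and then by LogP, so that within each cluster the molecule with the lowest LogP comes first.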

In [None]:
# Rebind to the sorted frame instead of sorting in place, so re-running this cell is harmless
# and the result reads as a plain assignment.
df = df.sort_values(['cluster', 'LogP'])

In [None]:
# Same grid as before, now in cluster / LogP order; LogP is rendered with two decimals.
sorted_grid_fields = ['img', 'ID', 'cluster', 'LogP']
mols2grid.display(
    df,
    subset=sorted_grid_fields,
    transform={'LogP': lambda value: f'{value:.2f}'},
)

<mols2grid.widget.MolGridWidget object at 0x7b24c2a4fbc0>

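Let's create a new dataframe containing only the molecule with the lowest LogP from each cluster. With the rows ordered by cluster and LogP, keeping the first row of each cluster selects exactly that molecule.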

In [None]:
# Sort inside this cell so the selection does not depend on an earlier cell having ordered df;
# after sorting, the first row kept for each cluster is its lowest-LogP molecule.
df_unique = df.sort_values(['cluster', 'LogP']).drop_duplicates('cluster', keep='first')

In [None]:
# One representative per cluster; LogP is rendered with two decimals.
unique_grid_fields = ['img', 'ID', 'cluster', 'LogP']
mols2grid.display(
    df_unique,
    subset=unique_grid_fields,
    transform={'LogP': lambda value: f'{value:.2f}'},
)

<mols2grid.widget.MolGridWidget object at 0x7b24c2a09e50>