# Compound Similarity Clustering

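This notebook clusters a small set of compounds by molecular similarity: it computes Morgan fingerprints with RDKit, builds a pairwise Tanimoto similarity matrix, groups the compounds with hierarchical clustering, and visualizes them in 2D with PCA.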

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import DataStructs
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from scipy.spatial.distance import squareform
from sklearn.decomposition import PCA

from notebook_utils import suppress_rdkit_warnings, mol_from_smiles_safe

# Call the helper imported from notebook_utils before any molecules are parsed.
# NOTE(review): presumably silences RDKit's warning/log output — confirm in notebook_utils.
suppress_rdkit_warnings()

## Load Compounds

In [None]:
# Example SMILES list (or load from API)
smiles_list = [
    "CCO",  # Ethanol
    "CC(=O)O",  # Acetic acid
    "c1ccccc1",  # Benzene
    "CCc1ccccc1",  # Ethylbenzene
    "CC(=O)c1ccccc1",  # Acetophenone
    "CCN(CC)CC",  # Triethylamine
    "CC(C)O",  # Isopropanol
    "CCCCCCCCCC(=O)O",  # Decanoic acid
]

# Parse every SMILES once (in list order), then keep only the entries whose
# parse result is truthy so that `molecules` and `valid_smiles` stay aligned.
parsed_pairs = [(smi, mol_from_smiles_safe(smi)) for smi in smiles_list]
kept_pairs = [(smi, parsed) for smi, parsed in parsed_pairs if parsed]

valid_smiles = [smi for smi, _ in kept_pairs]
molecules = [parsed for _, parsed in kept_pairs]

print(f"Loaded {len(molecules)} valid molecules")

## Generate Morgan Fingerprints

In [None]:
# Build one Morgan bit-vector fingerprint per molecule (radius=2, 2048 bits),
# in the same order as `molecules`.
# NOTE(review): newer RDKit releases reportedly deprecate this call in favour of
# rdFingerprintGenerator — confirm against the installed RDKit version.
fingerprints = [
    AllChem.GetMorganFingerprintAsBitVect(molecule, radius=2, nBits=2048)
    for molecule in molecules
]

print(f"Generated {len(fingerprints)} fingerprints")
# The length is read from the first fingerprint.
first_fp = fingerprints[0]
print(f"Fingerprint length: {len(first_fp)} bits")

## Calculate Tanimoto Similarity Matrix

In [None]:
# Calculate pairwise Tanimoto similarities, one matrix row per compound.
# BulkTanimotoSimilarity compares a single fingerprint against the whole list
# in one call, replacing n*n individual TanimotoSimilarity calls from Python.
n = len(fingerprints)
similarity_matrix = np.zeros((n, n))

for i, query_fp in enumerate(fingerprints):
    # Row i holds the similarity of compound i to every compound (including itself).
    similarity_matrix[i, :] = DataStructs.BulkTanimotoSimilarity(query_fp, fingerprints)

# Convert similarity to distance (1 - similarity)
distance_matrix = 1 - similarity_matrix

# Label rows and columns with the SMILES strings so the matrix is readable
df_similarity = pd.DataFrame(similarity_matrix, index=valid_smiles, columns=valid_smiles)
print("Similarity matrix:")
print(df_similarity.round(3))

## Hierarchical Clustering with Dendrogram

In [None]:
# Convert the square distance matrix to the condensed form that linkage expects
condensed_distances = squareform(distance_matrix)

# Perform hierarchical clustering.
# Average linkage (UPGMA) is used because the input is a precomputed
# 1 - Tanimoto dissimilarity. SciPy documents 'ward' (like 'centroid' and
# 'median') as correctly defined only for Euclidean distances, which a
# 1 - Tanimoto dissimilarity is not guaranteed to be.
linkage_matrix = linkage(condensed_distances, method='average')

# Plot dendrogram
plt.figure(figsize=(12, 6))
dendrogram(linkage_matrix, labels=valid_smiles, leaf_rotation=90, leaf_font_size=8)
plt.title('Compound Clustering Dendrogram', fontsize=14, fontweight='bold')
plt.xlabel('Compound (SMILES)', fontsize=12)
plt.ylabel('Distance', fontsize=12)
plt.tight_layout()
plt.show()

# Cut the tree into flat clusters; with criterion='maxclust' the second
# argument is the maximum number of clusters to form (e.g., 3 clusters).
n_clusters = 3
clusters = fcluster(linkage_matrix, n_clusters, criterion='maxclust')
print(f"\nAssigned {n_clusters} clusters:")
for smiles, cluster in zip(valid_smiles, clusters):
    print(f"  {smiles}: Cluster {cluster}")

## 2D Visualization (PCA)

In [None]:
# Fixed seed so the projection is reproducible under Restart & Run All:
# for a wide matrix like this one, scikit-learn's default svd_solver='auto'
# can select the randomized SVD solver, whose result depends on random_state.
PCA_RANDOM_STATE = 42

# Convert fingerprints to a numpy array: one row per compound, one column per bit
fp_array = np.array([np.array(fp) for fp in fingerprints])

# Project the fingerprints onto their first two principal components
pca = PCA(n_components=2, random_state=PCA_RANDOM_STATE)
pca_coords = pca.fit_transform(fp_array)

# Scatter plot of the 2D projection, coloured by the cluster assignment
fig, ax = plt.subplots(figsize=(10, 8))
scatter = ax.scatter(pca_coords[:, 0], pca_coords[:, 1], c=clusters, cmap='viridis', s=100, alpha=0.7)
fig.colorbar(scatter, ax=ax, label='Cluster')

# Annotate each point with (at most) the first 10 characters of its SMILES
for i, smiles in enumerate(valid_smiles):
    ax.annotate(smiles[:10], (pca_coords[i, 0], pca_coords[i, 1]), fontsize=8)

ax.set_xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.1%} variance)', fontsize=12)
ax.set_ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.1%} variance)', fontsize=12)
ax.set_title('Compound Clustering - PCA Visualization', fontsize=14, fontweight='bold')
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()