In [1]:
from rdkit.Chem import AllChem as Chem
from rdkit.Chem import Draw
from scipy.spatial import distance
from sklearn import manifold
from rdkit import DataStructs

from rdkit.SimDivFilters.rdSimDivPickers import MaxMinPicker

import matplotlib.pyplot as plt

In [2]:
# Load the DrugBank SD file and keep only the entries the supplier returned
# as truthy molecule objects (falsy entries are dropped).
supp = Chem.SDMolSupplier("../data/drugbank.sdf")
drugbank = list(filter(None, supp))

# One Morgan bit-vector fingerprint (radius 2, 1024 bits) per retained molecule;
# fps[i] corresponds to drugbank[i].
fps = [
    Chem.GetMorganFingerprintAsBitVect(molecule, 2, nBits=1024)
    for molecule in drugbank
]

In [3]:
# Pairwise Jaccard distances between all fingerprints.
# NOTE(review): despite the name, this is not a sparse matrix -- per the SciPy
# docs `pdist` returns a condensed 1-D distance vector; it is expanded to a
# square matrix with `squareform` in the next cell.
sparse_mat = distance.pdist(fps,metric="jaccard")

In [None]:
# Expand the pdist result into a square matrix so distances can be looked up
# as dist_mat[i, j] (used by the MDS embedding and the picker below).
dist_mat = distance.squareform(sparse_mat)

In [None]:
# Embed the molecules in 2-D from the precomputed distance matrix.
# MDS starts from a random configuration, so without a fixed random_state the
# coordinates (and every plot derived from them) change on each kernel restart.
# The seed value matches the one used for the picker further down.
clf = manifold.MDS(
    n_components=2,
    n_init=1,
    max_iter=100,
    dissimilarity="precomputed",
    random_state=666,
)
X_mds = clf.fit_transform(dist_mat)

In [None]:
# Peek at the first ten embedded coordinates.
X_mds[:10]

In [None]:
def mds_plot(X, *args, highlights=None):
    """Scatter-plot 2-D coordinates, emphasising a chosen subset of points.

    Parameters
    ----------
    X : iterable of (x, y) pairs
        One coordinate pair per point; a point's position in the iteration
        order is its index.
    *args
        Ignored. Accepted only so existing positional call sites keep working.
    highlights : collection of int, optional
        Indices of the points to draw as larger red markers. Every other point
        is drawn as a small black marker. ``None`` (the default) highlights
        nothing.

    Returns
    -------
    None
        The figure is rendered via ``plt.show()``.
    """
    # ``None`` sentinel instead of a shared mutable ``set()`` default; a
    # frozenset also makes the membership test O(1) whatever the caller passed.
    marked = frozenset() if highlights is None else frozenset(highlights)

    # Split the points into two series so each series is drawn with a single
    # plot call instead of creating one line artist per point.
    plain_x, plain_y, marked_x, marked_y = [], [], [], []
    for i, (x, y) in enumerate(X):
        if i in marked:
            marked_x.append(x)
            marked_y.append(y)
        else:
            plain_x.append(x)
            plain_y.append(y)

    _, ax = plt.subplots()
    ax.plot(plain_x, plain_y, "k.", markersize=1)
    # Highlighted points are drawn last so they are never hidden under the
    # background points.
    ax.plot(marked_x, marked_y, "r.", markersize=4)
    plt.show()

In [None]:
# Plot the whole embedding with no points highlighted.
mds_plot(X_mds)

In [None]:
# Substructure query built from a SMILES string (acetylsalicylic acid / aspirin).
pattern = Chem.MolFromSmiles("O=C(C)Oc1ccccc1C(=O)O")

# Positions in `drugbank` of every molecule that contains the query.
pat_idxs = [
    position
    for position, candidate in enumerate(drugbank)
    if candidate.HasSubstructMatch(pattern)
]

In [None]:
# Number of molecules that matched the substructure query.
len(pat_idxs)

In [None]:
# Same embedding, with the substructure matches highlighted.
mds_plot(X_mds,highlights=set(pat_idxs))

In [None]:
# Pairwise similarity (1 - Jaccard distance) restricted to the matched
# molecules: select their rows first, then the same columns.
1-dist_mat[pat_idxs,:][:,pat_idxs]

In [None]:
# Draw every molecule that matched the substructure query.
# Index `drugbank` directly: the previous form scanned all molecules and tested
# `i in pat_idxs` against a list for each one. `pat_idxs` is built in ascending
# order without repeats, so the molecules and their order are unchanged.
Draw.MolsToGridImage([drugbank[i] for i in pat_idxs])

In [None]:
# Disabled alternative callback: computes each distance on demand from the
# fingerprints instead of reading it from the precomputed matrix.
#def distance_function(index1, index2, *args, fingerprints=fps):
#    return 1 - DataStructs.FingerprintSimilarity(fingerprints[index1],fingerprints[index2])

def distance_function(index1, index2, *args, matrix=dist_mat):
    """Return the precomputed distance between the items at two indices.

    Parameters
    ----------
    index1, index2 : int
        Row and column positions to look up in ``matrix``.
    *args
        Ignored.
    matrix : indexable by ``[row, column]``
        Defaults to ``dist_mat``. The default is bound when this ``def`` is
        executed, so this cell must be re-run after ``dist_mat`` is rebuilt.

    Returns
    -------
    The value stored at ``matrix[index1, index2]``.
    """
    return matrix[index1,index2]

# Pick 100 items out of len(drugbank) using the callback above; the fixed seed
# keeps the selection repeatable.
picker = MaxMinPicker()
picked_gen = picker.LazyPick(distance_function, len(drugbank), 100, seed=666)
# Materialise the picker's result as a tuple (used below as indices into `drugbank`).
picked = tuple(picked_gen)

In [None]:
# Show where the picked items fall in the embedding.
mds_plot(X_mds,highlights=set(picked))

In [None]:
# Draw the 15 picked molecules with the lowest dataset indices.
# Sorting the unique picked indices and indexing `drugbank` directly yields the
# same molecules in the same (dataset) order as scanning every molecule and
# testing `i in picked` against a tuple, without the repeated linear lookups.
Draw.MolsToGridImage([drugbank[i] for i in sorted(set(picked))[:15]])

In [None]:
# SMARTS query matching any atom with atomic number 6 (carbon).
carbon_pat = Chem.MolFromSmarts("[#6]")

# Run the substructure match once per molecule, then partition the indices;
# previously every molecule was matched twice (once for each list).
has_carbon = [mol.HasSubstructMatch(carbon_pat) for mol in drugbank]
carb_idxs = [i for i, matched in enumerate(has_carbon) if matched]
noncarb_idxs = [i for i, matched in enumerate(has_carbon) if not matched]

# Highlight the carbon-free entries in the embedding.
mds_plot(X_mds,highlights=set(noncarb_idxs))

In [None]:
# Draw the first 15 carbon-free molecules.
# `noncarb_idxs` is already in ascending dataset order without repeats, so
# slicing it and indexing `drugbank` directly gives the same molecules as
# scanning every molecule with a list-membership test.
Draw.MolsToGridImage([drugbank[i] for i in noncarb_idxs[:15]])