In [1]:
import sys
# Add the parent directory to the import path; presumably that is where the
# project-local modules imported below live -- TODO confirm.
sys.path.append('..') 
# NOTE(review): wildcard imports hide where names come from. Later cells use np, pd,
# Chem, DataStructs and rdFingerprintGenerator without importing them here, so they
# are presumably re-exported by one of these two modules -- TODO confirm and replace
# with explicit imports.
from helper_fun import *

from split_furthest_cluster import *
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import calinski_harabasz_score, davies_bouldin_score, silhouette_score

from time import time
import warnings

# Suppress every warning of category UserWarning for the rest of the session.
warnings.simplefilter('ignore', UserWarning)


2024-07-25 15:49:20.989446: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [6]:

def tanimoto_distance_matrix(fp_list):
    """Build the full symmetric Tanimoto distance matrix for a list of fingerprints.

    Parameters
    ----------
    fp_list : sequence of RDKit bit-vector fingerprints
        Must support ``len`` and slicing (a plain ``list`` does).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n, n)`` where entry ``[i, j]`` is
        ``1 - TanimotoSimilarity(fp_list[i], fp_list[j])``. The diagonal is 0.
        An empty input yields a ``(0, 0)`` array.
    """
    n = len(fp_list)
    distance_matrix = np.zeros((n, n))

    # Row 0 has no predecessors. For every later fingerprint, compare it against
    # all earlier ones in a single bulk call instead of one Python-level call
    # per pair; the resulting values are the same.
    for i in range(1, n):
        similarities = DataStructs.BulkTanimotoSimilarity(fp_list[i], fp_list[:i])
        distances = 1.0 - np.asarray(similarities, dtype=float)
        distance_matrix[i, :i] = distances  # lower triangle
        distance_matrix[:i, i] = distances  # mirrored upper triangle

    return distance_matrix

def assign_cluster_id(df_data, cluster_labels):
    """Return a copy of ``df_data`` with a ``Cluster_ID`` column holding ``cluster_labels``.

    Parameters
    ----------
    df_data : pandas.DataFrame
        One row per clustered item. Any columns are accepted.
    cluster_labels : array-like
        One label per row of ``df_data``, in row order.

    Returns
    -------
    pandas.DataFrame
        A new frame; the caller's ``df_data`` is left untouched, so the call is
        safe on a column slice of another frame and can be re-run.
    """
    print('\nAssign cluster ID')
    # Work on a copy so the caller's frame (possibly a view of a larger table)
    # is not modified as a side effect.
    df_labelled = df_data.copy()
    df_labelled['Cluster_ID'] = cluster_labels
    return df_labelled

#define a function that takes in a list of fingerprints and a cutoff value and returns the equivalent cluster labels
def hierarchical_cluster_fingerprints(table, distance_threshold=0.2, CID_column='CID', SMILES_column='SMILES',
                                      linkage='ward', metric='euclidean'):
    """Cluster the compounds in ``table`` hierarchically and score the clustering.

    Steps: parse every SMILES, compute RDKit path fingerprints (``maxPath=5``),
    build the Tanimoto distance matrix, run agglomerative clustering on that
    matrix, then compute clustering-quality metrics.

    Parameters
    ----------
    table : pandas.DataFrame
        Must contain the columns named by ``CID_column`` and ``SMILES_column``.
    distance_threshold : float, default 0.2
        Passed to ``AgglomerativeClustering``: the linkage distance at or above
        which clusters are not merged.
    CID_column, SMILES_column : str
        Names of the compound-identifier and SMILES columns.
    linkage, metric : str
        Passed to ``AgglomerativeClustering``. The defaults (``'ward'``,
        ``'euclidean'``) keep the historical behaviour, in which each *row* of
        the Tanimoto distance matrix is treated as a feature vector; under that
        setting ``distance_threshold`` is NOT a Tanimoto-distance cutoff.
        To cluster directly on Tanimoto distances pass ``metric='precomputed'``
        together with ``linkage`` in ``{'average', 'complete', 'single'}``.

    Returns
    -------
    df_metrics : pandas.DataFrame
        One row with columns ``Time``, ``Silhouette``, ``CH score``, ``DB score``.
        ``Time`` covers fingerprinting, the distance matrix and clustering,
        not the metric computation.
    df_clusters : pandas.DataFrame
        Frame built from the fingerprints with an added ``Cluster_ID`` column;
        rows are in the same order as ``table``.

    Raises
    ------
    ValueError
        If one or more SMILES strings cannot be parsed by RDKit.
    """
    t0 = time()

    # Parse SMILES, remembering which compounds fail so the error is actionable
    # (MolFromSmiles returns None on failure instead of raising).
    molecules = []
    unparsable_ids = []
    for compound_id, smiles in table[[CID_column, SMILES_column]].itertuples(index=False, name=None):
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            unparsable_ids.append(compound_id)
        molecules.append(mol)
    if unparsable_ids:
        raise ValueError(
            f"Could not parse SMILES for {len(unparsable_ids)} compound(s); "
            f"first offending {CID_column} values: {unparsable_ids[:10]}"
        )

    # Generate fingerprints
    rdkit_gen = rdFingerprintGenerator.GetRDKitFPGenerator(maxPath=5)
    fingerprints = [rdkit_gen.GetFingerprint(mol) for mol in molecules]

    # Calculate the Tanimoto distance matrix
    distance_matrix = tanimoto_distance_matrix(fingerprints)

    # Perform agglomerative clustering; n_clusters=None lets distance_threshold
    # decide how many clusters come out.
    cluster = AgglomerativeClustering(metric=metric, linkage=linkage,
                                      distance_threshold=distance_threshold, n_clusters=None)
    cluster.fit(distance_matrix)

    tf = time() - t0

    # Assign cluster ID
    # NOTE(review): the returned frame carries the fingerprints, not the
    # CID/SMILES columns -- kept as-is so existing consumers see the same shape.
    df_clusters = assign_cluster_id(pd.DataFrame(fingerprints), cluster.labels_)

    # Metrics. Silhouette uses the same metric as the clustering, so
    # 'precomputed' scores on the Tanimoto distances themselves.
    s1 = silhouette_score(distance_matrix, cluster.labels_, metric=metric)
    # CH and DB scores have no precomputed-distance mode; they always treat the
    # rows of the distance matrix as feature vectors.
    c1 = calinski_harabasz_score(distance_matrix, cluster.labels_)
    d1 = davies_bouldin_score(distance_matrix, cluster.labels_)
    df_metrics = pd.DataFrame(data=[[tf, s1, c1, d1]],
                              columns=['Time', 'Silhouette', 'CH score', 'DB score'])
    return df_metrics, df_clusters


In [8]:
# Load the filtered compound table; the clustering function reads its
# "CID" and "SMILES" columns by default.
compounds = pd.read_csv("../COVID_MOONSHOT/compounds_filtered.csv")

# Fingerprinting, clustering and scoring all happen inside
# hierarchical_cluster_fingerprints. It returns a one-row metrics frame and a
# frame of fingerprints with their Cluster_ID.
df_metrics, df_clusters = hierarchical_cluster_fingerprints(compounds)
print(df_metrics)
print(df_clusters.head())




Assign cluster ID
       Time  Silhouette      CH score  DB score
0  1.630398    0.444689  11845.507041  0.059614
                                                   0  Cluster_ID
0  [1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...         389
1  [1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...         346
2  [1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...         421
3  [1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...         404
4  [1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, ...          42
