In [96]:
!pip install -q rdkit

In [97]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans

from rdkit import Chem
from rdkit.Chem.AllChem import GetMorganGenerator

In [98]:
# The input files are read with a semicolon separator (sep=';'). Change it if your files
# use a different one (e.g. a comma, which is the default separator in pd.read_csv()).

data_qikprop = pd.read_csv(r"filepath\qikprop.csv", sep=';')  # path to the .csv file with QikProp output from Schrodinger Maestro Suite
data_cns = pd.read_csv(r"filepath\cns_mpo.csv", sep=';')  # path to the .csv file with CNS MPO output (calculator available at https://github.com/Adam-maz/CNS_MPO_calculator)
data_gscore = pd.read_csv(r"filepath\gscores.csv", sep=';')  # path to the .csv file with Glide gscores from Schrodinger Maestro Suite;
                                                             # make sure the gscores were exported together with the matching SMILES (as a column)

# pd.read_csv() already returns a DataFrame, so the tables are used directly
# instead of being re-wrapped in pd.DataFrame(...).
df_qikprop = data_qikprop
df_cns = data_cns

# Keep only the first row for every 'Id' in the gscore table.
# Derived from the raw frame, so re-running this cell always gives the same result.
df_gscore = data_gscore.drop_duplicates(subset=['Id'], keep='first')

In [99]:
# Join the three tables on their shared identifier column. 'Id' must be the name of
# the molecule-identifier column in every dataframe (pandas' default inner join is used).
merged_df1 = df_cns.merge(df_qikprop, on='Id')

# 'pKa_x' / 'pKa_y' are the suffixed names pandas assigns to a column that exists on both
# sides of a merge - presumably 'pKa' is present in more than one input table (verify
# against your files). Only the '_x' copy is kept, under its plain name.
merged_df_final = (
    merged_df1
    .merge(df_gscore, on='Id')
    .drop(columns=['pKa_y'])
    .rename(columns={'pKa_x': 'pKa'})
)

In [100]:
# No arguments are passed, so RDKit's default Morgan generator settings apply.
fps_generator = GetMorganGenerator()

# Parse every SMILES first. Chem.MolFromSmiles returns None when a string cannot be
# parsed; catching that here lets the failure be reported by compound Id instead of
# surfacing later as an opaque error inside the fingerprint call.
parsed_mols = merged_df_final['Smiles'].apply(
    lambda smiles: Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
)
unparsed_mask = parsed_mols.isna()
if unparsed_mask.any():
    unparsed_ids = merged_df_final.loc[unparsed_mask, 'Id'].tolist()
    raise ValueError(f"Could not parse the SMILES of the following Id(s): {unparsed_ids}")

# One fingerprint object per row, stored next to the rest of the compound data.
merged_df_final['Morgan_fps'] = parsed_mols.apply(lambda mol: fps_generator.GetFingerprint(mol))

# Compound identifiers, in the same row order as the fingerprints.
ids = merged_df_final['Id']

In [101]:
# Expand every fingerprint into a row of 0/1 integers. The column is addressed by name
# rather than by position (iloc[:, -1]), so the result no longer depends on 'Morgan_fps'
# happening to be the last column of the dataframe.
binary_array = np.array([list(map(int, fingerprint)) for fingerprint in merged_df_final['Morgan_fps']])

# Docking score as a single-column 2-D array so it can be stacked with the bit matrix.
gscore_array = merged_df_final['glide gscore'].to_numpy().reshape(-1, 1)

# Feature matrix: first column is the glide gscore, the remaining columns are fingerprint bits.
# NOTE(review): the gscore is one unscaled column next to many binary columns - confirm
# that this relative weighting is what the clustering is meant to use.
final_array = np.hstack((gscore_array, binary_array))

In [102]:
# The clustering is repeated with 3 different seeds to make the assay more robust; the
# intersection of the 3 resulting collections is taken further down.
seeds_list = [13, 42, 71]

# results[seed] maps compound Id -> cluster label obtained with that seed. It is filled
# from seeds_list, so changing the seeds does not require editing a second literal.
results = {}

for seed in seeds_list:
    kmeans = KMeans(n_clusters=4, random_state=seed)  # the value of n_clusters depends on your needs
    # fit_predict is the documented shorthand for fit(X) followed by predict(X).
    y_kmeans = kmeans.fit_predict(final_array)
    # ids and y_kmeans are in the same row order; a repeated Id keeps its last label.
    results[seed] = dict(zip(ids, y_kmeans))

In [106]:
# Placeholder: set this to the 'Id' of your reference compound. It must be one of the
# Ids present in the merged data, otherwise the lookup below raises KeyError.
REFERENT_ID = 'referent_compound'


def _referent_cluster_members(assignments, referent_id):
    """Find the cluster of the reference compound and the compounds sharing it.

    Parameters
    ----------
    assignments : dict
        Mapping of compound Id -> cluster label for one clustering run.
    referent_id : hashable
        Id of the reference compound.

    Returns
    -------
    tuple
        (cluster label of the reference compound, list of Ids with that label).

    Raises
    ------
    KeyError
        If ``referent_id`` is not a key of ``assignments``.
    """
    if referent_id not in assignments:
        raise KeyError(
            f"{referent_id!r} was not found among the clustered Ids; "
            "set REFERENT_ID to an Id that is present in the merged data"
        )
    label = assignments[referent_id]
    members = [cpd_id for cpd_id, cpd_label in assignments.items() if cpd_label == label]
    return label, members


# Same lookup for each of the three seeds used in the clustering cell.
referent_members_by_seed = {}
for seed in (13, 42, 71):
    referent_cluster, referent_members_by_seed[seed] = _referent_cluster_members(results[seed], REFERENT_ID)
    print(f'Referent cluster, seed {seed}: {referent_cluster}')

# Per-seed names are kept because the intersection cell refers to them.
list_with_referent_cluster_cpds_seed_13 = referent_members_by_seed[13]
list_with_referent_cluster_cpds_seed_42 = referent_members_by_seed[42]
list_with_referent_cluster_cpds_seed_71 = referent_members_by_seed[71]

KeyError: 'referent_compound'

In [104]:
# Compounds that share the reference compound's cluster for every seed:
# intersect the first two per-seed collections, then intersect with the third.
shared_seed_13_and_42 = set(list_with_referent_cluster_cpds_seed_13) & set(list_with_referent_cluster_cpds_seed_42)
common_cpds = list(shared_seed_13_and_42 & set(list_with_referent_cluster_cpds_seed_71))

In [105]:
# Keep only the rows of the merged table whose Id is in the consensus set.
df_out = pd.DataFrame(common_cpds, columns=['Id'])
output = pd.merge(merged_df_final, df_out, on='Id', how='inner')

# The 'Morgan_fps' column holds RDKit fingerprint objects, which would be written to a
# text file as opaque object representations rather than their bits. They are serialised
# to bit strings on a copy used only for the export; `output` itself is left unchanged.
output_for_csv = output
if 'Morgan_fps' in output.columns:
    output_for_csv = output.assign(
        Morgan_fps=output['Morgan_fps'].map(
            lambda fp: fp.ToBitString() if hasattr(fp, 'ToBitString') else fp
        )
    )

output_for_csv.to_csv(r"filepath\cluster_analysis_output.csv", sep=';', index=False)