In [1]:
import numpy as np
import pandas as pd
import os

from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors

from oddt import virtualscreening
from oddt.virtualscreening import electroshape

In [49]:
# Lookup table from short target name (key) to a 'CHEMBL…' identifier (value).
# Presumably the keys are DUD-E target names and the values are ChEMBL target
# ids -- confirm against the `dude_name` / `target_chembl_id` columns.
# Insertion order matters: later cells iterate over this dict to build the
# ordered list of class names.
dude_target_dict = dict(
    CAH2='CHEMBL205',
    JAK2='CHEMBL2971',
    ACES='CHEMBL220',
    VGFR2='CHEMBL4142',
    BACE1='CHEMBL4822',
    EGFR='CHEMBL203',
    AA2AR='CHEMBL251',
    FA10='CHEMBL244',
)


In [106]:
file_name = '10_dude_data_plus_rdkit_descriptors.csv'
# NOTE(review): the original comment flagged this location as a "diff path".
# It is resolved relative to the current working directory -- confirm it
# matches where the CSV lives in your checkout.
df = pd.read_csv(os.path.join('data', file_name))

## select columns of interest
### note that for all columns, standard_type='Ki', standard_relation='=', data_validity_comment=NaN, potential_duplicate=0, assay_type='B'
col_meta = ['target_chembl_id', 'dude_name', 'active', 'uniquekey']
# One feature column per RDKit descriptor name.
col_features = [d[0] for d in Descriptors.descList]
columns = col_meta + col_features
fulldata = df[columns]
features = df[col_features].to_numpy()

# Keep only rows whose descriptors are all finite. A single mask drops rows
# containing NaN as well as rows containing +/-inf, and is applied to both
# objects so `fulldata` and `features` stay row-aligned.
finite_rows = np.isfinite(features).all(axis=1)
fulldata = fulldata[finite_rows]
features = features[finite_rows]

# Names of the expected classes: every target once as "active" and once as
# "decoy". Only their count is used below (as the number of clusters).
class_names = np.array(
    [str(x) + 'active' for x in dude_target_dict]
    + [str(x) + 'decoy' for x in dude_target_dict]
)
n_digits = len(class_names)

# Per-sample ground truth for the supervised clustering metrics. These metrics
# need exactly one label per row of `features`; the list of class names alone
# has the wrong length and makes them raise ValueError. Only label identity
# matters to the metrics, so the raw column values are joined as strings
# without assuming how `active` is encoded.
labels = (
    fulldata['dude_name'].astype(str) + '_' + fulldata['active'].astype(str)
).to_numpy(dtype=str)

n_samples, n_features = features.shape
assert labels.shape[0] == n_samples, "labels must be aligned with features"

print(
    f"# digits: {n_digits}; # samples: {n_samples}; # features {n_features}"
)


# digits: 16; # samples: 201318; # features 200


In [101]:
from time import time
from sklearn import metrics
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler


def bench_k_means(kmeans, name, data, labels):
    """Fit a scaled KMeans pipeline on ``data`` and print one row of scores.

    The estimator is placed behind a ``StandardScaler`` in a pipeline, the
    pipeline fit is timed, and a tab-separated line is printed containing the
    strategy name, the fit time, the inertia, five label-based clustering
    scores and a silhouette score.

    Parameters
    ----------
    kmeans : KMeans instance
        A :class:`~sklearn.cluster.KMeans` instance whose initialization has
        already been configured.
    name : str
        Label for the strategy; printed as the first column of the row.
    data : ndarray of shape (n_samples, n_features)
        The samples to cluster.
    labels : ndarray of shape (n_samples,)
        Reference labels, one per sample, consumed by the metrics that
        compare a clustering against known classes.
    """
    started = time()
    pipeline = make_pipeline(StandardScaler(), kmeans).fit(data)
    elapsed = time() - started

    # The last pipeline step is the fitted KMeans estimator.
    fitted_kmeans = pipeline[-1]
    row = [name, elapsed, fitted_kmeans.inertia_]

    # Scores that only need the reference labels and the predicted labels;
    # the order here is the column order of the printed row.
    for score in (
        metrics.homogeneity_score,
        metrics.completeness_score,
        metrics.v_measure_score,
        metrics.adjusted_rand_score,
        metrics.adjusted_mutual_info_score,
    ):
        row.append(score(labels, fitted_kmeans.labels_))

    # The silhouette score additionally needs the samples themselves; it is
    # evaluated on a subsample of 300 points drawn from the unscaled `data`.
    row.append(
        metrics.silhouette_score(
            data,
            fitted_kmeans.labels_,
            metric="euclidean",
            sample_size=300,
        )
    )

    # Emit the row.
    row_template = (
        "{:9s}\t{:.3f}s\t{:.0f}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}"
    )
    print(row_template.format(*row))

In [107]:
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# Print the header of the benchmark table: a rule followed by column titles.
print(
    82 * '_',
    'init\t\ttime\tinertia\thomo\tcompl\tv-meas\tARI\tAMI\tsilhouette',
    sep='\n',
)

# Fit k-means++ directly on the (unscaled) feature matrix, one cluster per
# expected class; `fit` returns the estimator itself.
kmeans = KMeans(
    n_clusters=n_digits,
    init="k-means++",
    n_init=4,
    random_state=0,
).fit(features)

print('_' * 82)
# Cell output: the cluster index assigned to every sample.
kmeans.labels_

__________________________________________________________________________________
init		time	inertia	homo	compl	v-meas	ARI	AMI	silhouette
__________________________________________________________________________________


array([13, 13,  5, ...,  5,  5,  5])

In [None]:
# Benchmark three KMeans initialisation strategies on the same feature matrix.
# `labels` is forwarded as the reference labelling; bench_k_means documents it
# as one entry per sample.

# 1) k-means++ -- reuses the estimator configured in the previous cell; the
#    pipeline inside bench_k_means fits it again on the scaled data.
bench_k_means(kmeans=kmeans, name="k-means++", data=features, labels=labels)

# 2) purely random initial centroids.
kmeans = KMeans(init="random", n_clusters=n_digits, n_init=4, random_state=0)
bench_k_means(kmeans=kmeans, name="random", data=features, labels=labels)

# 3) PCA components as the initial centroids. PCA is fitted on `features`:
#    the previous code referenced an undefined name `data`, which raised
#    NameError. The init is an explicit array, so a single run (n_init=1)
#    is enough.
pca = PCA(n_components=n_digits).fit(features)
kmeans = KMeans(init=pca.components_, n_clusters=n_digits, n_init=1)
bench_k_means(kmeans=kmeans, name="PCA-based", data=features, labels=labels)