In [1]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from scipy.spatial import distance
# Cell
class BaseDomain:
    """Minimal base class for applicability-domain estimators."""

    def calculate_applicability_domain(self):
        """Placeholder hook; the base implementation does nothing and returns None."""
        return None

# Cell
class kNNDomain(BaseDomain):
    """Applicability domain (AD) estimator based on nearest-neighbour distances.

    At construction time a distance threshold is derived from the pairwise
    distances between the reference fingerprints. A query is flagged as inside
    the AD when the distance to its k-th nearest reference fingerprint does not
    exceed that threshold.

    Attributes
    -----------------------------------------------------------------------------

    Xref : numpy.array
        Reference fingerprints, one row per compound.

    metric : str or callable
        Distance metric forwarded to `scipy.spatial.distance.cdist`
        (e.g. 'euclidean', 'jaccard', 'dice', 'cosine', ...).

    ad_threshold : float
        Distance threshold returned by `calculate_ad_threhold`.

    """

    def __init__(self, Xref:np.array, metric='euclidean'):
        """Store the reference fingerprints and compute the AD threshold.

        Arguments
        -----------------------------------------------------------------------------

        Xref : numpy.array
            Reference fingerprints

        metric : str or callable, optional
            The distance metric to use. Any value accepted by the `metric`
            argument of `scipy.spatial.distance.cdist`.

        """
        self.Xref = Xref
        self.metric = metric
        self.ad_threshold = self.calculate_ad_threhold(metric=metric)

    @property
    def ad_threshold(self):
        """Distance threshold that delimits the applicability domain."""
        return self._ad_threshold

    @ad_threshold.setter
    def ad_threshold(self, v):
        self._ad_threshold = v

    def calculate_similarity_from_array(self, fp1:np.array, fp2:np.array=None, metric:str=None, z=0.9):
        """Calculates the pairwise distance matrix between two arrays of fingerprints

        Arguments
        -----------------------------------------------------------------------------

        fp1 : numpy.array
            An array of fingerprints (one row per fingerprint).

        fp2 : numpy.array
            Second array of fingerprints. If None, defaults to `fp1`.

        metric : str or callable, optional
            The distance metric to use (see `scipy.spatial.distance.cdist`).
            If None, the metric given at construction (`self.metric`) is used.

        z : float
            Unused by this method; kept so existing callers keep working.


        Returns
        -----------------------------------------------------------------------------

        simi_matrix : numpy.array
            Pairwise distances with singleton dimensions squeezed out, i.e. a
            1-D array when either input has a single row.

        """
        # cdist rejects metric=None, so fall back to the instance metric.
        metric = self.metric if metric is None else metric
        other = fp1 if fp2 is None else fp2
        return distance.cdist(fp1, other, metric=metric).squeeze()

    def calculate_ad_threhold(self, X:np.array=None,  metric:str=None, z=0.9):
        """Calculates the applicability domain threshold from pairwise distances

        The threshold is ``mean(d) - z * std(d)`` where ``d`` holds every entry
        of the pairwise distance matrix of `X` (self-distances included).

        NOTE(review): the referenced publication uses ``mean + z * std``; the
        sign used here is inherited from the previous implementation and is
        kept unchanged so that existing results do not shift - confirm intent.

        Arguments
        -----------------------------------------------------------------------------

        X : numpy.array
            An array of fingerprints. If None, defaults to `self.Xref`.

        metric : str or callable, optional
            The distance metric to use (see `scipy.spatial.distance.cdist`).
            If None, the metric given at construction (`self.metric`) is used.

        z : float
            Significance threshold. See original publication for more details: https://pubs.acs.org/doi/10.1021/ci060132x


        Returns
        -----------------------------------------------------------------------------

        ad_threshold : float
            The distance threshold used to define the applicability domain.

        """
        X = self.Xref if X is None else X
        metric = self.metric if metric is None else metric
        simi_matrix = self.calculate_similarity_from_array(X, metric=metric)

        std_distances = np.std(simi_matrix)   # std of distances
        avg_distances = np.mean(simi_matrix)  # average of distances

        return avg_distances - (z * std_distances)

    def get_knn(self, fp:np.array, ref_fp:np.array=None, k:int=10):
        """Compute query-to-reference distances and rank the references per query.

        Arguments
        -----------------------------------------------------------------------------
        fp : numpy.array
            Query fingerprints (one row per query). A single 1-D fingerprint is
            treated as one query.

        ref_fp : numpy.array
            Reference fingerprints. If None, defaults to `self.Xref`.

        k : int
            Accepted for API compatibility; the full ranking is returned and
            callers slice the first `k` columns themselves.


        Returns
        -----------------------------------------------------------------------------
        distances : numpy.array
            Matrix of shape (len(fp), len(ref_fp)) with pairwise distances.

        neighbours : numpy.array
            Same shape as `distances`; each row lists reference indices sorted
            by increasing distance to the query.

        """
        ref_fp = self.Xref if ref_fp is None else ref_fp
        fp = np.atleast_2d(fp)
        # The similarity helper squeezes singleton axes; restore the 2-D shape.
        distances = self.calculate_similarity_from_array(
            fp, ref_fp, metric=self.metric
        ).reshape(len(fp), len(ref_fp))
        neighbours = np.argsort(distances, axis=-1)
        return distances, neighbours

    def calculate_applicability_domain(self, fp:np.array, ref_fp:np.array=None, k:int=10):
        """Decide for each query whether it falls inside the applicability domain.

        Arguments
        -----------------------------------------------------------------------------

        fp : numpy.array
            Query fingerprints

        ref_fp : numpy.array
            Reference fingerprints. If None, defaults to `self.Xref`.

        k : int
            Rank of the neighbour whose distance is compared with the threshold.
            If `k` exceeds the number of references, the farthest reference is used.

        Returns
        -----------------------------------------------------------------------------

        kth_distance : numpy.array
            Distance from each query to its k-th nearest reference fingerprint.

        in_domain : numpy.array of bool
            True where `kth_distance <= self.ad_threshold`.

        Raises
        -----------------------------------------------------------------------------

        ValueError
            If `k` is smaller than 1.

        """
        if k < 1:
            raise ValueError("k must be >= 1.")
        ref_fp = self.Xref if ref_fp is None else ref_fp
        distances, neighbours = self.get_knn(fp, ref_fp, k=k)
        # Column k-1 of the sorted distances is the k-th nearest neighbour.
        kth_distance = np.take_along_axis(distances, neighbours[:, :k], 1)[:, -1]
        return (kth_distance, kth_distance <= self.ad_threshold)


In [2]:
# Read the two CSV inputs: `df_train` supplies the 'cano_smiles' column and
# `df_test` the 'smiles' column used for fingerprint generation below.
train_csv, test_csv = './train_set_descriptor_random.csv', './logPapp_external_set.csv'
df_train = pd.read_csv(train_csv)
df_test = pd.read_csv(test_csv)

In [3]:
def _morgan_bit_matrix(smiles_list, radius=2, n_bits=1024):
    """Convert SMILES strings into an array of Morgan bit-vector fingerprints.

    Rows are never dropped, so the output stays aligned with the input order.

    Raises
    ------
    ValueError
        If a SMILES string cannot be parsed (RDKit returns None for it).
    """
    fingerprints = []
    for position, smiles in enumerate(smiles_list):
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # Fail early with the offending entry instead of an opaque RDKit error.
            raise ValueError(
                'Could not parse SMILES at position {}: {!r}'.format(position, smiles)
            )
        fingerprints.append(
            AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
        )
    return np.array(fingerprints)


Xtrain = _morgan_bit_matrix(df_train['cano_smiles'].to_list())
Xsample = _morgan_bit_matrix(df_test['smiles'].to_list())

In [4]:
# Build the applicability domain from the training fingerprints (default metric).
ad_domain = kNNDomain(Xref=Xtrain)
# Despite its name, `avg_distance` is the 2-tuple returned by
# calculate_applicability_domain: (distances, boolean in-domain mask).
avg_distance = ad_domain.calculate_applicability_domain(fp=Xsample, k=6)

In [5]:
# Table that is filtered with the applicability-domain mask in the next cell.
predictions_csv = './external_ml_models_combined_morganbit_rdkit2d_pred.csv'
df = pd.read_csv(predictions_csv)

In [6]:
# Keep only the rows whose boolean in-domain flag (second tuple element) is True.
_, inside_domain = avg_distance
df = df.loc[inside_domain]

In [7]:
# Display the rows that remain after the applicability-domain filter.
df

Unnamed: 0,XGB_fp,SVM_fp,GB_fp,RF_fp,XGB_rdkit2d,SVM_rdkit2d,GB_rdkit2d,RF_rdkit2d,XGB_combined,SVM_combined,GB_combined,RF_combined,ground_truth,CombinedNet,DMPNN,ADMETlab2,ADMETlab3
0,-5.643911,-5.961470,-5.483297,-5.538459,-6.046221,-5.898155,-6.283740,-5.811782,-6.051782,-6.142560,-5.732677,-5.881886,-4.452225,-5.658178,-6.070380,-5.484,-5.246999
1,-4.886450,-5.075859,-4.960182,-5.077284,-5.079074,-5.023228,-4.783831,-5.026886,-5.060615,-4.800673,-5.177343,-4.970772,-4.040959,-4.853692,-4.805622,-5.051,-4.981073
2,-4.976178,-5.074016,-4.835692,-5.037226,-4.904591,-5.303246,-4.719715,-4.850136,-5.279327,-5.102093,-5.616375,-4.855914,-4.795880,-5.287908,-5.190197,-4.865,-5.153186
3,-4.751775,-4.777076,-4.647408,-4.848243,-4.554111,-5.084939,-4.462949,-4.496145,-4.583272,-4.689909,-4.637891,-4.499963,-4.403403,-4.624857,-5.191446,-4.759,-4.802940
4,-6.015895,-5.779423,-6.181006,-5.652338,-5.540268,-5.542982,-5.384678,-5.646651,-5.809102,-6.185668,-5.298511,-5.611605,-6.221849,-6.113716,-5.984838,-5.558,-5.394623
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
265,-5.826764,-5.378002,-5.836599,-5.869440,-5.537980,-5.498439,-5.598923,-5.601021,-5.692111,-5.241509,-5.625866,-5.592824,-5.968822,-5.490297,-5.645577,-5.316,-5.456930
267,-5.138891,-5.343050,-5.588239,-4.849472,-4.663667,-5.137931,-4.931800,-4.941493,-4.775093,-5.193229,-5.458596,-4.871698,-4.662868,-5.011135,-5.166664,-4.808,-4.833522
268,-5.157030,-5.593155,-4.901261,-5.119306,-5.908757,-5.624952,-5.672353,-5.672681,-5.440757,-5.688264,-5.494064,-5.640029,-4.736830,-5.302046,-5.351270,-5.189,-5.234597
269,-5.235996,-5.057087,-5.142637,-5.245434,-4.784398,-5.308624,-4.934321,-4.917437,-4.742096,-4.811804,-4.811503,-4.859504,-4.700732,-4.978535,-5.206163,-5.194,-5.023351


In [8]:
# Work on an independent (deep) copy so later edits do not touch `df`.
final_df = df.copy(deep=True)

In [9]:
from scipy.stats import spearmanr, pearsonr
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [10]:
# Empty results table: one row per evaluation metric, one column per model
# (display names). Cells are filled in by the metric loop further down.
metric_names = ['r2_score', 'MAE', 'RMSE', 'spearmanr', 'pearsonr']
model_names = [
    'XGB_fp', 'SVM_fp', 'GB_fp', 'RF_fp',
    'XGB_rdkit2d', 'SVM_rdkit2d', 'GB_rdkit2d', 'RF_rdkit2d',
    'XGB_combined', 'SVM_combined', 'GB_combined', 'RF_combined',
    'CombinedNet', 'DMPNN', 'ADMETlab 2.0', 'ADMETlab 3.0',
]
metrics_df = pd.DataFrame(index=metric_names, columns=model_names)

In [11]:
def _rmse(y_true, y_pred):
    """Root of the mean squared error between `y_true` and `y_pred`."""
    return np.sqrt(mean_squared_error(y_true, y_pred))


# Metric name -> callable taking (y_true, y_pred). The two correlation
# functions return a result whose first element is the coefficient.
metrics_functions = dict(
    r2_score=r2_score,
    MAE=mean_absolute_error,
    RMSE=_rmse,
    spearmanr=spearmanr,
    pearsonr=pearsonr,
)

In [12]:
# Prediction column in `final_df` -> column in `metrics_df`. The names are the
# same except for the two ADMETlab entries, which use display names.
prediction_to_report_column = {
    'XGB_fp': 'XGB_fp',
    'SVM_fp': 'SVM_fp',
    'GB_fp': 'GB_fp',
    'RF_fp': 'RF_fp',
    'XGB_rdkit2d': 'XGB_rdkit2d',
    'SVM_rdkit2d': 'SVM_rdkit2d',
    'GB_rdkit2d': 'GB_rdkit2d',
    'RF_rdkit2d': 'RF_rdkit2d',
    'XGB_combined': 'XGB_combined',
    'SVM_combined': 'SVM_combined',
    'GB_combined': 'GB_combined',
    'RF_combined': 'RF_combined',
    'CombinedNet': 'CombinedNet',
    'DMPNN': 'DMPNN',
    'ADMETlab2': 'ADMETlab 2.0',
    'ADMETlab3': 'ADMETlab 3.0',
}

# These metrics return (coefficient, p-value); only the coefficient is stored.
correlation_metrics = ('spearmanr', 'pearsonr')

y_true = final_df['ground_truth']
for metric_name, metric_function in metrics_functions.items():
    for prediction_column, report_column in prediction_to_report_column.items():
        score = metric_function(y_true, final_df[prediction_column])
        if metric_name in correlation_metrics:
            score = score[0]
        metrics_df.loc[metric_name, report_column] = score

In [13]:
# Show the results transposed: one row per model, one column per metric.
metrics_df.T

Unnamed: 0,r2_score,MAE,RMSE,spearmanr,pearsonr
XGB_fp,0.031645,0.5831,0.751037,0.244245,0.288507
SVM_fp,-0.076154,0.638008,0.791737,0.201158,0.214406
GB_fp,-0.064993,0.604724,0.787621,0.192076,0.22945
RF_fp,0.047307,0.569503,0.744938,0.25193,0.25901
XGB_rdkit2d,0.031803,0.598312,0.750975,0.236291,0.305281
SVM_rdkit2d,0.027968,0.636426,0.752461,0.282943,0.310243
GB_rdkit2d,-0.014733,0.606663,0.768811,0.22546,0.29702
RF_rdkit2d,0.074564,0.593158,0.734204,0.284961,0.321225
XGB_combined,0.069034,0.578107,0.736395,0.308202,0.345806
SVM_combined,-0.005377,0.604113,0.765259,0.284844,0.317639
