Code supporting the article. 
Because the chemical structures of the compounds are not provided, only the morphological-profile-based classifier can be run with the supplied data.

In [7]:
import pandas as pd
import numpy as np

from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, Draw, MACCSkeys, rdFMCS, MolStandardize
from rdkit.Chem.MolStandardize import rdMolStandardize
from rdkit.ML.Cluster import Butina

from sklearn.model_selection import StratifiedKFold, StratifiedGroupKFold
from sklearn.metrics import recall_score, balanced_accuracy_score, matthews_corrcoef, accuracy_score, precision_score
from sklearn.neighbors import KNeighborsClassifier

# getting acute toxicity label

In [None]:
# Load the CSV containing the consensus profiles, the acute toxicity label, and the Butina clusters.
cellPainting_consensusMorphologicalProfile_acuteToxicity = pd.read_csv('cellPainting_consensusMorphologicalProfile_acuteToxicity.csv')
# Keep only the first column of this second CSV, as an array of values.
# NOTE(review): presumably the names of the selected morphological features — confirm against the file.
CaravaggioDalitFeatSeat01 = pd.read_csv('../../AcuteTox/Article/CaravaggioDalitFeatSeat01.csv').iloc[:, 0].values

# chemical structure based classifier

In [None]:
# Display the structure table.
# NOTE(review): smiles_df is not defined in any visible cell; it must be loaded before this cell runs.
smiles_df

## cleaning smiles

In [None]:
def standardize(smiles):
    """Return a standardized, non-isomeric SMILES for ``smiles``.

    The molecule is cleaned up, reduced to its parent fragment, neutralized
    and converted to its canonical tautomer before being written back as
    SMILES without stereo information.

    Parameters
    ----------
    smiles : str
        Input SMILES string.

    Returns
    -------
    str or None
        The standardized SMILES, or ``None`` when ``smiles`` is not a string
        (e.g. a missing value) or cannot be parsed by RDKit. Returning
        ``None`` lets the function be used with ``Series.apply`` on columns
        that contain missing or invalid entries; such rows can be filtered
        afterwards with ``notna()``.
    """
    # Missing values (None / NaN) are not strings and cannot be parsed.
    if not isinstance(smiles, str):
        return None

    mol = Chem.MolFromSmiles(smiles)
    # MolFromSmiles signals a parsing failure by returning None.
    if mol is None:
        return None

    # removeHs, disconnect metal atoms, normalize the molecule, reionize the molecule
    clean_mol = rdMolStandardize.Cleanup(mol)
    # if many fragments, get the "parent" (the actual mol we are interested in)
    parent_clean_mol = rdMolStandardize.FragmentParent(clean_mol)
    # try to neutralize the molecule (no convenience function exists, hence the object)
    uncharger = rdMolStandardize.Uncharger()
    uncharged_parent_clean_mol = uncharger.uncharge(parent_clean_mol)
    # pick the canonical tautomer (same remark: an enumerator object is required)
    te = rdMolStandardize.TautomerEnumerator()
    taut_uncharged_parent_clean_mol = te.Canonicalize(uncharged_parent_clean_mol)
    return Chem.MolToSmiles(taut_uncharged_parent_clean_mol, isomericSmiles=False)

## morgan fingerprint

In [None]:
def createChemicalSpace (smiles_df, smilesColumn):
    """Append Morgan fingerprint bit columns to ``smiles_df``.

    Unique, non-missing SMILES from ``smilesColumn`` are parsed with RDKit;
    those that fail to parse are printed and skipped. A 1024-bit Morgan
    fingerprint (radius 2) is computed for each valid molecule, and the bits
    are left-merged back onto ``smiles_df`` using ``smilesColumn`` as key, so
    rows without a valid SMILES get missing values in the bit columns.

    Parameters
    ----------
    smiles_df : pandas.DataFrame
        Table holding the SMILES column.
    smilesColumn : str
        Name of the column of ``smiles_df`` that contains the SMILES.

    Returns
    -------
    pandas.DataFrame
        ``smiles_df`` with one additional column per fingerprint bit.
    """
    # unique, non-missing SMILES: this series is the key of the fingerprint table
    unique_smiles = smiles_df.loc[:, smilesColumn].drop_duplicates()
    unique_smiles = unique_smiles[unique_smiles.notna()]
    print('Number of unique compounds in data:', len(unique_smiles.values))

    # parse every SMILES; RDKit yields None when parsing fails
    parsed = [Chem.MolFromSmiles(smi) for smi in unique_smiles.values]
    is_valid = [mol is not None for mol in parsed]

    # report the SMILES that could not be converted
    print ("Error with those smiles: ")
    print(unique_smiles[~np.array(is_valid)])

    # keep only the entries for which a molecule was created
    unique_smiles = unique_smiles[is_valid]
    mols = [mol for mol, ok in zip(parsed, is_valid) if ok]

    # Morgan fingerprints as bit vectors
    fps = [AllChem.GetMorganFingerprintAsBitVect(mol, 2, 1024) for mol in mols]

    # turn each bit string ("0101...") into a list of 0/1 integers
    bit_rows = []
    for fp in fps:
        ascii_codes = np.frombuffer(fp.ToBitString().encode(), 'u1')
        bit_rows.append((ascii_codes - ord('0')).tolist())
    fps_bits = pd.DataFrame(bit_rows)

    print('Number of molecules OK in data:', len(mols))
    print('Number of Fingerprints in data:', len(fps_bits))

    # side-by-side: SMILES key + its bits, then join back onto the input table
    fingerprint_table = pd.concat(
        [unique_smiles.reset_index(drop=True), fps_bits.reset_index(drop=True)],
        axis=1,
        sort=False,
    )
    return pd.merge(smiles_df, fingerprint_table, on=smilesColumn, how='left')

## Butina clusters


In [None]:
def computeButinaClusters(smiles_list, cutoff = 0.7):
    """Cluster compounds with the Butina algorithm on Morgan fingerprints.

    Parameters
    ----------
    smiles_list : pandas.Series or sequence of str
        SMILES of the compounds to cluster. When a Series is given, its index
        is reused for the returned table.
    cutoff : float, default 0.7
        Distance threshold passed to ``Butina.ClusterData``; distances are
        ``1 - Tanimoto similarity`` between 1024-bit Morgan fingerprints
        (radius 2).

    Returns
    -------
    pandas.DataFrame
        One row per input SMILES with a single column ``'cluster'`` holding
        the position of the compound's cluster in the ``ClusterData`` output.

    Notes
    -----
    Every SMILES must be parsable by RDKit: an invalid SMILES yields ``None``
    from ``MolFromSmiles`` and the fingerprint computation will then fail.
    """
    # Keep the caller's index when there is one, otherwise use positions.
    if isinstance(smiles_list, pd.Series):
        index = smiles_list.index
    else:
        smiles_list = list(smiles_list)
        index = pd.RangeIndex(len(smiles_list))

    mols = [Chem.MolFromSmiles(smiles) for smiles in smiles_list]
    fps = [AllChem.GetMorganFingerprintAsBitVect(mol, 2, 1024) for mol in mols]

    # Condensed lower-triangular distance matrix expected by Butina.ClusterData:
    # for each fingerprint, the distances to all previous fingerprints.
    dists = []
    nfps = len(fps)
    for i in range(1, nfps):
        sims = DataStructs.BulkTanimotoSimilarity(fps[i], fps[:i])
        dists.extend(1 - sim for sim in sims)

    cluster_sets = Butina.ClusterData(dists,
                                      nfps,
                                      cutoff,
                                      isDistData=True,
                                      reordering=True)

    butinaCluster = pd.DataFrame(index=index, columns=['cluster'])
    cluster_col = butinaCluster.columns.get_loc('cluster')
    # Each cluster is a tuple of positional indices into the input; write the
    # cluster id with a single positional assignment (no chained indexing, which
    # may silently write to a temporary copy).
    for cl, members in enumerate(cluster_sets):
        butinaCluster.iloc[list(members), cluster_col] = cl
    return butinaCluster

## classifiers

### known chemistry case

In [None]:
# Repeated stratified k-fold cross-validation ("known chemistry" case):
# nb_cross_cross_validation repetitions of an nb_fold-fold split, each
# repetition shuffled with its own seed (SEED, SEED + 1, ...).
nb_cross_cross_validation = 10
nb_fold = 10
SEED = 24
special_res_df = pd.DataFrame([])
testres_df = pd.DataFrame([])
split = 0

# NOTE(review): x is the whole smiles_df, i.e. it still contains the Tox60 label
# (and any non-fingerprint column). Presumably only the fingerprint bit columns
# should be given to the Jaccard KNN — confirm and subset before running.
x = smiles_df
y = smiles_df.Tox60

# 1-nearest-neighbour classifier using the Jaccard distance
model = KNeighborsClassifier(n_neighbors=1, weights = 'uniform', metric = 'jaccard', n_jobs = -1)

# one [BA, MCC, SN, SP, PR] row per evaluated fold; the table is built once at
# the end instead of being re-concatenated at every fold
fold_metrics = []

for CrossValIteration in range(nb_cross_cross_validation):

    skf = StratifiedKFold(n_splits=nb_fold, shuffle = True, random_state = SEED + CrossValIteration)

    # only the labels are needed to build a stratified split
    for train, test in skf.split(y, y):
        xTrain = x.iloc[train, :]
        xTest = x.iloc[test, :]
        yTrain = y.iloc[train]
        yTest = y.iloc[test]

        model.fit(xTrain, yTrain)

        yPredicted = model.predict(xTest)

        # metrics
        BA = balanced_accuracy_score(yTest, yPredicted)
        MCC = matthews_corrcoef(yTest, yPredicted)
        SN = recall_score(yTest, yPredicted)
        # binary case: BA = (SN + SP) / 2, hence SP = 2 * BA - SN
        SP = 2*BA - SN
        PR = precision_score(yTest, yPredicted)

        fold_metrics.append([BA, MCC, SN, SP, PR])

results_df = pd.DataFrame(fold_metrics, columns = ['BA', 'MCC', 'SN', 'SP', 'PR'])

# the original printed an undefined name (`seeed`); the base seed is what is meant
print (SEED)
for metric_name in ['BA', 'MCC', 'SN', 'SP']:
    print(metric_name + ': ' + str(results_df[metric_name].mean()) + ' +/- ' + str(results_df[metric_name].std()))

### new chemistry case

In [None]:
# Repeated stratified *group* k-fold cross-validation ("new chemistry" case):
# compounds of the same Butina cluster are never split between the training
# and the test set of a fold.
nb_cross_cross_validation = 10
nb_fold = 10
SEED = 24
special_res_df = pd.DataFrame([])
testres_df = pd.DataFrame([])

# NOTE(review): x is the whole smiles_df, i.e. it still contains the Tox60 label,
# the cluster id and any non-fingerprint column. Presumably only the fingerprint
# bit columns should be given to the Jaccard KNN — confirm and subset before running.
x = smiles_df
y = smiles_df.Tox60
butinaCluster = smiles_df.cluster

# 1-nearest-neighbour classifier using the Jaccard distance
model = KNeighborsClassifier(n_neighbors=1, weights = 'uniform', metric = 'jaccard', n_jobs = -1)

# one [BA, MCC, SN, SP, PR] row per evaluated fold; the table is built once at
# the end instead of being re-concatenated at every fold
fold_metrics = []

for CrossValIteration in range(nb_cross_cross_validation):

    sgkf = StratifiedGroupKFold(n_splits=nb_fold, shuffle = True, random_state = SEED + CrossValIteration)

    for train, test in sgkf.split(x, y, butinaCluster):
        xTrain = x.iloc[train, :]
        xTest = x.iloc[test, :]
        yTrain = y.iloc[train]
        yTest = y.iloc[test]

        model.fit(xTrain, yTrain)

        yPredicted = model.predict(xTest)

        # metrics
        BA = balanced_accuracy_score(yTest, yPredicted)
        MCC = matthews_corrcoef(yTest, yPredicted)
        SN = recall_score(yTest, yPredicted)
        # binary case: BA = (SN + SP) / 2, hence SP = 2 * BA - SN
        SP = 2*BA - SN
        PR = precision_score(yTest, yPredicted)

        fold_metrics.append([BA, MCC, SN, SP, PR])

results_df = pd.DataFrame(fold_metrics, columns = ['BA', 'MCC', 'SN', 'SP', 'PR'])

# the original printed an undefined name (`seeed`); the base seed is what is meant
print (SEED)

# Folds whose MCC is exactly 0 are left out of the summary.
# NOTE(review): this changes the reported mean/std — confirm it is intended.
results_df = results_df.query('MCC != 0')
for metric_name in ['BA', 'MCC', 'SN', 'SP']:
    print(metric_name + ': ' + str(results_df[metric_name].mean()) + ' +/- ' + str(results_df[metric_name].std()))

In [None]:

# chemical structure based classifier
## cleaning smiles
## morgan fingerprint
## Butina clusters
## classifiers
### known chemistry case
### new chemistry case

# morphological profiles based classifier
## feature selection
## grit score calculation
## classifiers
### known chemistry case
### new chemistry case

# Decision support model
### known chemistry case
### new chemistry case

# statistical test

# Cleaning smiles

In [None]:
# Standardize every SMILES of the table into a new 'standardized_smiles' column.
# NOTE(review): AROS_woIsomeres is not defined in any visible cell.
AROS_woIsomeres.loc[:, 'standardized_smiles'] = AROS_woIsomeres.smiles.apply(standardize)
# Append the Morgan fingerprint bit columns, keyed on the standardized SMILES.
AROS_woIsomeres_MorganFP = createChemicalSpace(AROS_woIsomeres, 'standardized_smiles' )
# Convert the fingerprint bits to boolean.
# NOTE(review): assumes the bit columns start at position 4 — confirm against the table layout.
# NOTE(review): rows without a valid SMILES get NaN bits from the left merge, and NaN casts to True — verify.
AROS_woIsomeres_MorganFP.iloc[:, 4:] = AROS_woIsomeres_MorganFP.iloc[:, 4:].astype(bool)

In [None]:
# Random split
# Random split
# NOTE(review): `cp` is not imported in any visible cell; it must provide runModels,
# StratifiedKFoldSplitter and ValStratifiedShuffleSplitter.
x = AROS_woIsomeres_MorganFP
y = AROS_woIsomeres_MorganFP.Tox60

# models to evaluate, keyed by a short name
modelDict = {'knn1': KNeighborsClassifier(n_neighbors=1, weights = 'uniform', metric = 'jaccard', n_jobs = -1)}

# feature sets to evaluate: here all columns from position 4 onwards
featureSetDict = {'MorganFP': AROS_woIsomeres_MorganFP.iloc[:, 4:].columns,
                    }

# feature-selection strategies: 'no' returns all columns of its first argument unchanged
featureSelection = {
                   'no': [(lambda a,b,c: [('', a.columns)] ), [] ]
                          }

# NOTE(review): the exact meaning of the arguments depends on cp.runModels, which is not visible here.
res = cp.runModels (x, y, featureSetDict, featureSelection, modelDict, NbCrossVal = 10, NbSplit = 10, SEED = 77, 
                 splitterFunction = cp.StratifiedKFoldSplitter, fingerprint_df =x.smiles, testing_set_split = 0, TestsplitterFunction = cp.ValStratifiedShuffleSplitter)

# average the first result table per configuration and show the 15 best by MCC
res[0].groupby(['model', 'featureSet', 'featureSelection', 'param'], as_index = False).mean().nlargest(15, 'MCC')

# cell painting classifier

In [2]:
# getting the raw profiles
# feature selection
# grit score calculation
# KNN

In [123]:
# Load the consensus morphological profiles together with the Tox60 label and the cluster column.
cellPainting_consensusMorphologicalProfile_acuteToxicity = pd.read_csv('cellPainting_consensusMorphologicalProfile_acuteToxicity.csv')
# Keep only the first column of this second CSV, as an array of values.
# NOTE(review): presumably the names of the selected morphological features — confirm against the file.
CaravaggioDalitFeatSeat01 = pd.read_csv('../../AcuteTox/Article/CaravaggioDalitFeatSeat01.csv').iloc[:, 0].values

In [124]:
# Display the loaded table (one row per compound and concentration).
cellPainting_consensusMorphologicalProfile_acuteToxicity

Unnamed: 0,Metadata_compound,Tox60,Metadata_concentration,Metadata_Count_Cells,Image_Granularity_10_AGP,Image_Granularity_10_DNA,Image_Granularity_10_ER,Image_Granularity_10_Mito,Image_Granularity_10_RNA,Image_Granularity_11_AGP,...,Nuclei_Texture_Variance_RNA_3_00_256,Nuclei_Texture_Variance_RNA_3_01_256,Nuclei_Texture_Variance_RNA_3_02_256,Nuclei_Texture_Variance_RNA_3_03_256,Nuclei_Texture_Variance_RNA_5_00_256,Nuclei_Texture_Variance_RNA_5_01_256,Nuclei_Texture_Variance_RNA_5_02_256,Nuclei_Texture_Variance_RNA_5_03_256,Cells_Cell_count,cluster
0,compound0,1.0,10.0,2295.5,-0.126248,0.498970,-0.155914,0.288963,-0.371613,1.283388,...,-0.060080,-0.065525,-0.052419,-0.059508,-0.062345,-0.047489,-0.046447,-0.069454,0.350456,16
1,compound0,1.0,31.6,2021.5,-0.208889,0.891883,0.853915,1.252502,0.951382,0.158212,...,0.041744,0.023693,0.022097,0.013678,0.019589,0.015937,0.034240,0.033334,-0.683276,16
2,compound0,1.0,100.0,1185.5,0.727887,5.224783,0.143396,0.529805,0.802005,1.131148,...,2.359023,2.370961,2.415749,2.371750,2.356397,2.365917,2.350114,2.337130,-3.470821,16
3,compound1,1.0,10.0,2451.5,-0.623006,-0.078494,-0.149774,-0.374737,-0.188433,-0.831162,...,0.030852,0.024664,0.047980,0.018150,0.037326,-0.003767,0.032979,0.051484,0.430804,90
4,compound1,1.0,31.6,2462.0,-0.605625,0.189449,-0.206134,-0.364609,0.015612,-0.627725,...,-0.030544,-0.044909,-0.043572,-0.039351,-0.026216,-0.048011,-0.047769,-0.044999,0.404298,90
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
673,compound224,1.0,31.6,2100.5,-0.508788,-0.870722,-0.371408,-0.610486,-0.411046,0.322197,...,1.184140,1.217768,1.194241,1.231501,1.185741,1.230391,1.210178,1.185785,-0.182494,0
674,compound224,1.0,100.0,2206.5,0.170549,-0.351740,-1.029492,-0.674743,-0.649602,0.719616,...,0.979733,0.980235,0.971507,0.981230,0.982114,1.001524,0.967793,0.971249,0.296325,0
675,compound225,1.0,10.0,2354.0,-0.052680,-0.105934,-0.403630,-0.395604,0.006304,-0.263931,...,1.015859,1.021132,1.017929,1.021267,1.038391,1.048786,1.032464,1.023689,0.056523,0
676,compound225,1.0,31.6,2141.0,0.191005,-0.895850,-1.058931,-1.249518,-0.323105,0.102419,...,0.551998,0.568468,0.548810,0.564210,0.572278,0.557854,0.573641,0.565157,-0.107756,0


### Known chemistry case

In [132]:
# Repeated stratified k-fold cross-validation ("known chemistry" case) of the
# Cell Painting KNN: nb_cross_cross_validation repetitions of an nb_fold-fold
# split, each repetition shuffled with its own seed (SEED, SEED + 1, ...).
nb_cross_cross_validation = 10
nb_fold = 10
SEED = 24
special_res_df = pd.DataFrame([])
testres_df = pd.DataFrame([])
split = 0

# only the profiles measured at concentration 31.6 are used (filtered once)
profiles_31_6 = cellPainting_consensusMorphologicalProfile_acuteToxicity.query('Metadata_concentration == 31.6')
x = profiles_31_6.loc[:, CaravaggioDalitFeatSeat01]
y = profiles_31_6.loc[:, 'Tox60']

# 1-nearest-neighbour classifier using the correlation distance
model = KNeighborsClassifier(n_neighbors=1, weights = 'uniform', metric = 'correlation', n_jobs = -1)

# one [BA, MCC, SN, SP, PR] row per evaluated fold; the table is built once at
# the end instead of being re-concatenated at every fold
fold_metrics = []

for CrossValIteration in range(nb_cross_cross_validation):

    skf = StratifiedKFold(n_splits=nb_fold, shuffle = True, random_state = SEED + CrossValIteration)

    # only the labels are needed to build a stratified split
    for train, test in skf.split(y, y):
        xTrain = x.iloc[train, :]
        xTest = x.iloc[test, :]
        yTrain = y.iloc[train]
        yTest = y.iloc[test]

        model.fit(xTrain, yTrain)

        yPredicted = model.predict(xTest)

        # metrics
        BA = balanced_accuracy_score(yTest, yPredicted)
        MCC = matthews_corrcoef(yTest, yPredicted)
        SN = recall_score(yTest, yPredicted)
        # binary case: BA = (SN + SP) / 2, hence SP = 2 * BA - SN
        SP = 2*BA - SN
        PR = precision_score(yTest, yPredicted)

        fold_metrics.append([BA, MCC, SN, SP, PR])

results_df = pd.DataFrame(fold_metrics, columns = ['BA', 'MCC', 'SN', 'SP', 'PR'])

# the original printed an undefined name (`seeed`); the base seed is what is meant
print (SEED)
for metric_name in ['BA', 'MCC', 'SN', 'SP']:
    print(metric_name + ': ' + str(results_df[metric_name].mean()) + ' +/- ' + str(results_df[metric_name].std()))

24
BA: 0.7420075757575758 +/- 0.08219668758476652
MCC: 0.49452000130881457 +/- 0.1660776675408247
SN: 0.701818181818182 +/- 0.1253295180880479
SP: 0.7821969696969697 +/- 0.12226630184344665


In [130]:
# Repeated stratified *group* k-fold cross-validation ("new chemistry" case) of
# the Cell Painting KNN: compounds of the same Butina cluster are never split
# between the training and the test set of a fold.
nb_cross_cross_validation = 10
nb_fold = 10
SEED = 24
special_res_df = pd.DataFrame([])
testres_df = pd.DataFrame([])

# only the profiles measured at concentration 31.6 are used (filtered once)
profiles_31_6 = cellPainting_consensusMorphologicalProfile_acuteToxicity.query('Metadata_concentration == 31.6')
x = profiles_31_6.loc[:, CaravaggioDalitFeatSeat01]
y = profiles_31_6.loc[:, 'Tox60']
butinaCluster = profiles_31_6.loc[:, 'cluster']

# 1-nearest-neighbour classifier using the correlation distance
model = KNeighborsClassifier(n_neighbors=1, weights = 'uniform', metric = 'correlation', n_jobs = -1)

# one [BA, MCC, SN, SP, PR] row per evaluated fold; the table is built once at
# the end instead of being re-concatenated at every fold
fold_metrics = []

for CrossValIteration in range(nb_cross_cross_validation):

    sgkf = StratifiedGroupKFold(n_splits=nb_fold, shuffle = True, random_state = SEED + CrossValIteration)

    for train, test in sgkf.split(x, y, butinaCluster):
        xTrain = x.iloc[train, :]
        xTest = x.iloc[test, :]
        yTrain = y.iloc[train]
        yTest = y.iloc[test]

        model.fit(xTrain, yTrain)

        yPredicted = model.predict(xTest)

        # metrics
        BA = balanced_accuracy_score(yTest, yPredicted)
        MCC = matthews_corrcoef(yTest, yPredicted)
        SN = recall_score(yTest, yPredicted)
        # binary case: BA = (SN + SP) / 2, hence SP = 2 * BA - SN
        SP = 2*BA - SN
        PR = precision_score(yTest, yPredicted)

        fold_metrics.append([BA, MCC, SN, SP, PR])

results_df = pd.DataFrame(fold_metrics, columns = ['BA', 'MCC', 'SN', 'SP', 'PR'])

# the original printed an undefined name (`seeed`); the base seed is what is meant
print (SEED)

# Folds whose MCC is exactly 0 are left out of the summary.
# NOTE(review): this changes the reported mean/std — confirm it is intended.
results_df = results_df.query('MCC != 0')
for metric_name in ['BA', 'MCC', 'SN', 'SP']:
    print(metric_name + ': ' + str(results_df[metric_name].mean()) + ' +/- ' + str(results_df[metric_name].std()))

24
BA: 0.7193065207832812 +/- 0.11883594111574444
MCC: 0.41765812443150213 +/- 0.22523895833792934
SN: 0.6633066363931305 +/- 0.21878613520945045
SP: 0.7753064051734322 +/- 0.14730494511774617


In [45]:
# Mean MCC over the folds whose MCC is not exactly 0.
results_df.query('MCC != 0').MCC.mean()

0.410922487147136

In [None]:
# source https://scikit-learn.org/stable/auto_examples/model_selection/plot_grid_search_stats.html
import numpy as np
from scipy.stats import t


def corrected_std(differences, n_train, n_test):
    """Return the Nadeau-Bengio corrected standard deviation of score differences.

    The sample variance of ``differences`` (``ddof=1``) is scaled by
    ``1 / len(differences) + n_test / n_train`` before taking the square root.

    Parameters
    ----------
    differences : ndarray of shape (n_samples,)
        Differences in the score metric of two models, one value per
        model evaluation (fold x repetition).
    n_train : int
        Number of samples in the training set.
    n_test : int
        Number of samples in the testing set.

    Returns
    -------
    corrected_std : float
        Variance-corrected standard deviation of the set of differences.
    """
    # number of evaluations = folds x repetitions of the cross-validation
    n_evaluations = len(differences)
    sample_variance = np.var(differences, ddof=1)
    correction = 1 / n_evaluations + n_test / n_train
    return np.sqrt(sample_variance * correction)


def compute_corrected_ttest(differences, df, n_train, n_test):
    """Run a right-tailed paired t-test using the corrected standard deviation.

    The statistic is the mean of ``differences`` divided by
    ``corrected_std(differences, n_train, n_test)``; the p-value is the
    survival function of Student's t distribution evaluated at the absolute
    value of that statistic.

    Parameters
    ----------
    differences : array-like of shape (n_samples,)
        Differences in the score metric of two models.
    df : int
        Degrees of freedom.
    n_train : int
        Number of samples in the training set.
    n_test : int
        Number of samples in the testing set.

    Returns
    -------
    t_stat : float
        Variance-corrected t-statistic.
    p_val : float
        Variance-corrected p-value.
    """
    t_stat = np.mean(differences) / corrected_std(differences, n_train, n_test)
    # survival function (1 - cdf) gives the right-tail probability
    p_val = t.sf(np.abs(t_stat), df)
    return t_stat, p_val

# Decision support model

In [None]:
from sklearn.base import BaseEstimator, ClassifierMixin, TransformerMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
class DecisionSupportModel(ClassifierMixin, BaseEstimator):
    """Combine a Cell Painting KNN and a fingerprint KNN with an arbiter classifier.

    Two 1-nearest-neighbour classifiers are trained on the same samples: one on
    the morphological features listed in ``CPfeatureSet`` (correlation
    distance), one on the columns ``X.iloc[:, -1030:-6]`` (Jaccard distance).
    When both agree, their common prediction is returned. When they disagree,
    ``clf`` decides from the four values
    ``['d_CP', 'CP_pred', 'd_MFP', 'MFP_pred']`` (nearest-neighbour distance
    and prediction of each KNN).

    Parameters
    ----------
    clf : estimator
        Classifier exposing ``fit`` and ``predict``, used only on the samples
        for which the two KNN disagree.

    Notes
    -----
    * ``X`` must be a DataFrame that contains a ``Tox60`` column, the columns
      named in the module-level ``CaravaggioDalitFeatSeat01`` and, at positions
      ``-1030:-6``, the 1024 columns given to the Jaccard KNN
      (NOTE(review): presumably the Morgan fingerprint bits — confirm the layout).
    * ``fit`` and ``predict`` add the mean fingerprint distance they observe to
      the module-level accumulators ``mean_dist_training`` and
      ``mean_dist_testing``; an accumulator that does not exist yet starts at 0.
    """

    def __init__(self, clf):
        # the two KNN classifiers
        self.KNN_CP31 = KNeighborsClassifier(n_neighbors=1, weights = 'uniform', metric = 'correlation', n_jobs = -1)
        self.KNN_MFP = KNeighborsClassifier(n_neighbors=1, weights = 'uniform', metric = 'jaccard', n_jobs = -1)
        self.clf = clf
        self.CPfeatureSet = CaravaggioDalitFeatSeat01

    def fit(self, X, y):
        """Fit both KNN, then fit ``clf`` on the training samples where they disagree.

        Parameters
        ----------
        X : pandas.DataFrame
            Training table (see the class notes for the expected columns).
        y : array-like
            Training labels, aligned with the rows of ``X``.

        Returns
        -------
        self
        """
        # local import: `random` is not imported at file level
        import random

        self.X_ = X
        self.y_ = y

        # training of the two KNNs
        self.KNN_CP31 = self.KNN_CP31.fit(self.X_[self.CPfeatureSet], self.y_)
        self.KNN_MFP = self.KNN_MFP.fit(self.X_.iloc[:, -1030:-6], self.y_)

        # kneighbors() without argument returns, for every training sample, its
        # nearest *other* training sample; each search is run once and reused
        dist_CP, idx_CP = self.KNN_CP31.kneighbors()
        dist_MFP, idx_MFP = self.KNN_MFP.kneighbors()

        # one row per training sample: distance to / label of the nearest
        # neighbour for both KNN, plus the true label
        KNN_res = pd.DataFrame([dist_CP.ravel(),
        self.X_.iloc[idx_CP.ravel()].Tox60.values,
        dist_MFP.ravel(),
        self.X_.iloc[idx_MFP.ravel()].Tox60.values, y]).T
        KNN_res.columns = ['d_CP', 'CP_pred', 'd_MFP', 'MFP_pred', 'Tox60']
        # subset when predictions of the two KNN do not agree
        KNN_res = KNN_res.query('CP_pred != MFP_pred')

        # diagnostic accumulator shared across fits (starts at 0 when undefined)
        global mean_dist_training
        mean_dist_training = globals().get('mean_dist_training', 0) + KNN_res.d_MFP.mean()

        # synthetic data: copies of the disagreeing samples for which the Cell
        # Painting KNN was right, with the fingerprint distance replaced by a
        # random value in [0.7, 0.9]; explicit copy so the assignment below
        # never targets a view of KNN_res
        KNN_res_more = KNN_res.query('CP_pred == Tox60').copy()
        random.seed(1)
        KNN_res_more.loc[:, 'd_MFP'] = [random.uniform(0.7, 0.9) for _ in range(KNN_res_more.shape[0])]
        # we add them to the training set
        KNN_res = pd.concat([KNN_res, KNN_res_more])

        # we fit the decision support classifier: all columns but the last are
        # the inputs, the last one (Tox60) is the target
        self.clf = self.clf.fit(KNN_res.iloc[:, :-1], KNN_res.iloc[:, -1])

        return self

    def predict(self, X):
        """Predict labels for ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Table with the same column layout as the one given to ``fit``.

        Returns
        -------
        numpy.ndarray
            One prediction per row of ``X``: the Cell Painting KNN prediction
            when both KNN agree, the prediction of ``clf`` otherwise.
        """
        # Check if fit has been called
        check_is_fitted(self, ['X_', 'y_'])

        cp_features = X[self.CPfeatureSet]
        mfp_features = X.iloc[:, -1030:-6]

        # df with predictions and distances of the nearest neighbors for the two KNN
        KNN_res_test = pd.DataFrame([self.KNN_CP31.kneighbors(cp_features)[0].ravel(),
        self.KNN_CP31.predict(cp_features),
        self.KNN_MFP.kneighbors(mfp_features)[0].ravel(),
        self.KNN_MFP.predict(mfp_features)]).T
        KNN_res_test.columns = ['d_CP', 'CP_pred', 'd_MFP', 'MFP_pred']

        # the prediction column defaults to the Cell Painting KNN prediction
        KNN_res_test.loc[:, 'pred'] = KNN_res_test.loc[:, 'CP_pred'].values

        # when the two KNN do not have the same prediction, the decision
        # support classifier decides
        disagree = KNN_res_test.CP_pred != KNN_res_test.MFP_pred
        if disagree.any():
            KNN_res_test.loc[disagree, 'pred'] = self.clf.predict(KNN_res_test.loc[disagree, ['d_CP', 'CP_pred', 'd_MFP', 'MFP_pred']])

        # diagnostic accumulator shared across predictions (starts at 0 when undefined)
        global mean_dist_testing
        mean_dist_testing = globals().get('mean_dist_testing', 0) + KNN_res_test.d_MFP.mean()

        return KNN_res_test.loc[:, 'pred'].values