In [1]:
import pandas as pd
import mtx.standardization as stand
import mtx.inference as inference
import mtx.evaluation as evaluation

# Load data

Load data on which the models should be applied. \
As an example, we load here the mitochondrial toxicity dataset provided in the supporting information. Due to storage limits, the data is not uploaded to the repository and should be downloaded directly from the supporting information.\
The Tox21 test and score sets will be used here as the test set.

In [2]:
# Location of the dataset spreadsheet, relative to the notebook's working directory.
dataset_path = 'data/mitotox_dataset.xlsx'

# Read the spreadsheet into a DataFrame and preview its first rows.
data = pd.read_excel(dataset_path)
data.head()

Unnamed: 0,canonical_smiles,MTX_overall,hits_overall,reports_overall,sources_overall,MTX_membrane_potential,hits_membrane_potential,reports_membrane_potential,sources_membrane_potential,MTX_respiratory_chain,...,cddd_502,cddd_503,cddd_504,cddd_505,cddd_506,cddd_507,cddd_508,cddd_509,cddd_510,cddd_511
0,Br,0.0,0.0,1.0,['tox21'],0.0,0.0,1.0,['tox21'],,...,-0.016279,-0.009312,0.03078,0.336856,0.304795,-0.327309,-0.072511,0.184097,0.143712,-0.06772
1,BrC(Br)Br,0.0,0.0,1.0,['tox21'],0.0,0.0,1.0,['tox21'],,...,0.220828,0.383945,0.323335,0.148923,0.981125,-0.516073,0.143063,-0.152397,0.71108,-0.437574
2,BrC(Br)C(Br)(Br)Br,0.0,0.0,1.0,['tox21'],0.0,0.0,1.0,['tox21'],,...,0.207495,-0.019389,0.030406,0.37029,0.953231,-0.316175,0.266123,0.120671,0.512732,-0.280449
3,BrC(Br)C(Br)Br,0.0,0.0,1.0,['tox21'],0.0,0.0,1.0,['tox21'],,...,0.234367,0.210632,0.178046,0.367158,0.981063,-0.423447,0.265576,-0.152512,0.652301,-0.348469
4,BrCBr,0.0,0.0,1.0,['tox21'],0.0,0.0,1.0,['tox21'],,...,0.031353,0.108507,-0.112769,0.439568,0.855695,-0.456629,0.120328,-0.234823,0.479496,-0.110281


# Standardize molecules

Prepare the structures following the same procedure as for the training compounds.

Note: the provided dataset already has standardized SMILES. This is an example of how to do it for other datasets. \
Therefore, we first need to rename the "canonical_smiles" column to avoid it having the same name as the output column with the standardized SMILES.

In [7]:
# Move the input SMILES to a "smiles" column first, so the name
# "canonical_smiles" is free for the standardized output column,
# then run the structure preparation on the renamed frame.
data = stand.prepare_structures(
    data.rename(columns={'canonical_smiles': 'smiles'}),
    smiles_column='smiles',
)
data.head()

RDKit ERROR: [10:27:13] Can't kekulize mol.  Unkekulized atoms: 0 2 3 4 6 7 12 13 16
[10:27:13] Can't kekulize mol.  Unkekulized atoms: 0 2 3 4 6 7 12 13 16

RDKit ERROR: 
RDKit ERROR: [10:27:13] Can't kekulize mol.  Unkekulized atoms: 0 2 3 4 6 7 12 13 16
RDKit ERROR: 
[10:27:13] Can't kekulize mol.  Unkekulized atoms: 0 2 3 4 6 7 12 13 16

RDKit ERROR: [10:27:13] Can't kekulize mol.  Unkekulized atoms: 0 2 3 4 6 7 12 13 16
RDKit ERROR: 
[10:27:13] Can't kekulize mol.  Unkekulized atoms: 0 2 3 4 6 7 12 13 16

RDKit ERROR: [10:27:13] Can't kekulize mol.  Unkekulized atoms: 0 2 3 4 6 7 12 13 16
RDKit ERROR: 
[10:27:13] Can't kekulize mol.  Unkekulized atoms: 0 2 3 4 6 7 12 13 16

RDKit ERROR: [10:27:13] Can't kekulize mol.  Unkekulized atoms: 0 2 3 4 6 7 12 13 16
RDKit ERROR: 
[10:27:13] Can't kekulize mol.  Unkekulized atoms: 0 2 3 4 6 7 12 13 16

RDKit ERROR: [10:27:13] Can't kekulize mol.  Unkekulized atoms: 0 2 3 4 6 7 12 13 16
RDKit ERROR: 
[10:27:13] Can't kekulize mol.  Unkekuliz

Unnamed: 0,MTX_overall,hits_overall,reports_overall,sources_overall,MTX_membrane_potential,hits_membrane_potential,reports_membrane_potential,sources_membrane_potential,MTX_respiratory_chain,hits_respiratory_chain,...,cddd_503,cddd_504,cddd_505,cddd_506,cddd_507,cddd_508,cddd_509,cddd_510,cddd_511,canonical_smiles
0,0.0,0.0,1.0,['tox21'],0.0,0.0,1.0,['tox21'],,,...,-0.009312,0.03078,0.336856,0.304795,-0.327309,-0.072511,0.184097,0.143712,-0.06772,Br
1,0.0,0.0,1.0,['tox21'],0.0,0.0,1.0,['tox21'],,,...,0.383945,0.323335,0.148923,0.981125,-0.516073,0.143063,-0.152397,0.71108,-0.437574,BrC(Br)Br
2,0.0,0.0,1.0,['tox21'],0.0,0.0,1.0,['tox21'],,,...,-0.019389,0.030406,0.37029,0.953231,-0.316175,0.266123,0.120671,0.512732,-0.280449,BrC(Br)C(Br)(Br)Br
3,0.0,0.0,1.0,['tox21'],0.0,0.0,1.0,['tox21'],,,...,0.210632,0.178046,0.367158,0.981063,-0.423447,0.265576,-0.152512,0.652301,-0.348469,BrC(Br)C(Br)Br
4,0.0,0.0,1.0,['tox21'],0.0,0.0,1.0,['tox21'],,,...,0.108507,-0.112769,0.439568,0.855695,-0.456629,0.120328,-0.234823,0.479496,-0.110281,BrCBr


# Make predictions with the mechanistic models on the external test set

To make predictions on a test set, we need to specify the location of the models, as well as the Xtest (numpy array), containing the CDDDs used as input for the model. \
(The ytest and smiles variables are only needed for the evaluation of the predictions. If they are not present in the dataset, you can pass get_y=False and get_smiles=False to the get_X_y_smiles function.)

In [8]:
# Maps each endpoint (task) name to the file holding its trained model.
MODEL_PATHS = {
    # Stored as ".pt" model-state files.
    'overall': 'models/model_state_overall_nn_cddd.pt',
    'membrane_potential': 'models/model_state_membrane_potential_nn_cddd.pt',
    # Stored as ".pkl" files.
    'respiratory_chain': 'models/model_respiratory_chain_rf_cddd.pkl',
    'function_mitochondria': 'models/model_function_mitochondria_rf_cddd.pkl',
}

In [9]:
# Label column that selects the test compounds for this example.
test_column = 'MTX_tox21_test_score'

# Extract the model inputs, the labels and the SMILES for that task.
Xtest, ytest, smiles = inference.get_X_y_smiles(
    data,
    task=test_column,
    smiles_column='canonical_smiles',
)

# Preview the first five input rows.
Xtest[:5]

array([[-0.65323353, -0.07553566, -0.45833933, ..., -0.5730719 ,
         0.35890555, -0.3588241 ],
       [-0.43379283, -0.56074095, -0.19669497, ..., -0.29020661,
         0.74949884,  0.37097219],
       [-0.64642137, -0.56313127, -0.03349761, ..., -0.12487506,
         0.77700698,  0.52673244],
       [ 0.71724141,  0.09754813, -0.76377118, ..., -0.21773422,
         0.92585152,  0.37115654],
       [ 0.56464869, -0.09431393, -0.49272192, ...,  0.1225606 ,
         0.82380748,  0.08901246]])

In [11]:
# Endpoints to predict, in the order their previews are displayed.
prediction_tasks = (
    'overall',
    'membrane_potential',
    'respiratory_chain',
    'function_mitochondria',
)

# Collect the predictions of each endpoint's model, keyed by task name.
predictions = {}
for task in prediction_tasks:
    task_predictions = inference.predict(Xtest, MODEL_PATHS[task], task)
    predictions[task] = task_predictions
    # Show only the first five values to keep the output short.
    display(task_predictions[:5])

array([8.0110137e-13, 5.4996843e-07, 2.3691435e-10, 2.8727090e-04,
       3.7967056e-04], dtype=float32)

array([4.0854725e-13, 8.1891576e-06, 9.7424004e-07, 6.1026776e-06,
       2.2201550e-05], dtype=float32)

array([0.08202099, 0.29996331, 0.06756278, 0.12196771, 0.39895193])

array([0.07746218, 0.45802594, 0.06000812, 0.20961048, 0.60845948])

# Evaluate predictions from overall and membrane potential models

For illustration purposes, we made predictions with the models for all four endpoints. However, only the 'overall' and 'membrane potential' models should be able to predict this test set, as the other two endpoints are related to different mechanisms of action.\
Therefore, we evaluate the models on these two endpoints.

In [12]:
# Only these two endpoints are evaluated against the test labels.
evaluated_tasks = ('overall', 'membrane_potential')

for task in evaluated_tasks:
    # Print the task name as a heading for its metrics table.
    print(task)
    results = evaluation.evaluate_predictions(ytest, predictions[task])
    # Display the metrics rounded to two decimals; `results` itself stays unrounded.
    display(results.round(2))
    # Blank line to separate consecutive tasks in the output.
    print()

overall


Unnamed: 0,AUC,MCC,F1 score,Precision,Recall,Specificity
0,0.91,0.48,0.54,0.45,0.68,0.88



membrane_potential


Unnamed: 0,AUC,MCC,F1 score,Precision,Recall,Specificity
0,0.93,0.57,0.62,0.59,0.66,0.94



