# **Consensus model**

For each target, the three top-performing ML models, after selection and retraining, were combined to generate a consensus model. These consensus models underwent internal validation through cross-validation to calculate the MCC (Matthews correlation coefficient) value using Scikit-Learn functions, enabling performance comparison with each individual model.

**Note**: This notebook provides an example of the consensus model construction for the target IAV_Polymerase (PA).

## **1. Prepare the environment**


In [None]:
from IPython.utils import io
with io.capture_output() as captured:
  !pip install pycaret
  !pip install datamol
  !pip install rdkit
import pycaret
import os, os.path, sys, random, subprocess
import datamol as dm
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import rdFingerprintGenerator
from pycaret.classification import *
from rdkit.Chem import Descriptors
from rdkit.Chem import rdMolDescriptors
from sklearn.preprocessing import MinMaxScaler
from sklearn import preprocessing
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import sklearn
from sklearn.metrics import matthews_corrcoef

In [None]:
# Show the installed PyCaret version so the run can be reproduced later.
pycaret.__version__

'3.3.2'

In [None]:
# Mount Google Drive inside the Colab runtime; the test set read in section 1.1
# is stored under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### **1.1. Load and prepare the 'test' data set**

In [None]:
# Location of the held-out test set for IAV_Polymerase (PA) on the mounted Drive.
test_set_csv = "/content/drive/MyDrive/antivirals_machine_learning/Notebooks/model_test_2048/1_IAV_Polymerase (PA)/1_data_sets/test_IAV_Polymerase (PA).csv"

# Read the test set: identifiers, SMILES, target, activity label, descriptors
# and the fingerprint bit columns.
db_predict = pd.read_csv(test_set_csv)
db_predict

Unnamed: 0,molecule_chembl_id,canonical_smiles_std,unique_target,activity,mw,fsp3,n_lipinski_hba,n_lipinski_hbd,n_rotatable_bonds,clogp,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,CHEMBL4452342,CN(Cc1cc(=O)c(O)cn1-c1ccc(-c2nnn[nH]2)cc1)c1cc...,IAV_Polymerase (PA),1.0,408.110151,0.1,8,2,5,3.013,...,0,0,0,0,0,0,0,0,0,0
1,CHEMBL4444029,Cc1cc(-c2noc(=O)[nH]2)ccc1-c1cc(=O)c(O)c(C(=O)...,IAV_Polymerase (PA),1.0,329.064785,0.066667,9,4,3,1.09742,...,0,0,0,0,0,0,0,0,0,0
2,CHEMBL4440338,O=C(O)c1cccc(-c2cc(=O)c(O)c(C(=O)O)[nH]2)c1,IAV_Polymerase (PA),1.0,275.042987,0.0,7,4,3,1.1439,...,0,0,0,0,0,0,0,0,0,0
3,CHEMBL4440123,O=C(O)c1[nH]c(-c2ccc(-c3nnn[nH]3)cc2)cc(=O)c1O,IAV_Polymerase (PA),1.0,299.065454,0.0,9,4,3,0.6258,...,0,0,0,0,0,0,0,0,0,0
4,CHEMBL4555466,Cc1cc(-c2nnn[nH]2)ccc1-c1cc(=O)c(O)c(C(=O)O)[nH]1,IAV_Polymerase (PA),1.0,313.081104,0.071429,9,4,3,0.93422,...,0,0,0,0,0,0,0,0,0,0
5,CHEMBL4552510,Cn1cc(O)c(=O)cc1CO,IAV_Polymerase (PA),1.0,155.058243,0.285714,4,2,1,-0.4168,...,0,0,0,0,0,0,0,0,0,0
6,CHEMBL4551356,O=C(O)c1[nH]c(-c2ccc(-c3nnn[nH]3)cc2C(F)(F)F)c...,IAV_Polymerase (PA),1.0,367.052838,0.071429,9,4,3,1.6446,...,0,0,0,0,0,0,0,0,0,0
7,CHEMBL2070214,O=C(NCCc1ccc(O)c(O)c1)c1cc2cc(O)c(O)cc2[nH]1,IAV_Polymerase (PA),1.0,328.105922,0.117647,7,6,4,1.9628,...,0,0,0,0,0,0,0,0,0,0
8,CHEMBL4438546,O=C(O)c1[nH]c(-c2cccc(Oc3ccccc3)c2)cc(=O)c1O,IAV_Polymerase (PA),1.0,323.079373,0.0,6,3,4,3.238,...,0,0,0,0,0,0,0,0,0,0
9,CHEMBL4537957,CN(Cc1cc(=O)c(O)co1)c1ccc(Cl)cc1,IAV_Polymerase (PA),1.0,265.050571,0.153846,4,1,3,2.6352,...,0,0,0,0,0,0,0,0,0,0


### **1.2. Load Models**

In [None]:
# Saved PyCaret pipelines for the three selected models, in loading order.
# NOTE(review): the names are relative, so the files are looked up from the current
# working directory — confirm they are present there before running.
# NOTE(review): load_model deserialises saved objects; only load files from a trusted source.
saved_model_names = (
    'IAV_Polymerase (PA)_et',
    'IAV_Polymerase (PA)_gbc',
    'IAV_Polymerase (PA)_svm',
)
models = [load_model(saved_name) for saved_name in saved_model_names]

# Individual handles used by the prediction cells in section 2.
iavpoly_et, iavpoly_gbc, iavpoly_svm = models

Transformation Pipeline and Model Successfully Loaded
Transformation Pipeline and Model Successfully Loaded
Transformation Pipeline and Model Successfully Loaded


In [None]:
# Build the model input: drop the SMILES string, the target name and the
# experimental 'activity' label; keep the ChEMBL id and all feature columns.
non_feature_columns = ["canonical_smiles_std", "unique_target", "activity"]
data = db_predict.drop(columns=non_feature_columns)
data

Unnamed: 0,molecule_chembl_id,mw,fsp3,n_lipinski_hba,n_lipinski_hbd,n_rotatable_bonds,clogp,n_aliphatic_carbocycles,n_aliphatic_heterocyles,n_aromatic_carbocycles,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,CHEMBL4452342,408.110151,0.1,8,2,5,3.013,0,0,2,...,0,0,0,0,0,0,0,0,0,0
1,CHEMBL4444029,329.064785,0.066667,9,4,3,1.09742,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,CHEMBL4440338,275.042987,0.0,7,4,3,1.1439,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,CHEMBL4440123,299.065454,0.0,9,4,3,0.6258,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,CHEMBL4555466,313.081104,0.071429,9,4,3,0.93422,0,0,1,...,0,0,0,0,0,0,0,0,0,0
5,CHEMBL4552510,155.058243,0.285714,4,2,1,-0.4168,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,CHEMBL4551356,367.052838,0.071429,9,4,3,1.6446,0,0,1,...,0,0,0,0,0,0,0,0,0,0
7,CHEMBL2070214,328.105922,0.117647,7,6,4,1.9628,0,0,2,...,0,0,0,0,0,0,0,0,0,0
8,CHEMBL4438546,323.079373,0.0,6,3,4,3.238,0,0,2,...,0,0,0,0,0,0,0,0,0,0
9,CHEMBL4537957,265.050571,0.153846,4,1,3,2.6352,0,0,1,...,0,0,0,0,0,0,0,0,0,0


## **2. Predictions for each model**

In [None]:
# Score the test compounds with the 'iavpoly_et' pipeline.
# predict_model appends 'prediction_label' / 'prediction_score'; they are renamed with
# an '_et' suffix so the outputs of the three models can sit side by side.
et_output_names = {'prediction_label': 'label_et', 'prediction_score': 'score_et'}
prediction = predict_model(iavpoly_et, data=data).rename(columns=et_output_names)
prediction.head()

Unnamed: 0,molecule_chembl_id,mw,fsp3,n_lipinski_hba,n_lipinski_hbd,n_rotatable_bonds,clogp,n_aliphatic_carbocycles,n_aliphatic_heterocyles,n_aromatic_carbocycles,...,2040,2041,2042,2043,2044,2045,2046,2047,label_et,score_et
0,CHEMBL4452342,408.110138,0.1,8,2,5,3.013,0,0,2,...,0,0,0,0,0,0,0,0,1,0.84
1,CHEMBL4444029,329.064789,0.066667,9,4,3,1.09742,0,0,1,...,0,0,0,0,0,0,0,0,1,0.84
2,CHEMBL4440338,275.042999,0.0,7,4,3,1.1439,0,0,1,...,0,0,0,0,0,0,0,0,1,0.94
3,CHEMBL4440123,299.06546,0.0,9,4,3,0.6258,0,0,1,...,0,0,0,0,0,0,0,0,1,0.91
4,CHEMBL4555466,313.081116,0.071429,9,4,3,0.93422,0,0,1,...,0,0,0,0,0,0,0,0,1,0.86


In [None]:
# Score the same compounds with the 'iavpoly_gbc' pipeline.
# The input is the frame from the previous step, so the 'label_et' / 'score_et'
# columns are carried along and the new '_gbc' columns are appended after them.
gbc_output_names = {'prediction_label': 'label_gbc', 'prediction_score': 'score_gbc'}
prediction2 = predict_model(iavpoly_gbc, data=prediction).rename(columns=gbc_output_names)
prediction2.head()

Unnamed: 0,molecule_chembl_id,mw,fsp3,n_lipinski_hba,n_lipinski_hbd,n_rotatable_bonds,clogp,n_aliphatic_carbocycles,n_aliphatic_heterocyles,n_aromatic_carbocycles,...,2042,2043,2044,2045,2046,2047,label_et,score_et,label_gbc,score_gbc
0,CHEMBL4452342,408.110138,0.1,8,2,5,3.013,0,0,2,...,0,0,0,0,0,0,1,0.84,1,0.9836
1,CHEMBL4444029,329.064789,0.066667,9,4,3,1.09742,0,0,1,...,0,0,0,0,0,0,1,0.84,1,0.9434
2,CHEMBL4440338,275.042999,0.0,7,4,3,1.1439,0,0,1,...,0,0,0,0,0,0,1,0.94,1,0.9532
3,CHEMBL4440123,299.06546,0.0,9,4,3,0.6258,0,0,1,...,0,0,0,0,0,0,1,0.91,1,0.9298
4,CHEMBL4555466,313.081116,0.071429,9,4,3,0.93422,0,0,1,...,0,0,0,0,0,0,1,0.86,1,0.9434


In [None]:
# Score the same compounds with the 'iavpoly_svm' pipeline, again on top of the
# previous frame so that all three label columns end up in one table.
# In the recorded output this model adds only a label column (no score), so the
# 'prediction_score' entry of the mapping has nothing to rename there.
svm_output_names = {'prediction_label': 'label_svm', 'prediction_score': 'score_svm'}
prediction3 = predict_model(iavpoly_svm, data=prediction2).rename(columns=svm_output_names)
prediction3.head()

Unnamed: 0,molecule_chembl_id,mw,fsp3,n_lipinski_hba,n_lipinski_hbd,n_rotatable_bonds,clogp,n_aliphatic_carbocycles,n_aliphatic_heterocyles,n_aromatic_carbocycles,...,2043,2044,2045,2046,2047,label_et,score_et,label_gbc,score_gbc,label_svm
0,CHEMBL4452342,408.110138,0.1,8,2,5,3.013,0,0,2,...,0,0,0,0,0,1,0.84,1,0.9836,1
1,CHEMBL4444029,329.064789,0.066667,9,4,3,1.09742,0,0,1,...,0,0,0,0,0,1,0.84,1,0.9434,1
2,CHEMBL4440338,275.042999,0.0,7,4,3,1.1439,0,0,1,...,0,0,0,0,0,1,0.94,1,0.9532,1
3,CHEMBL4440123,299.06546,0.0,9,4,3,0.6258,0,0,1,...,0,0,0,0,0,1,0.91,1,0.9298,1
4,CHEMBL4555466,313.081116,0.071429,9,4,3,0.93422,0,0,1,...,0,0,0,0,0,1,0.86,1,0.9434,1


## **3. Consensus**

In [None]:
# Count the votes: 'consensus' is the row-wise sum of the three predicted labels.
# The label columns must be numeric (e.g. 0/1) for the sum to be a vote count.
votes_et = prediction3['label_et']
votes_gbc = prediction3['label_gbc']
votes_svm = prediction3['label_svm']
prediction3['consensus'] = votes_et + votes_gbc + votes_svm
prediction3.head()

Unnamed: 0,molecule_chembl_id,mw,fsp3,n_lipinski_hba,n_lipinski_hbd,n_rotatable_bonds,clogp,n_aliphatic_carbocycles,n_aliphatic_heterocyles,n_aromatic_carbocycles,...,2044,2045,2046,2047,label_et,score_et,label_gbc,score_gbc,label_svm,consensus
0,CHEMBL4452342,408.110138,0.1,8,2,5,3.013,0,0,2,...,0,0,0,0,1,0.84,1,0.9836,1,3
1,CHEMBL4444029,329.064789,0.066667,9,4,3,1.09742,0,0,1,...,0,0,0,0,1,0.84,1,0.9434,1,3
2,CHEMBL4440338,275.042999,0.0,7,4,3,1.1439,0,0,1,...,0,0,0,0,1,0.94,1,0.9532,1,3
3,CHEMBL4440123,299.06546,0.0,9,4,3,0.6258,0,0,1,...,0,0,0,0,1,0.91,1,0.9298,1,3
4,CHEMBL4555466,313.081116,0.071429,9,4,3,0.93422,0,0,1,...,0,0,0,0,1,0.86,1,0.9434,1,3


In [None]:
# Turn the vote count into a binary consensus call:
# 'norm_consensus' is 1 when 'consensus' equals 3 (all three models agree on 1,
# assuming 0/1 labels) and 0 for every other value.
is_unanimous = prediction3['consensus'] == 3
prediction3['norm_consensus'] = is_unanimous.astype(int)
prediction3

Unnamed: 0,molecule_chembl_id,mw,fsp3,n_lipinski_hba,n_lipinski_hbd,n_rotatable_bonds,clogp,n_aliphatic_carbocycles,n_aliphatic_heterocyles,n_aromatic_carbocycles,...,2045,2046,2047,label_et,score_et,label_gbc,score_gbc,label_svm,consensus,norm_consensus
0,CHEMBL4452342,408.110138,0.1,8,2,5,3.013,0,0,2,...,0,0,0,1,0.84,1,0.9836,1,3,1
1,CHEMBL4444029,329.064789,0.066667,9,4,3,1.09742,0,0,1,...,0,0,0,1,0.84,1,0.9434,1,3,1
2,CHEMBL4440338,275.042999,0.0,7,4,3,1.1439,0,0,1,...,0,0,0,1,0.94,1,0.9532,1,3,1
3,CHEMBL4440123,299.06546,0.0,9,4,3,0.6258,0,0,1,...,0,0,0,1,0.91,1,0.9298,1,3,1
4,CHEMBL4555466,313.081116,0.071429,9,4,3,0.93422,0,0,1,...,0,0,0,1,0.86,1,0.9434,1,3,1
5,CHEMBL4552510,155.058243,0.285714,4,2,1,-0.4168,0,0,0,...,0,0,0,1,0.89,1,0.954,1,3,1
6,CHEMBL4551356,367.052826,0.071429,9,4,3,1.6446,0,0,1,...,0,0,0,1,0.88,1,0.9489,1,3,1
7,CHEMBL2070214,328.105927,0.117647,7,6,4,1.9628,0,0,2,...,0,0,0,1,0.6,1,0.7692,1,3,1
8,CHEMBL4438546,323.079376,0.0,6,3,4,3.238,0,0,2,...,0,0,0,1,0.96,1,0.9671,1,3,1
9,CHEMBL4537957,265.050568,0.153846,4,1,3,2.6352,0,0,1,...,0,0,0,1,0.94,1,0.9579,1,3,1


In [None]:
# Attach the model outputs to the labelled test set, matching rows on 'molecule_chembl_id'.
# Only the columns that prediction3 adds (labels, scores, consensus columns) are merged in.
# Merging the whole frame would duplicate every shared descriptor/fingerprint column as
# '<name>_x' / '<name>_y', and the two copies of the float columns are not exactly equal
# in the displayed outputs.
shared_columns = set(db_predict.columns)
added_columns = [col for col in prediction3.columns if col not in shared_columns]
test = pd.merge(
    db_predict,
    prediction3[['molecule_chembl_id'] + added_columns],
    on='molecule_chembl_id',
)
test.head()

Unnamed: 0,molecule_chembl_id,canonical_smiles_std,unique_target,activity,mw_x,fsp3_x,n_lipinski_hba_x,n_lipinski_hbd_x,n_rotatable_bonds_x,clogp_x,...,2045_y,2046_y,2047_y,label_et,score_et,label_gbc,score_gbc,label_svm,consensus,norm_consensus
0,CHEMBL4452342,CN(Cc1cc(=O)c(O)cn1-c1ccc(-c2nnn[nH]2)cc1)c1cc...,IAV_Polymerase (PA),1.0,408.110151,0.1,8,2,5,3.013,...,0,0,0,1,0.84,1,0.9836,1,3,1
1,CHEMBL4444029,Cc1cc(-c2noc(=O)[nH]2)ccc1-c1cc(=O)c(O)c(C(=O)...,IAV_Polymerase (PA),1.0,329.064785,0.066667,9,4,3,1.09742,...,0,0,0,1,0.84,1,0.9434,1,3,1
2,CHEMBL4440338,O=C(O)c1cccc(-c2cc(=O)c(O)c(C(=O)O)[nH]2)c1,IAV_Polymerase (PA),1.0,275.042987,0.0,7,4,3,1.1439,...,0,0,0,1,0.94,1,0.9532,1,3,1
3,CHEMBL4440123,O=C(O)c1[nH]c(-c2ccc(-c3nnn[nH]3)cc2)cc(=O)c1O,IAV_Polymerase (PA),1.0,299.065454,0.0,9,4,3,0.6258,...,0,0,0,1,0.91,1,0.9298,1,3,1
4,CHEMBL4555466,Cc1cc(-c2nnn[nH]2)ccc1-c1cc(=O)c(O)c(C(=O)O)[nH]1,IAV_Polymerase (PA),1.0,313.081104,0.071429,9,4,3,0.93422,...,0,0,0,1,0.86,1,0.9434,1,3,1


In [None]:
# Display the merged table (experimental activity next to every model's prediction).
test

Unnamed: 0,molecule_chembl_id,canonical_smiles_std,unique_target,activity,mw_x,fsp3_x,n_lipinski_hba_x,n_lipinski_hbd_x,n_rotatable_bonds_x,clogp_x,...,2045_y,2046_y,2047_y,label_et,score_et,label_gbc,score_gbc,label_svm,consensus,norm_consensus
0,CHEMBL4452342,CN(Cc1cc(=O)c(O)cn1-c1ccc(-c2nnn[nH]2)cc1)c1cc...,IAV_Polymerase (PA),1.0,408.110151,0.1,8,2,5,3.013,...,0,0,0,1,0.84,1,0.9836,1,3,1
1,CHEMBL4444029,Cc1cc(-c2noc(=O)[nH]2)ccc1-c1cc(=O)c(O)c(C(=O)...,IAV_Polymerase (PA),1.0,329.064785,0.066667,9,4,3,1.09742,...,0,0,0,1,0.84,1,0.9434,1,3,1
2,CHEMBL4440338,O=C(O)c1cccc(-c2cc(=O)c(O)c(C(=O)O)[nH]2)c1,IAV_Polymerase (PA),1.0,275.042987,0.0,7,4,3,1.1439,...,0,0,0,1,0.94,1,0.9532,1,3,1
3,CHEMBL4440123,O=C(O)c1[nH]c(-c2ccc(-c3nnn[nH]3)cc2)cc(=O)c1O,IAV_Polymerase (PA),1.0,299.065454,0.0,9,4,3,0.6258,...,0,0,0,1,0.91,1,0.9298,1,3,1
4,CHEMBL4555466,Cc1cc(-c2nnn[nH]2)ccc1-c1cc(=O)c(O)c(C(=O)O)[nH]1,IAV_Polymerase (PA),1.0,313.081104,0.071429,9,4,3,0.93422,...,0,0,0,1,0.86,1,0.9434,1,3,1
5,CHEMBL4552510,Cn1cc(O)c(=O)cc1CO,IAV_Polymerase (PA),1.0,155.058243,0.285714,4,2,1,-0.4168,...,0,0,0,1,0.89,1,0.954,1,3,1
6,CHEMBL4551356,O=C(O)c1[nH]c(-c2ccc(-c3nnn[nH]3)cc2C(F)(F)F)c...,IAV_Polymerase (PA),1.0,367.052838,0.071429,9,4,3,1.6446,...,0,0,0,1,0.88,1,0.9489,1,3,1
7,CHEMBL2070214,O=C(NCCc1ccc(O)c(O)c1)c1cc2cc(O)c(O)cc2[nH]1,IAV_Polymerase (PA),1.0,328.105922,0.117647,7,6,4,1.9628,...,0,0,0,1,0.6,1,0.7692,1,3,1
8,CHEMBL4438546,O=C(O)c1[nH]c(-c2cccc(Oc3ccccc3)c2)cc(=O)c1O,IAV_Polymerase (PA),1.0,323.079373,0.0,6,3,4,3.238,...,0,0,0,1,0.96,1,0.9671,1,3,1
9,CHEMBL4537957,CN(Cc1cc(=O)c(O)co1)c1ccc(Cl)cc1,IAV_Polymerase (PA),1.0,265.050571,0.153846,4,1,3,2.6352,...,0,0,0,1,0.94,1,0.9579,1,3,1


## **4. Get MCC metric for each model and consensus**

In [None]:
# MCC of the 'et' model: experimental activity vs. its predicted label.
matthews_corrcoef(y_true=test['activity'], y_pred=test['label_et'])

0.7858565465071591

In [None]:
# MCC of the 'gbc' model: experimental activity vs. its predicted label.
matthews_corrcoef(y_true=test['activity'], y_pred=test['label_gbc'])

0.6791621759648142

In [None]:
# MCC of the 'svm' model: experimental activity vs. its predicted label.
matthews_corrcoef(y_true=test['activity'], y_pred=test['label_svm'])

0.7326172182611423

In [None]:
# MCC of the consensus call ('norm_consensus'), for comparison with the three single models.
matthews_corrcoef(y_true=test['activity'], y_pred=test['norm_consensus'])

0.750971515165386