# **Retraining models**


The selected model architectures were retrained on the complete data set for each target to enhance the Matthews Correlation Coefficient (MCC) and improve model generalization. This process reused the same hyperparameters as the initial models and focused on predicting the final antiviral activity class for the virtual screening (VS) data set. Cross-validation was conducted to evaluate retraining effectiveness, with MCC values serving as the primary performance metric.

**Note:** This notebook provides an example of the retraining models phase for the target IAV_Polymerase (PA).

## **1. Prepare the environment**



In [None]:
from IPython.utils import io
# Install the third-party packages used by this notebook. capture_output()
# keeps pip's console output out of the notebook; it is held in `captured`.
# NOTE(review): no versions are pinned, so a later re-run may resolve different
# releases than the ones that produced the stored outputs — consider pinning.
with io.capture_output() as captured:
  !pip install pycaret
  !pip install datamol
  !pip install rdkit
  !pip install deepchem
import pycaret
import os, os.path, sys, random, subprocess
import datamol as dm
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import rdFingerprintGenerator
from pycaret.classification import *
from rdkit.Chem import Descriptors
from rdkit.Chem import rdMolDescriptors
from sklearn.preprocessing import MinMaxScaler
from sklearn import preprocessing
import seaborn as sns
import matplotlib.pyplot as plt
import deepchem as dc
from imblearn.over_sampling import *
from sklearn.metrics import balanced_accuracy_score

Instructions for updating:
experimental_relax_shapes is deprecated, use reduce_retracing instead


### **1.1. Load and prepare the data set**

In [None]:
# Read the IAV Polymerase (PA) molecule table from the /content directory.
IAV_Poly = pd.read_csv(
    filepath_or_buffer="/content/IAV_Polymerase (PA)_molecules.csv",
)

In [None]:
# Remove the identifier, SMILES and target-name columns; the "activity" label
# and the descriptor / fingerprint columns are kept.
# NOTE: this cell is not idempotent — running it a second time raises KeyError
# because the columns are already gone (re-run the CSV load first).
columns_to_remove = ["molecule_chembl_id", "canonical_smiles_std", "unique_target"]
IAV_Poly = IAV_Poly.drop(columns=columns_to_remove)

In [None]:
# Preview the prepared table (pandas abbreviates the rows and columns shown).
IAV_Poly

Unnamed: 0,activity,mw,fsp3,n_lipinski_hba,n_lipinski_hbd,n_rotatable_bonds,clogp,n_aliphatic_carbocycles,n_aliphatic_heterocyles,n_aromatic_carbocycles,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,1.0,413.139386,0.304348,5,2,7,4.2605,0,1,2,...,0,0,0,0,0,0,0,0,0,0
1,1.0,419.186336,0.565217,5,2,7,4.6406,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2,1.0,413.139386,0.304348,5,2,7,4.2605,0,1,2,...,0,0,0,0,0,0,0,0,0,0
3,1.0,238.131742,0.583333,5,1,3,1.7714,0,1,0,...,0,1,0,0,0,0,0,0,0,0
4,1.0,408.110151,0.100000,8,2,5,3.0130,0,0,2,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
251,1.0,571.122478,0.296296,10,0,4,3.3912,0,3,2,...,0,0,0,0,0,0,0,0,0,0
252,0.0,291.029835,0.071429,5,1,4,3.3571,0,0,2,...,0,0,0,0,0,0,0,0,0,0
253,1.0,373.116152,0.263158,8,4,4,1.3489,0,1,2,...,0,0,0,0,0,0,0,0,0,0
254,1.0,389.058260,0.000000,8,3,1,1.9252,0,3,2,...,0,0,0,0,0,0,1,0,0,0


### **1.2. Load Models**

In [None]:
# Load the three previously saved pipelines (SVM, Extra Trees, Gradient
# Boosting), in that order, so their hyperparameters can be inspected below.
IAV_Poly_model_svm, IAV_Poly_model_et, IAV_Poly_model_gbc = map(
    load_model,
    (
        'IAV_Polymerase (PA)_svm',
        'IAV_Polymerase (PA)_et',
        'IAV_Polymerase (PA)_gbc',
    ),
)

Transformation Pipeline and Model Successfully Loaded
Transformation Pipeline and Model Successfully Loaded
Transformation Pipeline and Model Successfully Loaded


In [None]:
# Show the parameters of the loaded SVM pipeline: the preprocessing steps and
# the final 'actual_estimator' step whose hyperparameters are reused in section 2.1.
IAV_Poly_model_svm.get_params()

{'memory': Memory(location=None),
 'steps': [('numerical_imputer',
   TransformerWrapper(include=['mw', 'fsp3', 'n_lipinski_hba', 'n_lipinski_hbd',
                               'n_rotatable_bonds', 'clogp',
                               'n_aliphatic_carbocycles',
                               'n_aliphatic_heterocyles', 'n_aromatic_carbocycles',
                               'n_aromatic_heterocyles', 'n_aromatic_rings',
                               'n_saturated_carbocycles',
                               'n_saturated_heterocyles', '0', '1', '2', '3', '4',
                               '5', '6', '7', '8', '9', '10', '11', '12', '13',
                               '14', '15', '16', ...],
                      transformer=SimpleImputer())),
  ('categorical_imputer',
   TransformerWrapper(include=[],
                      transformer=SimpleImputer(strategy='most_frequent'))),
  ('normalize', TransformerWrapper(transformer=StandardScaler())),
  ('actual_estimator',
   SGDClassifier

In [None]:
# Show the parameters of the loaded Extra Trees pipeline: the preprocessing steps
# and the final 'actual_estimator' step whose hyperparameters are reused in section 2.3.
IAV_Poly_model_et.get_params()

{'memory': Memory(location=None),
 'steps': [('numerical_imputer',
   TransformerWrapper(include=['mw', 'fsp3', 'n_lipinski_hba', 'n_lipinski_hbd',
                               'n_rotatable_bonds', 'clogp',
                               'n_aliphatic_carbocycles',
                               'n_aliphatic_heterocyles', 'n_aromatic_carbocycles',
                               'n_aromatic_heterocyles', 'n_aromatic_rings',
                               'n_saturated_carbocycles',
                               'n_saturated_heterocyles', '0', '1', '2', '3', '4',
                               '5', '6', '7', '8', '9', '10', '11', '12', '13',
                               '14', '15', '16', ...],
                      transformer=SimpleImputer())),
  ('categorical_imputer',
   TransformerWrapper(include=[],
                      transformer=SimpleImputer(strategy='most_frequent'))),
  ('normalize', TransformerWrapper(transformer=StandardScaler())),
  ('actual_estimator', ExtraTreesClassi

In [None]:
# Show the parameters of the loaded Gradient Boosting pipeline: the preprocessing
# steps and the final 'actual_estimator' step whose hyperparameters are reused in section 2.2.
IAV_Poly_model_gbc.get_params()

{'memory': Memory(location=None),
 'steps': [('numerical_imputer',
   TransformerWrapper(include=['mw', 'fsp3', 'n_lipinski_hba', 'n_lipinski_hbd',
                               'n_rotatable_bonds', 'clogp',
                               'n_aliphatic_carbocycles',
                               'n_aliphatic_heterocyles', 'n_aromatic_carbocycles',
                               'n_aromatic_heterocyles', 'n_aromatic_rings',
                               'n_saturated_carbocycles',
                               'n_saturated_heterocyles', '0', '1', '2', '3', '4',
                               '5', '6', '7', '8', '9', '10', '11', '12', '13',
                               '14', '15', '16', ...],
                      transformer=SimpleImputer())),
  ('categorical_imputer',
   TransformerWrapper(include=[],
                      transformer=SimpleImputer(strategy='most_frequent'))),
  ('normalize', TransformerWrapper(transformer=StandardScaler())),
  ('actual_estimator', GradientBoosting

## **2. Model custom parameters**

### **2.1. Support Vector Machine (SVM)**

In [None]:
# Create an instance of the ADASYN (Adaptive Synthetic Sampling) algorithm.
# This technique generates synthetic samples for the minority class to balance the dataset.
# 'sampling_strategy' is set to 'minority', meaning it will only oversample the minority class.
# 'random_state' is fixed (same value as the session_id given to setup) so that the
# synthetic samples do not depend on the state of the global random generator.
adasyn1 = ADASYN(sampling_strategy = 'minority', random_state = 123)

In [None]:
# Share of samples labelled 1 in the "activity" column.
class1_share = IAV_Poly.activity.value_counts()[1] / len(IAV_Poly)

# Experiment settings that do not depend on the class balance.
# NOTE(review): the setup summary reports 179 train / 77 test rows, so the
# cross-validation that follows runs on the training portion only.
setup_options = {
  "data": IAV_Poly,
  "target": "activity",
  "session_id": 123,
  "log_experiment": False,
  "normalize": True,
  "fold_shuffle": True,
  "fix_imbalance": False,
}

# Outside the 40-60 % window the classes are treated as imbalanced and ADASYN
# oversampling is switched on.
if not (0.4 <= class1_share <= 0.6):
  setup_options["fix_imbalance"] = True
  setup_options["fix_imbalance_method"] = adasyn1

IAV_Poly_model_svm = setup(**setup_options)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,activity
2,Target type,Binary
3,Original data shape,"(256, 2062)"
4,Transformed data shape,"(256, 2062)"
5,Transformed train set shape,"(179, 2062)"
6,Transformed test set shape,"(77, 2062)"
7,Numeric features,2061
8,Preprocess,True
9,Imputation type,simple


In [None]:
# Register balanced accuracy as an extra column ("BA") of the score grid.
add_metric(
  id = "B. Accuracy",
  name = "BA",
  score_func = balanced_accuracy_score,
)

Unnamed: 0,B._Accuracy
Name,BA
Display Name,BA
Score Function,<pycaret.internal.metrics.EncodedDecodedLabels...
Scorer,"make_scorer(balanced_accuracy_score, response_..."
Target,pred
Args,{}
Greater is Better,True
Multiclass,True
Custom,True


In [None]:
# Hyperparameters copied from the 'actual_estimator' step of the initial SVM
# pipeline (an SGDClassifier with hinge loss).
# NOTE(review): 'verbose' is also a named argument of create_model, so it is
# presumably consumed by PyCaret (silencing the score grid) rather than being
# forwarded to the estimator — confirm against the PyCaret documentation.
svm_hyperparameters = {
  "alpha": 0.05,
  "average": False,
  "class_weight": None,
  "early_stopping": False,
  "epsilon": 0.1,
  "eta0": 0.001,
  "fit_intercept": True,
  "l1_ratio": 0.3900000001,
  "learning_rate": 'adaptive',
  "loss": 'hinge',
  "max_iter": 1000,
  "n_iter_no_change": 5,
  "n_jobs": -1,
  "penalty": 'l1',
  "power_t": 0.5,
  "random_state": 123,
  "shuffle": True,
  "tol": 0.001,
  "validation_fraction": 0.1,
  "verbose": False,
  "warm_start": False,
}

# Cross-validate and fit the SVM with these fixed hyperparameters.
IAV_Poly_model_svm = create_model('svm', **svm_hyperparameters)

In [None]:
# Finalize the SVM: PyCaret refits the pipeline and returns it as a new object,
# kept under its own name so the cross-validated model stays available.
final_model_IAV_Poly_svm = finalize_model(estimator = IAV_Poly_model_svm)

In [None]:
# Save the finalized SVM pipeline.
# finalize_model() returns the refit pipeline as final_model_IAV_Poly_svm;
# IAV_Poly_model_svm is the create_model() result, so saving it would persist the
# model as it was before finalization instead of the retrained one.
save_model(final_model_IAV_Poly_svm, "IAV_Poly_model_svm_final")

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('numerical_imputer',
                  TransformerWrapper(exclude=None,
                                     include=['mw', 'fsp3', 'n_lipinski_hba',
                                              'n_lipinski_hbd',
                                              'n_rotatable_bonds', 'clogp',
                                              'n_aliphatic_carbocycles',
                                              'n_aliphatic_heterocyles',
                                              'n_aromatic_carbocycles',
                                              'n_aromatic_heterocyles',
                                              'n_aromatic_rings',
                                              'n_saturated_carbocycles',
                                              'n_saturated_heterocyl...
                  SGDClassifier(alpha=0.05, average=False, class_weight=None,
                                early_stopping=False, epsilon=0.1, eta0=0

In [None]:
# Retrieve PyCaret's latest score grid as a DataFrame (the per-fold
# cross-validation metrics displayed in the next cell).
statistics_final_model_IAV_Poly_svm = pull()

In [None]:
# Display the per-fold metrics of the SVM (MCC is the primary metric).
statistics_final_model_IAV_Poly_svm

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,BA
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0.9444,0.9625,0.9,1.0,0.9474,0.8889,0.8944,0.95
1,0.8333,0.95,0.9,0.8182,0.8571,0.6582,0.6625,0.825
2,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,0.8889,0.9481,1.0,0.8462,0.9167,0.7534,0.7774,0.8571
4,0.8333,0.7662,0.8182,0.9,0.8571,0.6582,0.6625,0.8377
5,0.8889,0.8961,0.9091,0.9091,0.9091,0.7662,0.7662,0.8831
6,0.8889,0.9091,0.9091,0.9091,0.9091,0.7662,0.7662,0.8831
7,0.7778,0.8052,1.0,0.7333,0.8462,0.4783,0.5606,0.7143
8,0.8333,0.7792,0.9091,0.8333,0.8696,0.64,0.6447,0.8117
9,0.8824,1.0,1.0,0.8333,0.9091,0.7463,0.7715,0.8571


In [None]:
# Write the SVM cross-validation metrics to disk.
# NOTE(review): index = False leaves the "Fold" index out of the CSV, so rows
# are identified only by position — confirm this is intended.
statistics_final_model_IAV_Poly_svm.to_csv(
  path_or_buf = "IAV_Poly_statistics_svm_final.csv",
  index = False,
)

### **2.2. Gradient Boosting Classifier**

In [None]:
# ADASYN oversampler for the minority class, used only if the class-balance
# check in the next cell finds the data imbalanced.
# 'random_state' is fixed (same value as the session_id given to setup) so that the
# synthetic samples do not depend on the state of the global random generator.
adasyn1 = ADASYN(sampling_strategy = 'minority', random_state = 123)

In [None]:
# Share of samples labelled 1 in the "activity" column.
class1_share = IAV_Poly.activity.value_counts()[1] / len(IAV_Poly)

# Experiment settings that do not depend on the class balance.
# NOTE(review): the setup summary reports 179 train / 77 test rows, so the
# cross-validation that follows runs on the training portion only.
setup_options = {
  "data": IAV_Poly,
  "target": "activity",
  "session_id": 123,
  "log_experiment": False,
  "normalize": True,
  "fold_shuffle": True,
  "fix_imbalance": False,
}

# Outside the 40-60 % window the classes are treated as imbalanced and ADASYN
# oversampling is switched on.
if not (0.4 <= class1_share <= 0.6):
  setup_options["fix_imbalance"] = True
  setup_options["fix_imbalance_method"] = adasyn1

IAV_Poly_model_gbc = setup(**setup_options)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,activity
2,Target type,Binary
3,Original data shape,"(256, 2062)"
4,Transformed data shape,"(256, 2062)"
5,Transformed train set shape,"(179, 2062)"
6,Transformed test set shape,"(77, 2062)"
7,Numeric features,2061
8,Preprocess,True
9,Imputation type,simple


In [None]:
# Register balanced accuracy as an extra column ("BA") of the score grid.
add_metric(
  id = "B. Accuracy",
  name = "BA",
  score_func = balanced_accuracy_score,
)

Unnamed: 0,B._Accuracy
Name,BA
Display Name,BA
Score Function,<pycaret.internal.metrics.EncodedDecodedLabels...
Scorer,"make_scorer(balanced_accuracy_score, response_..."
Target,pred
Args,{}
Greater is Better,True
Multiclass,True
Custom,True


In [None]:
# Hyperparameters copied from the 'actual_estimator' step of the initial
# Gradient Boosting pipeline.
# NOTE(review): 'verbose' is also a named argument of create_model, so it is
# presumably consumed by PyCaret (silencing the score grid) rather than being
# forwarded to the estimator — confirm against the PyCaret documentation.
gbc_hyperparameters = {
  "ccp_alpha": 0.0,
  "criterion": 'friedman_mse',
  "init": None,
  "learning_rate": 0.1,
  "loss": 'log_loss',
  "max_depth": 3,
  "max_features": None,
  "max_leaf_nodes": None,
  "min_impurity_decrease": 0.0,
  "min_samples_leaf": 1,
  "min_samples_split": 2,
  "min_weight_fraction_leaf": 0.0,
  "n_estimators": 100,
  "n_iter_no_change": None,
  "random_state": 123,
  "subsample": 1.0,
  "tol": 0.0001,
  "validation_fraction": 0.1,
  "verbose": False,
  "warm_start": False,
}

# Cross-validate and fit the Gradient Boosting classifier with these fixed hyperparameters.
IAV_Poly_model_gbc = create_model('gbc', **gbc_hyperparameters)

In [None]:
# Finalize the Gradient Boosting model: PyCaret refits the pipeline and returns
# it as a new object, kept under its own name so the cross-validated model stays available.
final_model_IAV_Poly_gbc = finalize_model(estimator = IAV_Poly_model_gbc)

In [None]:
# Save the finalized Gradient Boosting pipeline.
# finalize_model() returns the refit pipeline as final_model_IAV_Poly_gbc;
# IAV_Poly_model_gbc is the create_model() result, so saving it would persist the
# model as it was before finalization instead of the retrained one.
save_model(final_model_IAV_Poly_gbc, "IAV_Poly_model_gbc_final")

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('numerical_imputer',
                  TransformerWrapper(exclude=None,
                                     include=['mw', 'fsp3', 'n_lipinski_hba',
                                              'n_lipinski_hbd',
                                              'n_rotatable_bonds', 'clogp',
                                              'n_aliphatic_carbocycles',
                                              'n_aliphatic_heterocyles',
                                              'n_aromatic_carbocycles',
                                              'n_aromatic_heterocyles',
                                              'n_aromatic_rings',
                                              'n_saturated_carbocycles',
                                              'n_saturated_heterocyl...
                                             criterion='friedman_mse', init=None,
                                             learning_rate=0.1, loss=

In [None]:
# Retrieve PyCaret's latest score grid as a DataFrame (the per-fold
# cross-validation metrics displayed in the next cell).
statistics_final_model_IAV_Poly_gbc = pull()

In [None]:
# Display the per-fold metrics of the Gradient Boosting model (MCC is the primary metric).
statistics_final_model_IAV_Poly_gbc

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,BA
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0.8889,0.9375,0.9,0.9,0.9,0.775,0.775,0.8875
1,0.8889,0.9875,0.9,0.9,0.9,0.775,0.775,0.8875
2,0.9444,0.9875,1.0,0.9091,0.9524,0.8861,0.8919,0.9375
3,0.8889,0.9351,0.9091,0.9091,0.9091,0.7662,0.7662,0.8831
4,0.8333,0.8312,0.8182,0.9,0.8571,0.6582,0.6625,0.8377
5,0.8889,0.9351,0.9091,0.9091,0.9091,0.7662,0.7662,0.8831
6,0.8889,0.9481,0.9091,0.9091,0.9091,0.7662,0.7662,0.8831
7,0.7778,0.7403,1.0,0.7333,0.8462,0.4783,0.5606,0.7143
8,0.7778,0.8052,0.9091,0.7692,0.8333,0.5068,0.523,0.7403
9,0.8824,0.8857,1.0,0.8333,0.9091,0.7463,0.7715,0.8571


In [None]:
# Write the Gradient Boosting cross-validation metrics to disk.
# NOTE(review): index = False leaves the "Fold" index out of the CSV, so rows
# are identified only by position — confirm this is intended.
statistics_final_model_IAV_Poly_gbc.to_csv(
  path_or_buf = "IAV_Poly_statistics_gbc_final.csv",
  index = False,
)

### **2.3. Extra Trees Classifier**

In [None]:
# ADASYN oversampler for the minority class, used only if the class-balance
# check in the next cell finds the data imbalanced.
# 'random_state' is fixed (same value as the session_id given to setup) so that the
# synthetic samples do not depend on the state of the global random generator.
adasyn1 = ADASYN(sampling_strategy = 'minority', random_state = 123)

In [None]:
# Share of samples labelled 1 in the "activity" column.
class1_share = IAV_Poly.activity.value_counts()[1] / len(IAV_Poly)

# Experiment settings that do not depend on the class balance.
# NOTE(review): the setup summary reports 179 train / 77 test rows, so the
# cross-validation that follows runs on the training portion only.
setup_options = {
  "data": IAV_Poly,
  "target": "activity",
  "session_id": 123,
  "log_experiment": False,
  "normalize": True,
  "fold_shuffle": True,
  "fix_imbalance": False,
}

# Outside the 40-60 % window the classes are treated as imbalanced and ADASYN
# oversampling is switched on.
if not (0.4 <= class1_share <= 0.6):
  setup_options["fix_imbalance"] = True
  setup_options["fix_imbalance_method"] = adasyn1

IAV_Poly_model_et = setup(**setup_options)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,activity
2,Target type,Binary
3,Original data shape,"(256, 2062)"
4,Transformed data shape,"(256, 2062)"
5,Transformed train set shape,"(179, 2062)"
6,Transformed test set shape,"(77, 2062)"
7,Numeric features,2061
8,Preprocess,True
9,Imputation type,simple


In [None]:
# Register balanced accuracy as an extra column ("BA") of the score grid.
add_metric(
  id = "B. Accuracy",
  name = "BA",
  score_func = balanced_accuracy_score,
)

Unnamed: 0,B._Accuracy
Name,BA
Display Name,BA
Score Function,<pycaret.internal.metrics.EncodedDecodedLabels...
Scorer,"make_scorer(balanced_accuracy_score, response_..."
Target,pred
Args,{}
Greater is Better,True
Multiclass,True
Custom,True


In [None]:
# Hyperparameters copied from the 'actual_estimator' step of the initial
# Extra Trees pipeline.
# NOTE(review): 'verbose' is also a named argument of create_model, so it is
# presumably consumed by PyCaret (silencing the score grid) rather than being
# forwarded to the estimator — confirm against the PyCaret documentation.
et_hyperparameters = {
  "bootstrap": False,
  "ccp_alpha": 0.0,
  "class_weight": None,
  "criterion": 'gini',
  "max_depth": None,
  "max_features": 'sqrt',
  "max_leaf_nodes": None,
  "max_samples": None,
  "min_impurity_decrease": 0.0,
  "min_samples_leaf": 1,
  "min_samples_split": 2,
  "min_weight_fraction_leaf": 0.0,
  "monotonic_cst": None,
  "n_estimators": 100,
  "n_jobs": -1,
  "oob_score": False,
  "random_state": 123,
  "verbose": False,
  "warm_start": False,
}

# Cross-validate and fit the Extra Trees classifier with these fixed hyperparameters.
IAV_Poly_model_et = create_model('et', **et_hyperparameters)

In [None]:
# Finalize the Extra Trees model: PyCaret refits the pipeline and returns it as
# a new object, kept under its own name so the cross-validated model stays available.
final_model_IAV_Poly_et = finalize_model(estimator = IAV_Poly_model_et)

In [None]:
# Save the finalized Extra Trees pipeline.
# finalize_model() returns the refit pipeline as final_model_IAV_Poly_et;
# IAV_Poly_model_et is the create_model() result, so saving it would persist the
# model as it was before finalization instead of the retrained one.
save_model(final_model_IAV_Poly_et, "IAV_Poly_model_et_final")

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('numerical_imputer',
                  TransformerWrapper(exclude=None,
                                     include=['mw', 'fsp3', 'n_lipinski_hba',
                                              'n_lipinski_hbd',
                                              'n_rotatable_bonds', 'clogp',
                                              'n_aliphatic_carbocycles',
                                              'n_aliphatic_heterocyles',
                                              'n_aromatic_carbocycles',
                                              'n_aromatic_heterocyles',
                                              'n_aromatic_rings',
                                              'n_saturated_carbocycles',
                                              'n_saturated_heterocyl...
                  ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0,
                                       class_weight=None, criterion='gini',
    

In [None]:
# Retrieve PyCaret's latest score grid as a DataFrame (the per-fold
# cross-validation metrics displayed in the next cell).
statistics_final_model_IAV_Poly_et = pull()

In [None]:
# Display the per-fold metrics of the Extra Trees model (MCC is the primary metric).
statistics_final_model_IAV_Poly_et

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,BA
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0.9444,0.9188,0.9,1.0,0.9474,0.8889,0.8944,0.95
1,0.8889,0.9625,0.9,0.9,0.9,0.775,0.775,0.8875
2,0.9444,1.0,1.0,0.9091,0.9524,0.8861,0.8919,0.9375
3,0.9444,0.974,1.0,0.9167,0.9565,0.88,0.8864,0.9286
4,0.8333,0.8312,0.8182,0.9,0.8571,0.6582,0.6625,0.8377
5,0.8333,0.8961,0.8182,0.9,0.8571,0.6582,0.6625,0.8377
6,0.8889,0.9221,0.9091,0.9091,0.9091,0.7662,0.7662,0.8831
7,0.7778,0.7468,1.0,0.7333,0.8462,0.4783,0.5606,0.7143
8,0.8333,0.9481,0.9091,0.8333,0.8696,0.64,0.6447,0.8117
9,0.8824,1.0,1.0,0.8333,0.9091,0.7463,0.7715,0.8571


In [None]:
# Write the Extra Trees cross-validation metrics to disk.
# NOTE(review): index = False leaves the "Fold" index out of the CSV, so rows
# are identified only by position — confirm this is intended.
statistics_final_model_IAV_Poly_et.to_csv(
  path_or_buf = "IAV_Poly_statistics_et_final.csv",
  index = False,
)