# **1) Initiall instructions**

In [1]:
!pip install rdkit
!pip install optuna
!pip install catboost

Collecting rdkit
  Downloading rdkit-2024.9.1-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (4.0 kB)
Downloading rdkit-2024.9.1-cp310-cp310-manylinux_2_28_x86_64.whl (33.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.3/33.3 MB[0m [31m33.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit
Successfully installed rdkit-2024.9.1
Collecting optuna
  Downloading optuna-4.1.0-py3-none-any.whl.metadata (16 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.14.0-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.8-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.1.0-py3-none-any.whl (364 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m364.4/364.4 kB[0m [31m19.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.14.0-py3-none-any.whl (233 kB)
[2K  

In [21]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier
import optuna

from rdkit import Chem
from rdkit.Chem import RDKFingerprint
from rdkit.Chem.rdFingerprintGenerator import GetMorganGenerator

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, matthews_corrcoef

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# **2) Data preparation**

Due to the fact that several ligands did not show the correct binding mode in the molecular docking assay and the fact that several molecules were not docked at all by the algorithm, there is a need to adapt fingerprints-based dataset to the dataset used for the CNNdock estimator training.

In [26]:
data = pd.read_csv('filepath_1') #path to file with parsed informations (especially bioactivity and smiles) from CHEMBL database
df = pd.DataFrame(data)

In [5]:
used_ligands = pd.read_csv('filepath_2', header=None) #path to file with ligands used for CNNdock model training
list_of_used_ligands = used_ligands[0].to_list()
list_of_used_ligands = [entry.replace('_', '') for entry in list_of_used_ligands]

In [6]:
df = df[df['Molecule_ChEMBL_ID'].isin(list_of_used_ligands)]
df['Activity'] = np.where(df['Standard_Value'] <= 50, 1, 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Activity'] = np.where(df['Standard_Value'] <= 50, 1, 0)


In [7]:
mols = []
for smiles in df['Smiles']:
  mol = Chem.MolFromSmiles(smiles)
  if mol is not None:
    mols.append(mol)

morgan_gen = GetMorganGenerator(radius=2, fpSize=2048)
fps = [morgan_gen.GetFingerprint(mol) for mol in mols]
df['Fingerprints'] = fps

In [8]:
df['Fingerprints'] = df['Fingerprints'].apply(lambda x: x.ToBitString())

In [9]:
x = []
y = []

for x_elem, y_elem in zip(df['Fingerprints'], df['Activity']):
  x.append(x_elem)
  y.append(y_elem)

x = np.array(x)
y = np.array(y)

x_array = np.array([list(map(int, fp)) for fp in x])

# **3) Estimators selection**

In [10]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [22]:
models_list = [RandomForestClassifier(random_state=42),
               SVC(random_state=42),
               LogisticRegression(random_state=42),
               CatBoostClassifier(random_state=42, verbose=0)]

rfc_accuracies = []
svc_accuracies = []
lr_accuracies = []
cat_accuracies = []

rfc_mcc = []
svc_mcc = []
lr_mcc = []
cat_mcc = []

for elem in models_list:
    model = elem
    model.fit(x_array, y)
    scores = cross_val_score(model, x_array, y, cv=cv, scoring='accuracy').mean()


    if isinstance(elem, RandomForestClassifier):
        rfc_accuracies.append(scores)
        rfc_mcc.append(matthews_corrcoef(y, model.predict(x_array)))
    elif isinstance(elem, SVC):
        svc_accuracies.append(scores)
        svc_mcc.append(matthews_corrcoef(y, model.predict(x_array)))
    elif isinstance(elem, LogisticRegression):
        lr_accuracies.append(scores)
        lr_mcc.append(matthews_corrcoef(y, model.predict(x_array)))
    else:
        cat_accuracies.append(scores)
        cat_mcc.append(matthews_corrcoef(y, model.predict(x_array)))

In [23]:
print(f'Random Forest Classifier accuracy: {rfc_accuracies[0]*100:.2f} %')
print(f'Random Forest Classifier MCC: {rfc_mcc[0]:.2f}')
print()
print(f'Support Vector Classifier accuracy: {svc_accuracies[0]*100:.2f} %')
print(f'Support Vector Classifier MCC: {svc_mcc[0]:.2f}')
print()
print(f'Logistic Regression accuracy: {lr_accuracies[0]*100:.2f} %')
print(f'Logistic Regression MCC: {lr_mcc[0]}')
print()
print(f'CatBoost Classifier accuracy: {cat_accuracies[0]*100:.2f} %')
print(f'CatBoost Classifier MCC: {cat_mcc[0]:.2f}')

Random Forest Classifier accuracy: 93.93 %
Random Forest Classifier MCC: 1.00

Support Vector Classifier accuracy: 94.49 %
Support Vector Classifier MCC: 0.95

Logistic Regression accuracy: 95.05 %
Logistic Regression MCC: 0.9876626841002021

CatBoost Classifier accuracy: 94.49 %
CatBoost Classifier MCC: 0.99


# **4) SVC and Logistic Regression optimization**

## **4.1) SVM Classifier optimization**

In [13]:
def objective(trial):
    C = trial.suggest_float('C', 1e-5, 100)
    kernel = trial.suggest_categorical('kernel', ['linear', 'poly', 'rbf', 'sigmoid'])
    gamma = trial.suggest_float('gamma', 1e-5, 1)
    degree = trial.suggest_int('degree', 2, 5)
    coef0 = trial.suggest_float('coef0', -1, 1)
    shrinking = trial.suggest_categorical('shrinking', [True, False])
    tol = trial.suggest_float('tol', 1e-5, 1e-1)
    max_iter = trial.suggest_int('max_iter', 100, 10000)

    model = SVC(C=C,
                kernel=kernel,
                gamma=gamma,
                degree=degree,
                coef0=coef0,
                shrinking=shrinking,
                tol=tol,
                max_iter=max_iter,
                random_state=42)

    score = cross_val_score(model, x_array, y, cv=cv, scoring='accuracy').mean()
    return score

In [14]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

[I 2025-01-02 12:54:33,413] A new study created in memory with name: no-name-f01b6cc6-cc70-4331-8df5-93e13bfe4441
[I 2025-01-02 12:54:33,481] Trial 0 finished with value: 0.9174174174174174 and parameters: {'C': 28.944327305620043, 'kernel': 'linear', 'gamma': 0.599015845588658, 'degree': 2, 'coef0': 0.8469811340249953, 'shrinking': True, 'tol': 0.07255708267689115, 'max_iter': 1716}. Best is trial 0 with value: 0.9174174174174174.
[I 2025-01-02 12:54:33,539] Trial 1 finished with value: 0.8393393393393394 and parameters: {'C': 33.838064780490534, 'kernel': 'sigmoid', 'gamma': 0.11591317504177606, 'degree': 5, 'coef0': -0.764078816398533, 'shrinking': False, 'tol': 0.09267144538922127, 'max_iter': 3276}. Best is trial 0 with value: 0.9174174174174174.
[I 2025-01-02 12:54:33,701] Trial 2 finished with value: 0.7183183183183183 and parameters: {'C': 13.255633073105479, 'kernel': 'rbf', 'gamma': 0.7294709470660947, 'degree': 5, 'coef0': 0.591015035369574, 'shrinking': True, 'tol': 0.04768

In [15]:
best_params = study.best_params
C= best_params['C']
kernel= best_params['kernel']
gamma= best_params['gamma']
degree= best_params['degree']
coef0= best_params['coef0']
shrinking= best_params['shrinking']
tol= best_params['tol']
max_iter= best_params['max_iter']

best_svm_model = SVC(C=C,
                kernel=kernel,
                gamma=gamma,
                degree=degree,
                coef0=coef0,
                shrinking=shrinking,
                tol=tol,
                max_iter=max_iter,
                random_state=42)

best_svm_model.fit(x_array, y)

In [16]:
print(accuracy_score(y, best_svm_model.predict(x_array)))
print(matthews_corrcoef(y, best_svm_model.predict(x_array)))

0.9834254143646409
0.9631349585705171


## **4.2) LogisticRegression optimization**

In [17]:
def objective(trial):
  C = trial.suggest_float('C', 1e-4, 1e2)
  fit_intercept = trial.suggest_categorical('fit_intercept', [True, False])

  model = LogisticRegression(
      C=C,
      fit_intercept=fit_intercept
  )

  score = cross_val_score(model, x_array, y, cv=cv, scoring='accuracy').mean()
  return score

In [18]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

[I 2025-01-02 12:55:00,092] A new study created in memory with name: no-name-880aba3f-4075-4071-bf9f-c6daac5bbea1
[I 2025-01-02 12:55:00,384] Trial 0 finished with value: 0.9394894894894895 and parameters: {'C': 15.823935025825795, 'fit_intercept': False}. Best is trial 0 with value: 0.9394894894894895.
[I 2025-01-02 12:55:00,687] Trial 1 finished with value: 0.945045045045045 and parameters: {'C': 45.76225001896282, 'fit_intercept': True}. Best is trial 1 with value: 0.945045045045045.
[I 2025-01-02 12:55:00,923] Trial 2 finished with value: 0.945045045045045 and parameters: {'C': 73.86331826063436, 'fit_intercept': True}. Best is trial 1 with value: 0.945045045045045.
[I 2025-01-02 12:55:01,225] Trial 3 finished with value: 0.945045045045045 and parameters: {'C': 33.41067114375974, 'fit_intercept': True}. Best is trial 1 with value: 0.945045045045045.
[I 2025-01-02 12:55:01,473] Trial 4 finished with value: 0.945045045045045 and parameters: {'C': 64.2189652653044, 'fit_intercept': Tr

In [19]:
best_params = study.best_params
C= best_params['C']
fit_intercept= best_params['fit_intercept']

best_lr_model = LogisticRegression(
    C=C,
    fit_intercept=fit_intercept,
    random_state=42
)

best_lr_model.fit(x_array, y)

In [20]:
print(accuracy_score(y, best_lr_model.predict(x_array)))
print(matthews_corrcoef(y, best_lr_model.predict(x_array)))

0.994475138121547
0.9876626841002021
