# **1) Initial instructions**

In [None]:
!pip install optuna
!pip install scikeras
!pip install rdkit

Collecting optuna
  Downloading optuna-4.1.0-py3-none-any.whl.metadata (16 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.14.0-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.7-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.1.0-py3-none-any.whl (364 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m364.4/364.4 kB[0m [31m27.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.14.0-py3-none-any.whl (233 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.5/233.5 kB[0m [31m21.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Downloading Mako-1.3.7-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.9/78.9 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: M

In [None]:
import os
import glob
import optuna
import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models

from scikeras.wrappers import KerasClassifier
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.metrics import accuracy_score, recall_score, matthews_corrcoef
from sklearn.model_selection import cross_val_score

from rdkit import Chem
from rdkit.Chem.AllChem import GetMorganGenerator

In [None]:
# Mount Google Drive at /content/drive. The google.colab module is used here,
# so this cell is expected to run inside Google Colab.
# NOTE(review): presumably the dataset files are read from the mounted drive;
# the paths used further down are placeholders — confirm before running.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# **2) Data preprocessing**

In [None]:
# Seed the random number generators so that repeated runs are comparable.
seed = 42
tf.keras.utils.set_random_seed(seed)

dataset = pd.read_csv('filepath') #path to file with bioactivity and smiles
df = pd.DataFrame(data=dataset, columns=['Molecule_ChEMBL_ID', 'Standard_Value', 'Smiles'])

# Binary label: 1 when Standard_Value <= 50, otherwise 0
# (a missing Standard_Value compares False and therefore becomes 0).
df['Bin_Activity'] = np.where(df['Standard_Value'] <= 50, 1, 0)
df = df.drop(columns=['Standard_Value'])

# Index the table by molecule ID once instead of scanning the whole frame for
# every image. When an ID occurs several times the first row wins, which is
# what the previous `.values[0]` lookup did as well.
records_by_id = (
    df.drop_duplicates(subset='Molecule_ChEMBL_ID', keep='first')
      .set_index('Molecule_ChEMBL_ID')
)

# Glob pattern for the image files. The dataset is nested, so the pattern uses
# `**` together with glob.glob(recursive=True) to search sub-directories.
images_path = r"example_dataset/**/CHEMBL*/image_CHEMBL*.png"

images = []
labels = []
fps = []

morgan_gen = GetMorganGenerator(radius=2, fpSize=2048)

# glob returns files in arbitrary order; sorting keeps the sample order (and
# with it the seeded cross-validation splits) stable between runs.
for img_path in sorted(glob.glob(images_path, recursive=True)):
    # File names look like "image_<ChEMBL ID>__<suffix>.png"; keep only the ID.
    file_name = os.path.basename(img_path)
    ch_embl_id = file_name.split('__')[0].replace("image_", "")

    # A sample is kept only when its label AND its fingerprint are available.
    # Previously the image was stored before these checks, so every skipped
    # file shifted `images` out of step with `labels` / `fps`.
    if ch_embl_id not in records_by_id.index:
        print(f"Brak etykiety dla: {ch_embl_id}")  # "no label for: <id>"
        continue

    smiles = records_by_id.at[ch_embl_id, 'Smiles']
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # "could not generate a fingerprint for: <id>"
        print(f"Nie udało się wygenerować fingerprintu dla: {ch_embl_id}")
        continue

    img = keras.utils.load_img(img_path, target_size=(224, 224))
    images.append(keras.utils.img_to_array(img))
    labels.append(records_by_id.at[ch_embl_id, 'Bin_Activity'])
    fps.append(morgan_gen.GetFingerprint(mol))

# The three lists are indexed together downstream, so they must line up.
assert len(images) == len(labels) == len(fps), \
    "images, labels and fingerprints must contain the same number of samples"

images_array = np.array(images) / 255.0  # scale pixel values to [0, 1]
labels_array = np.array(labels)
# Expand every bit vector into a row of 0/1 integers (one column per bit).
fps_array = np.array([list(map(int, fp.ToBitString())) for fp in fps])

# **3) Model training and validation**

In [None]:
%%time
class MultiInputKerasClassifier(BaseEstimator, ClassifierMixin):
    """Scikit-learn style wrapper around a Keras model with two inputs.

    Parameters
    ----------
    model_fn : callable
        Zero-argument factory returning a compiled Keras model. It is called
        on every ``fit`` so each fit starts from a freshly built model.
    epochs : int, default=20
        Number of epochs passed to the model's ``fit``.
    verbose : int, default=0
        Verbosity passed to the model's ``fit`` and ``predict``.

    Notes
    -----
    ``X`` is an indexable pair: ``X[0]`` is fed to the first model input and
    ``X[1]`` to the second one.
    """

    def __init__(self, model_fn, epochs=20, verbose=0):
        self.model_fn = model_fn
        self.epochs = epochs
        self.verbose = verbose
        self.model = None  # set by fit(); None means "not fitted yet"

    def fit(self, X, y):
        """Build a new model with ``model_fn`` and train it on ``X`` / ``y``.

        Returns
        -------
        self
        """
        image_input_data = X[0]
        fingerprint_input_data = X[1]

        self.model = self.model_fn()
        self.model.fit(
            [image_input_data, fingerprint_input_data],
            y,
            epochs=self.epochs,
            verbose=self.verbose,
        )
        return self

    def predict(self, X):
        """Return the wrapped model's raw ``predict`` output for ``X``.

        The output is not thresholded here; callers that need hard class
        labels have to threshold it themselves.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If called before ``fit``. The exception subclasses both
            ``ValueError`` and ``AttributeError``, so code that caught the
            previous ``AttributeError`` keeps working.
        """
        if self.model is None:
            from sklearn.exceptions import NotFittedError
            raise NotFittedError(
                "This MultiInputKerasClassifier instance is not fitted yet; "
                "call fit() before predict()."
            )

        image_input_data = X[0]
        fingerprint_input_data = X[1]

        # Honour the configured verbosity; without it Keras prints a progress
        # bar for every predict call even when verbose=0 was requested.
        return self.model.predict(
            [image_input_data, fingerprint_input_data],
            verbose=self.verbose,
        )


def create_model():
    """Build and compile the two-input binary classifier.

    The model has an image input of shape (224, 224, 3) that goes through a
    DenseNet121 backbone (ImageNet weights, no top, all layers frozen) followed
    by global average pooling, and a fingerprint input of shape (2048,) that
    goes through a 128-unit ReLU layer. Both feature vectors are concatenated
    and passed through Dense(8, relu) -> Dropout(0.3) -> Dense(1, sigmoid).

    Returns
    -------
    keras.Model
        Model compiled with Adam, binary cross-entropy loss and the accuracy
        metric. Inputs are ordered [image_input, fingerprint_input].
    """
    L = keras.layers

    # --- image branch: frozen DenseNet121 feature extractor ---
    img_in = L.Input(shape=(224, 224, 3), name="image_input")
    backbone = keras.applications.DenseNet121(
        weights='imagenet',
        include_top=False,
        input_tensor=img_in,
    )
    for backbone_layer in backbone.layers:
        backbone_layer.trainable = False
    img_features = L.GlobalAveragePooling2D()(backbone.output)

    # --- fingerprint branch ---
    fp_in = L.Input(shape=(2048,), name="fingerprint_input")
    fp_features = L.Dense(128, activation='relu')(fp_in)

    # --- classification head on the merged features ---
    merged = L.Concatenate()([img_features, fp_features])
    hidden = L.Dense(8, activation='relu')(merged)
    hidden = L.Dropout(0.3)(hidden)
    prob = L.Dense(1, activation='sigmoid')(hidden)

    model = keras.Model(inputs=[img_in, fp_in], outputs=prob)
    model.compile(
        optimizer=keras.optimizers.Adam(),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return model


# Model inputs. fps_array is already a NumPy array, so no extra copy is made.
image_input_data = images_array
fingerprint_input_data = fps_array

# The folds index all three arrays with the same positions, so a length
# mismatch would either crash mid-run or silently pair samples with the
# wrong label. Fail early with a clear message instead.
if not (len(image_input_data) == len(fingerprint_input_data) == len(labels_array)):
    raise ValueError(
        "Sample count mismatch: "
        f"{len(image_input_data)} images, "
        f"{len(fingerprint_input_data)} fingerprints, "
        f"{len(labels_array)} labels"
    )

X_combined = (image_input_data, fingerprint_input_data)
y = labels_array

keras_classifier = MultiInputKerasClassifier(
    model_fn=create_model,
    epochs=20,
    verbose=0
)

# NOTE(review): the image file names are split on "__", which suggests one
# molecule may have several images. If so, plain KFold can place images of the
# same molecule in both train and test folds — confirm and consider GroupKFold.
cv = KFold(n_splits=5, random_state=42, shuffle=True)

# One entry per fold.
accuracies = []
mcc = []
recall = []

for train_index, test_index in cv.split(X_combined[0]):
    X_train_image, X_test_image = X_combined[0][train_index], X_combined[0][test_index]
    X_train_fp, X_test_fp = X_combined[1][train_index], X_combined[1][test_index]
    y_train, y_test = y[train_index], y[test_index]

    # fit() builds a fresh model, so no weights carry over between folds.
    keras_classifier.fit((X_train_image, X_train_fp), y_train)
    y_pred = keras_classifier.predict((X_test_image, X_test_fp))

    # Threshold the predicted probabilities once and flatten the (n, 1) model
    # output to the 1-D label vector the metric functions expect.
    y_pred_labels = (np.asarray(y_pred) > 0.5).astype(int).ravel()

    accuracies.append(accuracy_score(y_test, y_pred_labels))
    mcc.append(matthews_corrcoef(y_test, y_pred_labels))
    recall.append(recall_score(y_test, y_pred_labels))

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/densenet/densenet121_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m29084464/29084464[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 0us/step


In [None]:
def _print_metric_report(title, label, scores):
    """Print the per-split scores of one metric followed by their mean.

    Parameters
    ----------
    title : str
        Name shown in the "METRIC: ..." header line.
    label : str
        Name used in the "Mean ..." summary line.
    scores : sequence of float
        One score per cross-validation split, in split order.
    """
    print(f'METRIC: {title}')
    for split_nr, score in enumerate(scores, 1):
        print(f"\tScore for split {split_nr} is: {score}")

    print()
    print(f'\tMean {label}: ', np.mean(scores))
    print('-----------------------------------------')


print('-----------------------------------------')
_print_metric_report('ACCURACY', 'accuracy', accuracies)
# The MCC and recall summaries used to be labelled "Mean accuracy" as well.
_print_metric_report('MCC', 'MCC', mcc)
_print_metric_report('RECALL', 'recall', recall)