# **1) Initial Instructions**

In [None]:
# Install RDKit quietly (-q) into the runtime; the rdkit modules are imported in the import cell below.
!pip install -q rdkit

In [None]:
# Mount Google Drive at /content/drive; every dataset path used later in this notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import glob
import os
import numpy as np
import tensorflow as tf
import cv2
import seaborn as sns
from joblib import dump
from collections import Counter

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, matthews_corrcoef
import matplotlib.pyplot as plt

from rdkit import Chem
from rdkit.Chem.AllChem import GetMorganGenerator
from rdkit.Chem.SaltRemover import SaltRemover

import tensorflow as tf   
from tensorflow import keras
from tensorflow.keras import applications
from tensorflow.keras.applications import DenseNet201
from tensorflow.keras.callbacks import EarlyStopping

# Seed Keras' random state with a fixed value (42) so that repeated runs are comparable.
tf.keras.utils.set_random_seed(42)

# **2) Data Preparation**

In [None]:
# Molecule table parsed from ChEMBL: only the ID, SMILES and binary activity columns are read.
data = pd.read_csv(
    r'/content/drive/MyDrive/path',
    usecols=['Molecule ChEMBL ID', 'Smiles', 'Bin_Activity'],
)

# ChEMBL IDs (named "esters" here) that are excluded from the working table.
list_of_esters = [
    'CHEMBL4846931', 'CHEMBL4849680', 'CHEMBL4863967', 'CHEMBL4867606', 'CHEMBL4874372',
    'CHEMBL5431767', 'CHEMBL5430776', 'CHEMBL5412404', 'CHEMBL5398411', 'CHEMBL5438068',
]

# Keep every row whose ID is not on the exclusion list.
is_excluded = data['Molecule ChEMBL ID'].isin(list_of_esters)
df = data.loc[~is_excluded]

In [None]:
"""In order to keep the datasets equal for both variants of images, it is necessary to perform the intersection of both sets"""

# Directory holding the images rendered after docking
path_dock = r'/content/drive/MyDrive/CNNDock_2__poprawki/datasety_od_nowa/dokowanie/images_docking_sprawdzona_rozdzielczosc/images_docking_sprawdzona_rozdzielczosc'

# Directory holding the images rendered after DFT optimization
path_quanta = r'/content/drive/MyDrive/CNNDock_2__poprawki/datasety_od_nowa/kwanty/images_quanta/images_dft'


# The text before the first underscore of each directory entry is taken as its identifier.
list_dock = [entry.partition('_')[0] for entry in os.listdir(path_dock)]
list_quanta = [entry.partition('_')[0] for entry in os.listdir(path_quanta)]

for id_list in (list_dock, list_quanta):
    print(len(id_list))

# Identifiers present in both image sets.
ids_dock = set(list_dock)
ids_quanta = set(list_quanta)
common_idx = list(ids_dock & ids_quanta)
print(len(common_idx))

In [None]:
"""This section is responsible for loading images. Below are the paths (commented) to 2 data sets: images after docking and after DFT minimization.
To load a specific data set, uncomment one particular line of code"""

# `path` is the recursive glob pattern of the data set to load; uncomment exactly one of the
# two assignments below before running this cell (otherwise `path` is undefined here).
#path = r'/content/drive/MyDrive/CNNDock_2__poprawki/datasety_od_nowa/dokowanie/images_docking_sprawdzona_rozdzielczosc/images_docking_sprawdzona_rozdzielczosc/**/*.png'
#path = r'/content/drive/MyDrive/CNNDock_2__poprawki/datasety_od_nowa/kwanty/images_quanta/images_dft/**/*.png'

images = glob.glob(path, recursive=True)

# Set copy of the shared IDs: membership is tested once per image, and `common_idx` is a list.
common_ids = set(common_idx)

# One label per molecule (the first occurrence, as `.values[0]` would pick), looked up once
# instead of filtering the whole frame for every image.
label_by_id = (
    df.drop_duplicates(subset='Molecule ChEMBL ID')
      .set_index('Molecule ChEMBL ID')['Bin_Activity']
      .to_dict()
)

img_list = []
img_labels = []
id_counter = Counter()
chembl_idx = []
skipped_no_label = Counter()   # images whose ID has no row in df (e.g. IDs filtered out of df earlier)

for image in images:
    filename = os.path.basename(image)
    # The ChEMBL ID is the first underscore-separated token that starts with 'CHEMBL'.
    image_id = next((part for part in filename.split('_') if part.startswith('CHEMBL')), None)

    if image_id is None or image_id not in common_ids:
        continue

    # Molecules removed from df have no label; skip their images instead of failing on `.values[0]`.
    if image_id not in label_by_id:
        skipped_no_label[image_id] += 1
        continue

    img = cv2.imread(image)
    if img is None:
        # cv2.imread signals an unreadable file by returning None rather than raising.
        raise ValueError(f'Could not read image file: {image}')
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    resized_image = cv2.resize(img, (224, 224), interpolation=cv2.INTER_AREA)

    # Append to all four containers together so they stay index-aligned.
    id_counter[image_id] += 1
    chembl_idx.append(image_id)
    img_list.append(resized_image)
    img_labels.append(label_by_id[image_id])

print(len(img_list))
print(len(img_labels))
print()
print(id_counter.most_common(10))
if skipped_no_label:
    print(f'Skipped {sum(skipped_no_label.values())} image(s) without a label in df: {sorted(skipped_no_label)}')

In [None]:
# Visual sanity check: show the first entry of img_list (the loading cell stores images
# converted BGR -> RGB and resized to 224x224).
plt.imshow(img_list[0])

In [None]:
# Turn the collected Python lists into NumPy arrays
img_labels = np.array(img_labels)
img_arr = np.array(img_list)

# Cast pixel values to float32, then divide by 255 in place (0-1 range)
img_arr = img_arr.astype(np.float32, copy=False)
np.divide(img_arr, 255.0, out=img_arr)

# **3) 5-Fold Cross-Validation**

In [None]:
#5-fold crossvalidation
# For every stratified fold a fresh classifier (frozen ImageNet DenseNet201 + dense head) is
# trained on the training indices and scored on the held-out indices. Accuracy and MCC are both
# computed from the final model's thresholded predictions, so the two metrics describe the same model.

callback = keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=3)
skfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

all_folds_scores_history = {'val_acc': [], 'mcc': []}

for fold_no, (train, test) in enumerate(skfold.split(img_arr, img_labels), start=1):
    # Release the Keras state left over from the previous fold's model; building several
    # DenseNet201 instances in a loop otherwise keeps accumulating memory.
    keras.backend.clear_session()

    base_model = DenseNet201(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
    base_model.trainable = False

    x = keras.layers.GlobalAveragePooling2D()(base_model.output)
    x = keras.layers.Dense(128, activation='relu')(x)
    x = keras.layers.Dropout(0.3)(x)
    x = keras.layers.Dense(64, activation='relu')(x)
    x = keras.layers.Dropout(0.3)(x)
    x = keras.layers.Dense(32, activation='relu')(x)
    x = keras.layers.Dropout(0.3)(x)
    output = keras.layers.Dense(1, activation='sigmoid')(x)

    model = keras.models.Model(inputs=base_model.input, outputs=output)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=["accuracy"])

    print(f'Fold {fold_no} ...')

    # NOTE(review): early stopping monitors the same fold that is scored below, so the stopping
    # epoch is chosen on the evaluation data - confirm this is acceptable for the reported scores.
    history = model.fit(
        img_arr[train], img_labels[train],
        batch_size=32,
        epochs=20,
        verbose=0,
        callbacks=[callback],
        validation_data=(img_arr[test], img_labels[test])
    )

    # Threshold the sigmoid outputs at 0.5 and flatten (n, 1) -> (n,) for the sklearn metrics.
    y_pred_probs = model.predict(img_arr[test])
    y_pred = (y_pred_probs > 0.5).astype(int).ravel()

    # Accuracy of the trained model on the held-out fold (not an average over training epochs).
    fold_acc = accuracy_score(img_labels[test], y_pred)
    all_folds_scores_history['val_acc'].append(fold_acc)

    mcc = matthews_corrcoef(img_labels[test], y_pred)
    all_folds_scores_history['mcc'].append(mcc)

print()
print(f'Average Accuracy for 5-fold crossvalidation: {np.mean(all_folds_scores_history["val_acc"]):.2f}')
print(f'Average MCC for 5-fold crossvalidation: {np.mean(all_folds_scores_history["mcc"]):.2f}')

# **4) Model Building, Training and Prediction**

In [None]:
# Transfer-learning classifier: DenseNet201 with "imagenet" weights as a frozen backbone
# (base_model.trainable = False), followed by a small trainable dense head.
base_model = DenseNet201(input_shape=(224, 224, 3), include_top=False, weights='imagenet')
base_model.trainable = False

# Head: global average pooling, then three Dense(relu) + Dropout(0.3) stages of shrinking width.
x = keras.layers.GlobalAveragePooling2D()(base_model.output)
for units in (128, 64, 32):
    x = keras.layers.Dense(units, activation='relu')(x)
    x = keras.layers.Dropout(0.3)(x)

# Single sigmoid unit for the binary activity label.
output = keras.layers.Dense(1, activation='sigmoid')(x)

model = keras.models.Model(inputs=base_model.input, outputs=output)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
#Data splitting (train set and test set) and training (with EarlyStopping to prevent overfitting)
# The ChEMBL IDs are split together with the images and labels so chembl_train / chembl_test stay
# index-aligned with X_train_img / X_test_img.
# NOTE(review): the split is made per image. If a molecule has several images (id_counter in the
# loading cell counts images per ID), the same molecule can land in both sets - confirm whether a
# group-aware split is needed.
X_train_img, X_test_img, y_train, y_test, chembl_train, chembl_test = train_test_split(
    img_arr, img_labels, chembl_idx,
    test_size=0.2,
    random_state=42,
    stratify=img_labels
)
callback2 = keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=3, restore_best_weights=True)

# Early stopping and best-weight restoration select the model on the validation data, so that data
# must not be the test set, or the test metrics reported afterwards would be optimistically biased.
# validation_split holds out the last 10% of the (already shuffled) training samples instead.
history = model.fit(
    X_train_img, y_train,
    epochs=20,
    batch_size=32,
    validation_split=0.1,
    callbacks=[callback2]
)

In [None]:
#Training visualizations
# Two side-by-side panels (accuracy, loss), each showing the training curve and the validation
# curve recorded in `history` per epoch.
fig, ax = plt.subplots(1, 2, figsize=(12, 4))

# (axes, key in history.history, human-readable name) for each panel
panels = (
    (ax[0], 'accuracy', 'Accuracy'),
    (ax[1], 'loss', 'Loss'),
)

for axis, metric, label in panels:
    axis.plot(history.history[metric], label=f'Training {label}')
    axis.plot(history.history[f'val_{metric}'], label=f'Validation {label}')
    axis.set_xlabel('Epochs')
    axis.set_ylabel(label)
    axis.set_title(f'Training and Validation {label}')
    # Each panel owns its legend; no extra pyplot-level legend call is needed.
    axis.legend()

plt.tight_layout()
plt.show()

In [None]:
# Predict on the test images and binarise the model outputs at a 0.5 threshold
test_probs = model.predict(X_test_img)
y_pred = np.where(test_probs > 0.5, 1, 0)

# Metrics comparing the thresholded predictions with the true test labels
test_confusion_matrix = confusion_matrix(y_test, y_pred)
test_classification_report = classification_report(y_test, y_pred)
test_mcc = matthews_corrcoef(y_test, y_pred)
test_acc = accuracy_score(y_test, y_pred)

print(
    f'Test Accuracy: {test_acc}',
    f'Test MCC: {test_mcc}',
    f'Test Classification Report:\n{test_classification_report}',
    sep='\n',
)

In [None]:
# Heatmap of the test-set confusion matrix, annotated with the accuracy and MCC values
cm_fig, cm_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(test_confusion_matrix, annot=True, fmt='d', cmap='Reds', ax=cm_ax)
cm_ax.set_xlabel('Predicted')
cm_ax.set_ylabel('True')
cm_ax.set_title('Confusion Matrix')

# Text positions are given in the heatmap's data coordinates.
cm_ax.text(0, 2.4, f'Test accuracy: {100*test_acc:.2f}%', fontsize=15)
cm_ax.text(0, 2.5, f'Test mcc: {test_mcc:.2f}', fontsize=15)

In [None]:
#Labels comparison (true vs predicted for test set)
# The table gets its own name: rebinding `df` here would replace the molecule table that the
# image-loading cell reads ('Molecule ChEMBL ID' / 'Bin_Activity'), breaking any re-run of it.
results_df = pd.DataFrame({
    'chembl_id': chembl_test,
    'true': y_test,
    'pred': y_pred.flatten()   # y_pred is a column of thresholded predictions; flatten to 1-D
})

# One report per ChEMBL ID; an ID can own several rows when several of its images are in the test set.
for chembl_id, group in results_df.groupby('chembl_id'):
    true_vals = group['true'].values
    pred_vals = group['pred'].values
    # Flag the molecule when at least one of its predictions differs from the true label.
    if not np.array_equal(true_vals, pred_vals):
        print(f"Chembl ID: {chembl_id}.Bad predictions {30*'❌'}")
    print(f"{chembl_id}")
    print(f"True:      {true_vals}")
    print(f"Predicted: {pred_vals}")
    print("-" * 30)


In [None]:
"""This commented cell allows user to save trained model to particular directory"""

# Disabled on purpose: uncomment the lines below to persist the trained `model`.
# They change the working directory to the target folder and write the model there as
# 'model.joblib' using joblib's dump (imported at the top of the notebook).
# NOTE(review): uncommenting rebinds `path`, which the image-loading cell uses as its glob
# pattern - re-select the data set path before re-running that cell.
# NOTE(review): presumably Keras' own model.save(...) is the more conventional way to store a
# Keras model than joblib - confirm before relying on this file format.
# os.chdir('/content/drive/MyDrive/pathtosave')

# cnnmodel = model
# path = os.path.join(os.getcwd(), 'model.joblib')
# dump(cnnmodel, path)