# Introduction

**Using the "Hash-Trick": Calculate Fingerprints for Molecules and build a neural network for prediction of the binding affinity.**

Link to the competition and data: https://www.kaggle.com/competitions/leash-BELKA/overview

**Finding a good topology for the net still poses a big challenge.**
- Here, we build three independent models for each protein, which yields a strong improvement.
- In another notebook, we build models only depending on the building blocks of the molecules. This seems to be a good choice, if the test data is also build from these building blocks. But here, also other building blocks appear in the test data. However, the imporvement is huge: Factor of 2!

**One can try to use TPUs instead of CPU/GPU. But one may end up waiting in line. (The TPU-code may be commented.)**

**IDEAS TO IMPROVE:**
- Search for pretrained nets online. (There are many publications on this subject.)
- Use Graph Neural Networks
- Use other finger prints
- Incorporate Validation into the training, and do some Gridsearches.


# Switches:

In [54]:
# If submitting this file, different parameters will be used
submit = True

# Imports

In [55]:
import tensorflow as tf
import os
import numpy as np
import sys
import pandas as pd
tf.keras.utils.set_random_seed(42)

# rdkit helps generating characteristics of molecules:
if submit:
    !pip install rdkit  --no-dependencies
else:
    # only install if not installed anymore
    if not os.path.isdir('/kaggle/working/mysitepackages'):
        # it was important to use "--no-dependencies" Otherwise submissions would not work anymore!!
        !pip install rdkit  --no-dependencies --target=/kaggle/working/mysitepackages
    sys.path.append('/kaggle/working/mysitepackages')
import rdkit
import rdkit.Chem as Chem
from rdkit.Chem import AllChem

gpu_name = tf.test.gpu_device_name()
if "GPU" not in gpu_name:
    print("GPU device not found")
else:
    print('Found GPU at: {}'.format(gpu_name))

GPU device not found


# Select Hyperparameters

In [56]:
# Parameters for testing (small numbers):

N_BITS_FINGERPRINT = 1024 # for ECFP-Fingerprint
N_RADIUS = 4 # for ECFP-Fingerprint
BATCH_SIZE = 25
N_TRAIN = 5000#00#-1 # set to -1 for "all"
N_TEST = 100#-1 # set to -1 for "all"
N_EPOCHS = 1
WITH_DROPOUT = False
DROPOUT_RATE = 0.05
ACTIVATION = 'relu'
#ACTIVATION = 'linear'
#ACTIVATION = tf.keras.layers.LeakyReLU(negative_slope=0.01)
HIDDEN_NEURONS = [200, 5] #Numbers of neurons per internal layer

# Parameters for submissions (larger numbers):
if submit:
    BATCH_SIZE = 64*3
    N_BITS_FINGERPRINT = 2048 # for ECFP-Fingerprint
    N_RADIUS = 4 # for ECFP-Fingerprint
    N_TRAIN = -1 # -1 means "all"
    N_TEST = -1 # -1 means "all"
    N_EPOCHS = 5
    HIDDEN_NEURONS = [251, 15]
    #ACTIVATION = 'relu'
    ACTIVATION = tf.keras.layers.LeakyReLU(negative_slope=0.01)
    WITH_DROPOUT = False
    DROPOUT_RATE = 0.005 # only needed if WITH_DROPOUT
    
STEPS_PER_EPOCH = N_TRAIN//BATCH_SIZE
if N_TRAIN == -1:
    STEPS_PER_EPOCH = 98415610 // BATCH_SIZE # THE NUMBER 98415610 is taken from the data description page.


In [57]:
train_path = '/kaggle/input/leash-BELKA/train.csv'
positives_train_path = '/kaggle/working/positive_train.csv'
test_path = '/kaggle/input/leash-BELKA/test.csv'

# Getting Datasets with Fingerprint information

In [58]:
ds = tf.data.experimental.make_csv_dataset(
                train_path,
                batch_size=1,
                shuffle=False,
                num_epochs=1, # to prevent repeat()
                label_name='binds',
                select_columns=['protein_name', 'molecule_smiles', 'binds'])

### Creating File with positive samples
That's important to provide well-distributed input data to the training

In [59]:
def create_file_of_positives(original_path, positives_path, n_samples):
    """ n_samples=-1 means: take all"""
    chunksize=10000
    for i, chunk in enumerate(pd.read_csv(train_path, chunksize=chunksize)):
        positives_lines = chunk[chunk.binds==1]
        
        # writing the first line will overwrite old files:
        mode = 'w' if i==0 else 'a' 
        add_header = True if i==0 else False
        
        positives_lines.to_csv(positives_path, mode=mode, index=False, header=add_header)
        if i%1000 == 0:
            print(i+1, 'of', 3*98415610/chunksize, "chunks searched for positive samples...")
        if i>=1 and i*chunksize >= n_samples and (i-1)*chunksize <= n_samples:
            # condidion is complicated to treat the special case n_samples=-1..
            break
    print("Done extracting positive samples.")


create_file_of_positives(train_path, positives_train_path, n_samples=N_TRAIN)
ds_positives = tf.data.experimental.make_csv_dataset(
            positives_train_path,
            batch_size=1,
            shuffle=False,
            num_epochs=1, # to prevent repeat()
            label_name='binds',
            select_columns=['protein_name', 'molecule_smiles', 'binds'])

    

1 of 29524.683 chunks searched for positive samples...
Done extracting positive samples.


### Merge Datasets

In [60]:
def merged_ds(ds1, ds2):
    """ takes unbatched and repeated datasets"""
    ds = tf.data.Dataset.zip((ds1, ds2)).flat_map(
                lambda x,y : tf.data.Dataset.from_tensors(x).concatenate(tf.data.Dataset.from_tensors(y)))
    return ds

ds_merged = merged_ds(ds.repeat(), ds_positives.repeat())
ds_merged = ds_merged.batch(BATCH_SIZE)
ds_merged = ds_merged.prefetch(tf.data.AUTOTUNE)

# Callbacks

For stopping after some hours:

In [61]:
class TimeStopping(tf.keras.callbacks.Callback):
    def __init__(self, max_hours=3):
        super().__init__()
        self.max_seconds = max_hours*60*60
        self.start_time = None

    def on_train_begin(self, logs=None):
        import time
        self.start_time = time.time()

    def on_batch_end(self, batch, logs=None):
        import time
        if time.time() - self.start_time >= self.max_seconds:
            self.model.stop_training = True
            print("\nTime over! Stopping the training..")

In case that Nan-Values appear:

# Split-Model Topology

In [62]:
class FingerprintLayer(tf.keras.layers.Layer):
    def __init__(self, radius=N_RADIUS, n_bits=N_BITS_FINGERPRINT):
        super().__init__()
        self.radius = radius
        self.n_bits = n_bits
        self.fp_generator = Chem.rdFingerprintGenerator.GetMorganGenerator(radius=radius, fpSize=n_bits)
        
        def get_fp(smile):
            @tf.py_function(Tout=tf.int32)
            def get_fp_py_function(smile):
                mol = Chem.MolFromSmiles(smile.numpy()[0])
                fp_list = self.fp_generator.GetFingerprint(mol).ToList()
                return tf.convert_to_tensor(fp_list, dtype=tf.int32)
            res = get_fp_py_function(smile)
            res.set_shape((self.n_bits,))
            return res
        self.get_fp = get_fp

    def build(self, input_shape):
        # No trainable weights to define
        super().build(input_shape)

    def call(self, smiles):
        # Apply the fingerprint calculation to each SMILES string
        return tf.map_fn(self.get_fp, smiles, dtype=tf.int32)

    def compute_output_shape(self, input_shape):
        return input_shape[0], self.n_bits

    def get_config(self):
        config = super().get_config()
        config.update({
            'radius': self.radius,
            'n_bits': self.n_bits,
        })
        return config

In [63]:
class OneHot(tf.keras.layers.Layer):
    def __int__(self):
        super().__init__()
    def build(self, input_shape):
        super().build(input_shape)
    def call(self, inputs):
        return tf.math.equal(inputs, tf.constant(['BRD4', 'sEH', 'HSA'], shape=(1,3)))


        

I want to write a layer that controls the routing into the different submodels. It will  reduce unnecessary calculations:

In [64]:
#class RoutingLayer(tf.keras.layers.Layer):
#    def __init__(self, model_BRD4, model_sEH, model_HSA):
#        super().__init__()
#        self.model_BRD4 = model_BRD4
#        self.model_sEH = model_sEH
#        self.model_HSA = model_HSA

#    def call(self, inputs):
#        protein_idx, fp = inputs
#        
#        def route_fn(x):
#            protein_idx, fp = x
#            protein_idx = tf.cast(protein_idx, tf.int32)
#            
#            return tf.switch_case(
#                branch_index=protein_idx,
#                branch_fns={
#                    0: lambda: self.model_BRD4(fp),
#                    1: lambda: self.model_sEH(fp),
#                    2: lambda: self.model_HSA(fp)
#                }
#            )
        
#        routed_outputs = tf.vectorized_map(route_fn, (protein_idx, fp))
#        return routed_outputs

In [68]:
def splitted_model():
    ### creates single models for each target protein and concatenates them together
    
    def model_for_one_protein():
        fp_input = tf.keras.Input((N_BITS_FINGERPRINT,), name='fp')
        x = fp_input
        for N in HIDDEN_NEURONS:
            if WITH_DROPOUT:
                x = tf.keras.layers.Dropout(DROPOUT_RATE)(x)
            x = tf.keras.layers.Dense(N, activation=ACTIVATION)(x)
        output = tf.keras.layers.Dense(1)(x)
        return tf.keras.Model(fp_input, output)
    
    # Inputs:
    smile_input = tf.keras.Input(shape=(1,), name='molecule_smiles', dtype=tf.string)
    protein_input = tf.keras.Input((1,), name='protein_name', dtype=tf.string)
    
    # preprocessing:
    fp = FingerprintLayer()(smile_input)
    oneHot = OneHot()(protein_input)
    
    # Submodels:
    model_BRD4 = model_for_one_protein()(fp)
    model_HSA  = model_for_one_protein()(fp)
    model_sEH  = model_for_one_protein()(fp)
    
    # Routing into appropriate submodels:
    single_models = tf.keras.layers.Concatenate(axis=1)([model_BRD4, model_HSA, model_sEH])
    output = tf.keras.layers.Dot(axes=[1,1])([oneHot, single_models])
    
    model = tf.keras.Model(inputs=[smile_input, protein_input], outputs=output)
    
    loss_fn = tf.keras.losses.BinaryCrossentropy(from_logits=True)
    optimizer = tf.keras.optimizers.Adam()
    model.compile(optimizer="adam",
                  loss=loss_fn,
                  metrics=['accuracy']
                 #,run_eagerly=True
                 )
    return model

# Training

In [69]:
# if TPU is available:
#tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
#tf.tpu.experimental.initialize_tpu_system(tpu)
#tpu_strategy = tf.distribute.TPUStrategy(tpu)
#with tpu_strategy.scope():

tf.keras.utils.set_random_seed(42)

mirrored_strategy = tf.distribute.MirroredStrategy()
with mirrored_strategy.scope():
    model = splitted_model()
    model.fit(ds_merged,
              epochs=N_EPOCHS,
              steps_per_epoch=STEPS_PER_EPOCH
              ,callbacks=[TimeStopping(max_hours=4)]
             )

[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 44ms/step - accuracy: 0.9251 - loss: 0.2302


# Predict on Test Set

In [70]:
print("Start predicting....")
test_ds = tf.data.experimental.make_csv_dataset(
            test_path,
            batch_size=1,
            shuffle=False,
            num_epochs=1,
            select_columns=['id', 'protein_name', 'molecule_smiles'])
test_ds = test_ds.take(N_TEST).batch(256)
test_ds = test_ds.prefetch(tf.data.AUTOTUNE)
y = model.predict(test_ds, verbose=0)
print('Done predicting')

Start predicting....
Done predicting


  self.gen.throw(typ, value, traceback)


## Write Submission File

In [71]:
import pandas as pd

probs = tf.math.sigmoid(y[:,0]).numpy()
ids = [id.numpy()[0] for id in test_ds.unbatch().map(lambda x: x['id'])]
d = pd.DataFrame({'id': ids,
                  'binds': probs})
print("Start writing...")
d.to_csv('submission.csv', index=False, header=True)

Start writing...


## Some Statistics

In [72]:
print(d['binds'].describe())

count    100.000000
mean       0.276106
std        0.274597
min        0.000126
25%        0.008788
50%        0.184567
75%        0.489963
max        0.950906
Name: binds, dtype: float64
