## Introduction

**Using the "Hash-Trick": Calculate Fingerprints for Molecules and build a neural network for prediction of the binding affinity.**

**Finding a good topology for the net still poses a big challenge.**

**Since the dataset is too big to fit into memory, we need to feed it to the model as a data stream.
I thought it might be better to prepare TFRecord files to speed up the data supply.
But the resulting files are too large even for my 20GB of disk space on Kaggle. So streaming directly from the provided files may be the only option to train on all of the data.**

**One can try to use TPUs instead of CPU/GPU, but one may end up waiting in line. (The TPU code is left commented out below.)**

**IDEAS TO IMPROVE:**
- Search for pretrained nets online. (There are many publications on this subject.)
- Use Graph Neural Networks
- Use other fingerprints (e.g. 3D fingerprints)

In [None]:
import tensorflow as tf
import os

# rdkit helps generating characteristics of molecules:
!pip install rdkit
import rdkit
import rdkit.Chem as Chem
from rdkit.Chem import AllChem

# Report whether TensorFlow can see a GPU. The "found" message is only
# printed when a device name was actually returned.
gpu_name = tf.test.gpu_device_name()
if "GPU" not in gpu_name:
    print("GPU device not found")
else:
    print('Found GPU at: {}'.format(gpu_name))

## Switches:

In [None]:
# Data-source switch, read when the training dataset is built:
#   True  -> the CSV is first preprocessed into a TFRecord file and read from there.
#   False -> samples are streamed directly from the CSV.
use_tfRecords = False

## Select Hyperparameters

In [None]:
# Length (number of bits) of the fingerprint vector computed per molecule.
N_BITS_FINGERPRINT = 512
# Number of samples per training batch.
BATCH_SIZE = 64
N_TRAIN = BATCH_SIZE * 2000 # number of training samples to use; set to -1 for "all"
N_TEST = -1 # number of test samples to predict on; set to -1 for "all"
# Number of epochs passed to model.fit.
N_EPOCHS = 7

# Numbers of neurons per internal (hidden) layer:
N_Layer_1 = 100
N_Layer_2 = 50
N_Layer_3 = 10
N_Layer_4 = 5

## Functions for creating TFRecords File

In [None]:
# Input CSV files (train is passed as the labeled source, test as the unlabeled one).
train_path = '/kaggle/input/leash-BELKA/train.csv'
test_path = '/kaggle/input/leash-BELKA/test.csv'
# Output locations for the preprocessed TFRecord files (only written when
# the TFRecord code path is used).
preproc_train_path = "/kaggle/working/train_prepro.tfrecord"
preproc_test_path = "/kaggle/working/test_prepro.tfrecord"

In [None]:
def generate_ecfp(molecule, radius=2, bits=N_BITS_FINGERPRINT):
    """Return the Morgan fingerprint bit vector of `molecule` as a Python list.

    Args:
        molecule: RDKit molecule object to fingerprint.
        radius: Radius forwarded to the Morgan fingerprint routine.
        bits: Length of the bit vector. The default is bound to
            N_BITS_FINGERPRINT at function-definition time.

    Returns:
        A list with `bits` entries, one per fingerprint bit.
    """
    bit_vector = AllChem.GetMorganFingerprintAsBitVect(molecule, radius, nBits=bits)
    return list(bit_vector)

In [None]:
def create_preproc_file(path_in, path_out, n_entries = -1):
    """Compute the fingerprint of each molecule in a CSV and write a TFRecord file.

    The first line of `path_in` is treated as the header. A header with
    exactly 7 comma-separated columns marks the file as labeled. Per data row
    the following columns are used: 0 = id, 4 = SMILES string,
    5 = protein name and, for labeled files, 6 = binds label.

    Any existing file at `path_out` is removed before writing.

    Args:
        path_in: Path of the CSV file to read.
        path_out: Path of the TFRecord file to create.
        n_entries: Maximum number of data rows to convert. With the default
            of -1 the limit is never reached and all rows are converted.
    """

    # The following functions convert a value to a type compatible
    # with tf.train.Example.
    def _bytes_feature(value):
        """Returns a bytes_list from a string / byte."""
        if isinstance(value, type(tf.constant(0))):
            value = value.numpy() # BytesList won't unpack a string from an EagerTensor.
        return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

    def _int64_feature(value):
        """Returns an int64_list from a bool / enum / int / uint."""
        return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

    def serialize_example(id_, hash_, protein_name, binds=None):
        """Serialize one sample; the 'binds' feature is only added when given."""
        features = {
            'id': _int64_feature(id_),
            'hash': _bytes_feature(tf.io.serialize_tensor(tf.constant(hash_))),
            'protein_name': _bytes_feature(str.encode(protein_name)),
        }
        # `is not None` so that a label of 0 is still written.
        if binds is not None:
            features['binds'] = _int64_feature(binds)
        example = tf.train.Example(features=tf.train.Features(feature=features))
        return example.SerializeToString()

    # The actual creation of the file:
    try:
        os.remove(path_out)
    except FileNotFoundError:
        pass
    with open(path_in) as f_in, tf.io.TFRecordWriter(path_out) as writer:
        first_line = f_in.readline() # header line, only used to detect labels
        labeled = len(first_line.rstrip("\r\n").split(",")) == 7
        for i, line in enumerate(f_in):
            if i == n_entries:
                break
            if i % 10000 == 0:
                print(i, "examples processed")
            # Strip the line terminator before splitting. Otherwise the last
            # column keeps a trailing newline, which for unlabeled files is
            # the protein name and would break the later comparison against
            # the known protein names.
            features = line.rstrip("\r\n").split(",")
            molecule = Chem.MolFromSmiles(features[4])
            ecfp = generate_ecfp(molecule)
            binds = int(features[6]) if labeled else None
            writer.write(serialize_example(int(features[0]),
                                           ecfp,
                                           features[5],
                                           binds))
        print('done processing examples')

## Functions for reading the TFRecords

In [None]:
def _parse_function(example, label=True):
    """Decode one serialized `tf.train.Example` into its individual tensors.

    Args:
        example: Serialized example proto.
        label: Set to False when processing test data, which has no
            'binds' feature.

    Returns:
        (id, fingerprint, protein_name, binds) when `label` is true,
        otherwise (id, fingerprint, protein_name).
    """
    schema = {
        'id': tf.io.FixedLenFeature([], tf.int64),
        'hash': tf.io.FixedLenFeature([], tf.string),
        'protein_name': tf.io.FixedLenFeature([], tf.string),
    }
    if label:
        schema['binds'] = tf.io.FixedLenFeature([1], tf.int64)

    parsed = tf.io.parse_single_example(example, schema)

    # The fingerprint is stored as a serialized tensor; after decoding it,
    # reshape to restore the fixed fingerprint length.
    # NOTE(review): assumes the tensor was serialized as int32 - confirm
    # against the writer.
    fingerprint = tf.reshape(
        tf.io.parse_tensor(parsed['hash'], out_type=tf.int32),
        shape=(N_BITS_FINGERPRINT,),
    )

    if not label:
        return parsed['id'], fingerprint, parsed['protein_name']
    return parsed['id'], fingerprint, parsed['protein_name'], parsed['binds']

def _parse_hash_oneHot_binds(example_proto):
    """Map a labeled example to ({'hash', 'oneHot'}, binds).

    'oneHot' is the element-wise comparison of the protein name against
    ['BRD4', 'sEH', 'HSA'], reshaped to shape (3,).
    """
    _, fingerprint, protein, binds = _parse_function(example_proto)
    protein_mask = tf.reshape(
        tf.math.equal(protein, ['BRD4', 'sEH', 'HSA']),
        (3,),
    )
    return {'hash': fingerprint, 'oneHot': protein_mask}, binds

def _parse_hash_oneHot(example_proto):
    """Map an unlabeled example to a {'hash', 'oneHot'} feature dict.

    'oneHot' is the element-wise comparison of the protein name against
    ['BRD4', 'sEH', 'HSA'], reshaped to shape (3,).
    """
    _, fingerprint, protein = _parse_function(example_proto, label=False)
    protein_mask = tf.reshape(
        tf.math.equal(protein, ['BRD4', 'sEH', 'HSA']),
        (3,),
    )
    return {'hash': fingerprint, 'oneHot': protein_mask}

def _parse_hash_binds(example_proto):
    """Map a labeled example to (fingerprint, binds), dropping id and protein name."""
    parsed = _parse_function(example_proto)
    _, fingerprint, _, binds = parsed
    return fingerprint, binds

## Functions for reading csv

In [None]:
@tf.py_function(Tout=tf.int32)
def get_fp(smile):
    """Compute the fingerprint of a SMILES string as an int32 tensor.

    Wrapped with tf.py_function, so the body runs eagerly and may call
    `.numpy()` on its argument.

    Args:
        smile: Scalar string tensor holding the SMILES representation.

    Returns:
        int32 tensor of shape (N_BITS_FINGERPRINT,).
    """
    bits = generate_ecfp(Chem.MolFromSmiles(smile.numpy()))
    return tf.constant(bits, dtype=tf.int32, shape=(N_BITS_FINGERPRINT,))

def _parse_csv_hash_oneHot_binds(x,y):
    """Map one labeled CSV row to ({'hash', 'oneHot'}, label).

    Args:
        x: Dict of column tensors; 'protein_name' and 'molecule_smiles' are read.
        y: Label tensor for the row.

    Returns:
        The feature dict and the label reshaped to (1,) and cast to int64.
    """
    # Reshape after get_fp to pin the fingerprint to its fixed length.
    fingerprint = tf.reshape(
        get_fp(x['molecule_smiles']),
        shape=(N_BITS_FINGERPRINT,),
    )
    protein_mask = tf.reshape(
        tf.math.equal(x['protein_name'], ['BRD4', 'sEH', 'HSA']),
        (3,),
    )
    label = tf.cast(tf.reshape(y, shape=(1,)), tf.int64)
    return {'hash': fingerprint, 'oneHot': protein_mask}, label

def _parse_csv_hash_oneHot(x):
    """Map one unlabeled CSV row to a {'hash', 'oneHot'} feature dict.

    Args:
        x: Dict of column tensors; 'protein_name' and 'molecule_smiles' are read.

    Returns:
        Dict with the fingerprint under 'hash' and, under 'oneHot', the
        comparison of the protein name against ['BRD4', 'sEH', 'HSA'].
    """
    oneHot = tf.math.equal(x['protein_name'], ['BRD4', 'sEH', 'HSA'])
    oneHot = tf.reshape(oneHot, (3,))
    hash_ = get_fp(x['molecule_smiles'])
    # Same reshape as in the labeled variant: the output of a py_function
    # carries no static shape, so pin it to the fingerprint length here.
    hash_ = tf.reshape(hash_, shape=(N_BITS_FINGERPRINT,))
    return {'hash': hash_, 'oneHot': oneHot}

## Get Dataset from either csv or TFrecords:

In [None]:
def get_ds_tfRecord(csv_path, tfRecords_path, batch_size, n_entries=-1, labeled=True):
    """Create a TFRecord file from a CSV and return a batched dataset over it.

    Args:
        csv_path: CSV file to preprocess.
        tfRecords_path: Destination of the TFRecord file (re-created on each call).
        batch_size: Number of samples per batch.
        n_entries: Number of CSV rows to convert; -1 takes all rows.
        labeled: Whether records are parsed with the 'binds' label.
            The file writer detects labels on its own from the CSV header.

    Returns:
        The parsed dataset, batched and then shuffled.
    """
    create_preproc_file(csv_path, tfRecords_path, n_entries=n_entries)

    parser = _parse_hash_oneHot_binds if labeled else _parse_hash_oneHot

    # NOTE(review): shuffle runs after batch, so whole batches are shuffled
    # rather than individual samples - confirm this is intended.
    return (
        tf.data.TFRecordDataset(tfRecords_path)
        .map(parser)
        .batch(batch_size)
        .shuffle(2*batch_size)
    )

def get_ds_csv(csv_path, batch_size, n_samples, labeled=True):
    """Stream samples straight from a CSV file as a batched dataset.

    Args:
        csv_path: CSV file to read.
        batch_size: Number of samples per output batch.
        n_samples: Number of rows to use; -1 means "take all".
        labeled: If true, the 'binds' column is split off as the label.

    Returns:
        Dataset of feature dicts (with labels when `labeled`), batched.
    """
    reader_options = dict(
        batch_size=4,   # arbitrary, undone by unbatch() below
        shuffle=False,
        num_epochs=1,   # to prevent repeat()
    )
    if labeled:
        reader_options['label_name'] = 'binds'

    rows = tf.data.experimental.make_csv_dataset(csv_path, **reader_options).unbatch()
    if n_samples != -1:
        rows = rows.take(n_samples)

    parser = _parse_csv_hash_oneHot_binds if labeled else _parse_csv_hash_oneHot
    return rows.map(parser).batch(batch_size)

# Build the training dataset `ds` from the source selected by `use_tfRecords`.
# Both branches are limited to N_TRAIN samples and batched with BATCH_SIZE.
if use_tfRecords:
    print('Creating TFRecords file...')
    ds = get_ds_tfRecord(train_path, preproc_train_path, batch_size=BATCH_SIZE, n_entries=N_TRAIN)
else:
    print('Use CSV')
    with tf.device('/device:GPU:0'):
        ds = get_ds_csv(train_path, batch_size=BATCH_SIZE, n_samples=N_TRAIN)

# Uncomment to inspect the first batch:
#for elem in ds.take(1):
#    print(elem)
#    print(" ")

## First naive model

In [None]:
# Determine the share of positive labels in the training data.
# (Used afterwards to weight the positive class during training.)
# NOTE: each reduce below is a separate full pass over the dataset.
samples = ds.unbatch()
n_samples = samples.reduce(0, lambda count, _: count+1).numpy()
n_binds = samples.reduce(0, lambda total, sample: total+int(sample[1])).numpy()[0]
print(f' {n_binds} of {n_samples} are positive')
binds_rate = n_binds / n_samples

Build and train the model:

In [None]:
# if TPU is available:
#tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
#tf.tpu.experimental.initialize_tpu_system(tpu)
#tpu_strategy = tf.distribute.TPUStrategy(tpu)
#with tpu_strategy.scope():

mirrored_strategy = tf.distribute.MirroredStrategy()

# Build and compile the model inside the strategy scope so its variables
# are created under the distribution strategy.
with mirrored_strategy.scope():
    # Two named inputs matching the keys of the dataset's feature dict.
    hash_inputs = tf.keras.Input((N_BITS_FINGERPRINT,), name='hash')
    protein_inputs = tf.keras.Input((3,), name='oneHot')
    inputs = tf.keras.layers.concatenate([hash_inputs, protein_inputs])
    x = inputs
    x = tf.keras.layers.Dense(N_Layer_1, activation="sigmoid")(x)
    x = tf.keras.layers.Dropout(0.7)(x)
    x = tf.keras.layers.Dense(N_Layer_2, activation="sigmoid")(x)
    x = tf.keras.layers.Dropout(0.7)(x)
    x = tf.keras.layers.Dense(N_Layer_3, activation="sigmoid")(x)
    x = tf.keras.layers.Dropout(0.7)(x)
    x = tf.keras.layers.Dense(N_Layer_4, activation="sigmoid")(x)
    # No activation on the last layer: the model outputs raw logits.
    # The submission cell converts them to probabilities with expit.
    outputs = tf.keras.layers.Dense(1)(x)

    model = tf.keras.Model(inputs=[hash_inputs, protein_inputs], outputs=outputs)
    #print(model.summary())
    # The outputs are logits, so the loss must apply the sigmoid itself.
    loss_fn = tf.keras.losses.BinaryCrossentropy(from_logits=True)
    # NOTE(review): 'accuracy' is evaluated on the raw logits with Keras'
    # default decision threshold, so the reported value may not correspond
    # to a probability cut-off of 0.5 - verify before relying on it.
    model.compile(optimizer='adam',
                  loss=loss_fn,
                  metrics=['accuracy'])

# Up-weight the positive class by the inverse of its observed frequency.
model.fit(ds, epochs=N_EPOCHS, class_weight={0: 1, 1: 1/binds_rate})

If you want to continue or restart training:

In [None]:
#model2 = tf.keras.models.clone_model(model)
#model2.fit(hash_ds, epochs=10, class_weight={0: 1, 1: 1/binds_rate})

## Predict on Test Set

In [None]:
# Stream the unlabeled test CSV (N_TEST rows, -1 = all) and predict with the
# trained model; `y` holds one model output per test row.
test_ds = get_ds_csv(test_path, batch_size=512, n_samples=N_TEST, labeled=False)
y = model.predict(test_ds)

## Write Submission File

In [None]:
import pandas as pd
from scipy.special import expit, logit

# Take the single output column of the predictions.
yy = y[:,0]
# Ids are consecutive integers starting at 295246830, one per prediction,
# and the model outputs are squashed into (0, 1) with the logistic function.
d = pd.DataFrame({
    'id': range(295246830, 295246830 + len(yy)),
    'binds': expit(yy),
})
d.to_csv('submission.csv', index=False, header=True)
print(d['binds'].describe())