In [3]:
import tensorflow as tf
import pandas as pd
import os

## Switches:

In [4]:
create_files = True
submission = True
if submission:
    create_files = True

## Select Hyperparameters

In [5]:
N_BITS_FINGERPRINT = 512
BATCH_SIZE = 128
N_TRAIN = BATCH_SIZE * 100 # set to -1 for "all"
N_TEST = 100000 # set to -1 for "all"

if submission:
    N_TRAIN = 1000000
    N_TEST = -1

## Create TFRecords File

In [6]:
train_path = '/kaggle/input/leash-BELKA/train.csv'
test_path = '/kaggle/input/leash-BELKA/test.csv'
prerproc_train_path = "/kaggle/working/train_prepro.tfrecord"
prerproc_test_path = "/kaggle/working/test_prepro.tfrecord"

Preprocessed file does not need to be created over and over again:

In [7]:
def create_preproc_file(path_in, path_out, n_entries = -1):
    
    # rdkit helps generating characteristics of molecules:
    !pip install rdkit
    import rdkit
    import rdkit.Chem as Chem
    from rdkit.Chem import AllChem

    def generate_ecfp(molecule, radius=2, bits=N_BITS_FINGERPRINT):
        if molecule is None:
            return None
        return list(AllChem.GetMorganFingerprintAsBitVect(molecule, radius, nBits=bits))

    # The following functions can be used to convert a value to a type compatible
    # with tf.train.Example.
    def _bytes_feature(value):
      """Returns a bytes_list from a string / byte."""
      if isinstance(value, type(tf.constant(0))):
        value = value.numpy() # BytesList won't unpack a string from an EagerTensor.
      return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
    def _float_feature(value):
        """Returns a float_list from a float / double."""
        return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))
    def _int64_feature(value):
        """Returns an int64_list from a bool / enum / int / uint."""
        return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))
    def serialize_example(id_,hash_,protein_name,binds=None):
        features = {
            'id': _int64_feature(id_),
            'hash': _bytes_feature(tf.io.serialize_tensor(tf.constant(hash_))),
            'protein_name': _bytes_feature(str.encode(protein_name)),
          }
        if binds!=None:
            features['binds'] = _int64_feature(binds)
        example = tf.train.Example(features=tf.train.Features(feature=features))
        return example.SerializeToString()

    # The actual creation of the file:
    try:
        os.remove(path_out)
    except FileNotFoundError:
        pass
    with open(path_in) as f_in, tf.io.TFRecordWriter(path_out) as writer:
        first_line = f_in.readline() # skip headers
        labeled = (len(first_line.split(",")))==7
        for i, line in enumerate(f_in):
            if i==n_entries:
                break
            if i % 10000 == 0:
                print(i)
            features = line.split(",")
            molecule = Chem.MolFromSmiles(features[4])
            ecfp = generate_ecfp(molecule)
            if labeled:
                example = serialize_example(int(features[0]),
                                        ecfp,
                                        features[5],
                                        int(features[6]))
                writer.write(example)
            else:
                example = serialize_example(int(features[0]),
                                        ecfp,
                                        features[5])
                writer.write(example)
        print('done')

if create_files:
    create_preproc_file(train_path,
                        prerproc_train_path,
                        n_entries = N_TRAIN)

  pid, fd = os.forkpty()


Collecting rdkit
  Downloading rdkit-2023.9.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.9 kB)
Downloading rdkit-2023.9.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.9/34.9 MB[0m [31m28.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: rdkit
Successfully installed rdkit-2023.9.6
0
done


## Functions for reading the TFRecords

In [9]:
# Create a description of the features.
def _parse_function(example, label=True):
    # Parse the input `tf.train.Example` proto using the dictionary above.
    feature_description = {
      'id':           tf.io.FixedLenFeature([], tf.int64),
      'hash':         tf.io.FixedLenFeature([], tf.string),
      'protein_name': tf.io.FixedLenFeature([], tf.string),
    }
    if label:
        feature_description['binds'] = tf.io.FixedLenFeature([1], tf.int64)
    
    features = tf.io.parse_single_example(example, feature_description)
    id_     = features['id']
    hash_   = tf.io.parse_tensor(features['hash'], out_type=tf.int32)
    hash_   = tf.reshape(hash_,[1,N_BITS_FINGERPRINT])  # needed to bring back the shape to former string-byte
    protein_name = features['protein_name']
    if label:
        return id_, hash_, protein_name, features['binds']
    else:
        return id_, hash_, protein_name

def _parse_hash_oneHot_binds(example_proto):
    id_, hash_, protein_name, binds = _parse_function(example_proto)
    oneHot = tf.math.equal(protein_name, ['BRD4', 'sEH', 'HSA'])
    oneHot   = tf.reshape(oneHot,[1,3])
    return {'hash': hash_, 'oneHot': oneHot}, binds
def _parse_hash_oneHot(example_proto):
    id_, hash_, protein_name, = _parse_function(example_proto, label=False)
    oneHot = tf.math.equal(protein_name, ['BRD4', 'sEH', 'HSA'])
    oneHot   = tf.reshape(oneHot,[1,3])
    return {'hash': hash_, 'oneHot': oneHot}
def _parse_hash_binds(example_proto):
    id_, hash_, protein_name, binds = _parse_function(example_proto)
    return hash_, binds
def _parse_hash(example_proto):
    id_, hash_, protein_name = _parse_function(example_proto, label=False)
    return hash_

raw_dataset = tf.data.TFRecordDataset(prerproc_train_path)

for example in raw_dataset.take(1):
    for p in _parse_hash_oneHot_binds(example):
        print(p)

{'hash': <tf.Tensor: shape=(1, 51), dtype=int32, numpy=
array([[1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 0, 0, 1, 1, 1, 0]], dtype=int32)>, 'oneHot': <tf.Tensor: shape=(1, 3), dtype=bool, numpy=array([[ True, False, False]])>}
tf.Tensor([0], shape=(1,), dtype=int64)


## First stupid model

Compute characteristics of dataset:

In [10]:
hash_ds = raw_dataset.map(_parse_hash_oneHot_binds)

In [11]:
hash_ds = raw_dataset.map(_parse_hash_oneHot_binds)

# Calculate amount of positive binds
n_samples = hash_ds.reduce(0, lambda x,_: x+1).numpy()
n_binds = hash_ds.reduce(0, lambda x, data:x+int(data[1])).numpy()[0]
print(f' {n_binds} of {n_samples} are positive')
binds_rate = n_binds / n_samples
for e in hash_ds.take(1):
    print(e)

 5 of 3000 are positive
({'hash': <tf.Tensor: shape=(1, 51), dtype=int32, numpy=
array([[1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 0, 0, 1, 1, 1, 0]], dtype=int32)>, 'oneHot': <tf.Tensor: shape=(1, 3), dtype=bool, numpy=array([[ True, False, False]])>}, <tf.Tensor: shape=(1,), dtype=int64, numpy=array([0])>)


Prepare Dataset for training:

In [12]:
hash_ds = hash_ds.batch(batch_size=BATCH_SIZE)
hash_ds = hash_ds.shuffle(4*BATCH_SIZE)

In [13]:
# if TPU is available:
#tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
#tf.tpu.experimental.initialize_tpu_system(tpu)
#tpu_strategy = tf.distribute.TPUStrategy(tpu)
#with tpu_strategy.scope():
    
hash_inputs = tf.keras.Input((1,N_BITS_FINGERPRINT), name='hash')
protein_inputs = tf.keras.Input((1,3), name='oneHot')
inputs = tf.keras.layers.concatenate([hash_inputs, protein_inputs])
x = inputs
x = tf.keras.layers.Dense(500, activation="relu")(x)
x = tf.keras.layers.Dropout(0.7)(x)
x = tf.keras.layers.Dense(100, activation="relu")(x)
x = tf.keras.layers.Dropout(0.7)(x)
x = tf.keras.layers.Dense(10, activation="sigmoid")(x)
x = tf.keras.layers.Dropout(0.7)(x)
x = tf.keras.layers.Dense(4, activation="sigmoid")(x)
outputs = tf.keras.layers.Dense(1)(x)

model = tf.keras.Model(inputs=[hash_inputs, protein_inputs], outputs=outputs)
loss_fn = tf.keras.losses.BinaryCrossentropy(from_logits=False)
model.compile(optimizer='adam',
              loss=loss_fn,
              metrics=['accuracy'])
model.fit(hash_ds, epochs=15, class_weight={0: 1, 1: 1/binds_rate})

Epoch 1/10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 129ms/step - accuracy: 0.9971 - loss: 18.6395
Epoch 2/10


  self.gen.throw(typ, value, traceback)


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 119ms/step - accuracy: 0.9983 - loss: 15.3130
Epoch 3/10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 117ms/step - accuracy: 0.9989 - loss: 3.5626
Epoch 4/10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 118ms/step - accuracy: 0.9986 - loss: 12.2286
Epoch 5/10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 141ms/step - accuracy: 0.9973 - loss: 21.5450
Epoch 6/10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 131ms/step - accuracy: 0.9991 - loss: 2.1820
Epoch 7/10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 118ms/step - accuracy: 0.9977 - loss: 12.0142
Epoch 8/10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 121ms/step - accuracy: 0.9987 - loss: 2.5375
Epoch 9/10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 118ms/step - accuracy: 0.9991 - loss: 4.7507
Epoch 10/10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x7c8fec108d00>

If you want to continue/retake training:

In [None]:
#model2 = tf.keras.models.clone_model(model)
#model2.fit(hash_ds, epochs=10, class_weight={0: 1, 1: 1/binds_rate})

## Predict

In [16]:
if create_files:
    create_preproc_file(test_path,
                        "/kaggle/working/test_prepro.tfrecord",
                        n_entries=N_TEST)
test_ds = tf.data.TFRecordDataset("/kaggle/working/test_prepro.tfrecord")
test_ds = test_ds.map(_parse_hash_oneHot)
test_ds = test_ds.batch(100)
#for e in test_ds.take(1):
#    print(e)
y = model.predict(test_ds)

0
done
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step


In [17]:
yy = [a[0][0] for a in y]
d = pd.DataFrame({'id': range(len(yy)), 'binds': yy})
d['id'] = d['id'] + 295246830
from scipy.special import expit, logit
d['binds'] = expit(d['binds'])
d.to_csv('submission.csv', index=False, header=True)
d['binds'].describe()

count    1000.000000
mean        0.494207
std         0.000083
min         0.494007
25%         0.494145
50%         0.494208
75%         0.494264
max         0.494431
Name: binds, dtype: float64