In [28]:
from chembl_webresource_client.new_client import new_client
import tensorflow as tf
import pandas as pd
import numpy as np


2023-05-25 23:23:15.375307: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Handle on the ChEMBL "molecule" resource exposed by the web-resource client.
molecules = new_client.molecule

# Keep molecules whose `num_ro5_violations` property is 0 (presumably Lipinski
# rule-of-five violations) and request only the id and structure fields.
mols = (
    molecules
    .filter(molecule_properties__num_ro5_violations=0)
    .only(['molecule_chembl_id', 'molecule_structures'])
)

In [23]:
# Working-set limits: the longest SMILES string kept, and how many records to
# take from the query.
max_length = 50
array_size = 100_000

# Canonical SMILES string of each of the first `array_size` records.
smiles = [
    record['molecule_structures']['canonical_smiles']
    for record in mols[:array_size]
]

# Discard strings longer than `max_length` characters.
filtered_smiles = [text for text in smiles if len(text) <= max_length]

# Sizes at each stage: whole query, sampled records, records kept.
print(len(mols), len(smiles), len(filtered_smiles))

1631753 100000 74395


In [None]:
def one_hot_encoding(smiles, dtype=np.float64):
    """One-hot encode a collection of strings into a padded 3-D tensor.

    Each string becomes a ``(max_length, encoding_dim)`` matrix in which row
    ``t`` holds a single 1 in the column of the character at position ``t``.
    Positions past the end of a string are marked with a dedicated padding
    class stored in the last column (``encoding_dim - 1``), so every row of
    the result is a valid one-hot vector.

    Args:
        smiles: Sequence of strings to encode (it is measured with ``len`` and
            iterated more than once, so a one-shot iterator will not work).
        dtype: NumPy dtype of the array that is filled and then converted.
            Defaults to ``np.float64``, which is what ``np.zeros`` produces
            when no dtype is given; ``np.float32`` halves the memory used.

    Returns:
        A tuple ``(encoded, unique_chars)``. ``encoded`` is a tensor of shape
        ``(len(smiles), max_length, len(unique_chars) + 1)`` where
        ``max_length`` is the length of the longest string (0 when ``smiles``
        is empty). ``unique_chars`` is the sorted list of distinct characters;
        a character's index in this list is its column in ``encoded``.
    """
    num_samples = len(smiles)

    # Sorted so the character -> column mapping is the same on every run
    # (iteration order of a set of strings is not stable across interpreters).
    unique_chars = sorted(set(''.join(smiles)))
    char_to_col = {char: col for col, char in enumerate(unique_chars)}

    # One extra column, the last one, is reserved for padding.
    encoding_dim = len(unique_chars) + 1
    pad_col = encoding_dim - 1

    # `default=0` keeps an empty input from raising inside max().
    max_length = max((len(smile) for smile in smiles), default=0)

    encoded = np.zeros((num_samples, max_length, encoding_dim), dtype=dtype)

    for counter, smile in enumerate(smiles):
        for row, char in enumerate(smile):
            encoded[counter, row, char_to_col[char]] = 1
        # Pad every position after the end of the string. The sequence axis is
        # indexed by position here, never by a character column.
        encoded[counter, len(smile):, pad_col] = 1

    return tf.convert_to_tensor(encoded), unique_chars

# Encode the length-filtered SMILES strings; `unique_chars` is the character
# list returned alongside the encoded tensor.
# NOTE(review): the name `input` shadows the Python builtin for the rest of the
# notebook; later cells read it, so it is left unchanged here.
input, unique_chars = one_hot_encoding(filtered_smiles)

In [30]:
# Display the shape of the encoded tensor returned by one_hot_encoding.
input.shape

TensorShape([74395, 50, 38])

In [36]:
# Fraction of the samples that goes to the training split; the remainder is
# the test split.
train_percent = 0.9
train_cutoff = int(train_percent * input.shape[0])

# Contiguous split along the first axis, in the order the samples were
# encoded (no shuffling happens here).
train = input[:train_cutoff]
test = input[train_cutoff:]

In [37]:
# Display the shapes of the two splits.
train.shape, test.shape

(TensorShape([66955, 50, 38]), TensorShape([7440, 50, 38]))

In [42]:
class AutoEncoder(tf.keras.Model):
  """Autoencoder for one-hot encoded sequences.

  Two stacked bidirectional LSTMs reduce the input sequence to a single vector,
  a stack of dense layers narrows it to a 16-unit bottleneck, and a second
  dense stack widens it again up to 4096 units before projecting to
  ``seq_length * encoding_dim`` values. Those are reshaped to
  ``(seq_length, encoding_dim)`` and passed through a softmax, so each position
  of the output is a distribution over the encoding classes.

  Every dense layer uses LeakyReLU(alpha=0.5) and the same L1/L2 kernel and L2
  bias regularisation. Dropout (rate 0.6) is interleaved between dense layers
  and is only active when the model is called with ``training=True``.

  Args:
    seq_length: Number of positions in each reconstructed sequence.
      Defaults to 50.
    encoding_dim: Number of classes per position. Defaults to 38.
  """

  def __init__(self, seq_length=50, encoding_dim=38):
    super().__init__()

    self.leakyrelualpha = 0.5
    self.dropoutrate = 0.6

    # LSTM STAGE
    self.lstm1 = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, return_sequences=True))
    self.lstm2 = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(512))
    self.dense0 = self._regularized_dense(128)

    # DIM. REDUCTION STAGE
    self.dense1 = self._regularized_dense(32)
    self.dense2 = self._regularized_dense(16)
    self.dense3 = self._regularized_dense(16)

    # SAMPLING STAGE
    # NOT PRESENT IN THIS MODEL

    # DECODING STAGE
    self.dense4 = self._regularized_dense(16)
    self.dense5 = self._regularized_dense(16)
    self.dense6 = self._regularized_dense(32)
    self.dense7 = self._regularized_dense(64)
    self.dense8 = self._regularized_dense(128)
    self.dense9 = self._regularized_dense(256)
    self.dense10 = self._regularized_dense(512)
    self.dense11 = self._regularized_dense(1024)
    self.dense12 = self._regularized_dense(2048)
    self.dense13 = self._regularized_dense(4096)

    # DESIRED DIMENSIONS
    # One value per (position, class) pair, reshaped back into a sequence.
    self.dense14 = self._regularized_dense(seq_length * encoding_dim)
    self.reshape1 = tf.keras.layers.Reshape(target_shape=(seq_length, encoding_dim))
    self.softmax_output = tf.keras.layers.Softmax()

    # DROPOUT LAYERS
    self.dropout1 = tf.keras.layers.Dropout(self.dropoutrate)
    self.dropout2 = tf.keras.layers.Dropout(self.dropoutrate)
    self.dropout3 = tf.keras.layers.Dropout(self.dropoutrate)
    self.dropout4 = tf.keras.layers.Dropout(self.dropoutrate)
    self.dropout5 = tf.keras.layers.Dropout(self.dropoutrate)
    self.dropout6 = tf.keras.layers.Dropout(self.dropoutrate)
    self.dropout7 = tf.keras.layers.Dropout(self.dropoutrate)

  def _regularized_dense(self, units):
    """Build a Dense layer with the activation and regularisers shared by the whole model."""
    return tf.keras.layers.Dense(
        units,
        activation=tf.keras.layers.LeakyReLU(alpha=self.leakyrelualpha),
        kernel_regularizer=tf.keras.regularizers.L1L2(l1=1e-5, l2=1e-4),
        bias_regularizer=tf.keras.regularizers.L2(1e-4),
    )

  def call(self, inputs, training=False):
    """Encode `inputs` to the bottleneck and decode them back to a sequence.

    Args:
      inputs: Batch of sequences fed to the first LSTM, i.e. a 3-D tensor of
        shape (batch, timesteps, features).
      training: Forwarded to every Dropout layer, which drops units only when
        it is true.

    Returns:
      Tensor of shape (batch, seq_length, encoding_dim) after the softmax.
    """
    x = inputs

    # LSTMS
    x = self.lstm1(x)
    x = self.lstm2(x)
    x = self.dense0(x)

    # ENCODING
    x = self.dense1(x)
    x = self.dropout1(x, training=training)
    x = self.dense2(x)
    x = self.dense3(x)
    x = self.dropout2(x, training=training)

    # DECODING
    x = self.dense4(x)
    x = self.dense5(x)
    x = self.dropout3(x, training=training)
    x = self.dense6(x)
    x = self.dense7(x)
    x = self.dropout4(x, training=training)
    x = self.dense8(x)
    x = self.dense9(x)
    x = self.dropout5(x, training=training)
    x = self.dense10(x)
    x = self.dense11(x)
    x = self.dropout6(x, training=training)
    x = self.dense12(x)
    x = self.dropout7(x, training=training)
    x = self.dense13(x)

    # OUTPUT
    x = self.dense14(x)
    x = self.reshape1(x)
    x = self.softmax_output(x)

    return x

In [43]:
# Instantiate the autoencoder with its default configuration.
model = AutoEncoder()

In [44]:
# Adam with a small learning rate and EMA of the weights enabled; categorical
# cross-entropy as the loss, with accuracy tracked as a metric.
model.compile(
    optimizer=tf.keras.optimizers.Adam(
        learning_rate=1e-5,
        epsilon=1e-7,
        use_ema=True,
    ),
    loss=tf.keras.losses.CategoricalCrossentropy(),
    metrics=['accuracy'],
)

In [45]:
# Build the model for inputs shaped like `train` (the full shape, including
# the leading sample axis, is passed as the input shape).
model.build(train.shape)

In [46]:
# Print the layer-by-layer parameter table of the built model.
model.summary()

Model: "auto_encoder_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bidirectional_2 (Bidirectio  multiple                 171008    
 nal)                                                            
                                                                 
 bidirectional_3 (Bidirectio  multiple                 3149824   
 nal)                                                            
                                                                 
 dense_15 (Dense)            multiple                  131200    
                                                                 
 dense_16 (Dense)            multiple                  4128      
                                                                 
 dense_17 (Dense)            multiple                  528       
                                                                 
 dense_18 (Dense)            multiple               

In [53]:
# Train on the first 10,000 training samples; as an autoencoder the targets
# are the inputs themselves. The subset is sliced once and the same tensor is
# passed as both x and y: evaluating `train[:10_000]` twice creates two
# identical tensors, which only adds memory pressure.
train_subset = train[:10_000]
hist = model.fit(train_subset, train_subset, validation_split=0.2, batch_size=4, epochs=10)

Epoch 1/10


2023-05-25 23:38:29.458146: W tensorflow/tsl/framework/bfc_allocator.cc:479] Allocator (GPU_0_bfc) ran out of memory trying to allocate 32.00MiB (rounded to 33554432)requested by op gradient_tape/dense_28/kernel/Regularizer/mul
If the cause is memory fragmentation maybe the environment variable 'TF_GPU_ALLOCATOR=cuda_malloc_async' will improve the situation. 
Current allocation summary follows.
Current allocation summary follows.
2023-05-25 23:38:29.458248: I tensorflow/tsl/framework/bfc_allocator.cc:1034] BFCAllocator dump for GPU_0_bfc
2023-05-25 23:38:29.458290: I tensorflow/tsl/framework/bfc_allocator.cc:1041] Bin (256): 	Total Chunks: 140, Chunks in use: 140. 35.0KiB allocated for chunks. 35.0KiB in use in bin. 4.1KiB client-requested in use in bin.
2023-05-25 23:38:29.458322: I tensorflow/tsl/framework/bfc_allocator.cc:1041] Bin (512): 	Total Chunks: 10, Chunks in use: 10. 6.0KiB allocated for chunks. 6.0KiB in use in bin. 5.0KiB client-requested in use in bin.
2023-05-25 23:38:2

ResourceExhaustedError: Graph execution error:

Detected at node 'gradient_tape/dense_28/kernel/Regularizer/mul' defined at (most recent call last):
    File "/home/diogo/anaconda3/envs/chem/lib/python3.10/runpy.py", line 196, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "/home/diogo/anaconda3/envs/chem/lib/python3.10/runpy.py", line 86, in _run_code
      exec(code, run_globals)
    File "/home/diogo/anaconda3/envs/chem/lib/python3.10/site-packages/ipykernel_launcher.py", line 17, in <module>
      app.launch_new_instance()
    File "/home/diogo/anaconda3/envs/chem/lib/python3.10/site-packages/traitlets/config/application.py", line 1043, in launch_instance
      app.start()
    File "/home/diogo/anaconda3/envs/chem/lib/python3.10/site-packages/ipykernel/kernelapp.py", line 725, in start
      self.io_loop.start()
    File "/home/diogo/anaconda3/envs/chem/lib/python3.10/site-packages/tornado/platform/asyncio.py", line 195, in start
      self.asyncio_loop.run_forever()
    File "/home/diogo/anaconda3/envs/chem/lib/python3.10/asyncio/base_events.py", line 603, in run_forever
      self._run_once()
    File "/home/diogo/anaconda3/envs/chem/lib/python3.10/asyncio/base_events.py", line 1909, in _run_once
      handle._run()
    File "/home/diogo/anaconda3/envs/chem/lib/python3.10/asyncio/events.py", line 80, in _run
      self._context.run(self._callback, *self._args)
    File "/home/diogo/anaconda3/envs/chem/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 513, in dispatch_queue
      await self.process_one()
    File "/home/diogo/anaconda3/envs/chem/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 502, in process_one
      await dispatch(*args)
    File "/home/diogo/anaconda3/envs/chem/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 409, in dispatch_shell
      await result
    File "/home/diogo/anaconda3/envs/chem/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 729, in execute_request
      reply_content = await reply_content
    File "/home/diogo/anaconda3/envs/chem/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 422, in do_execute
      res = shell.run_cell(
    File "/home/diogo/anaconda3/envs/chem/lib/python3.10/site-packages/ipykernel/zmqshell.py", line 540, in run_cell
      return super().run_cell(*args, **kwargs)
    File "/home/diogo/anaconda3/envs/chem/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3009, in run_cell
      result = self._run_cell(
    File "/home/diogo/anaconda3/envs/chem/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3064, in _run_cell
      result = runner(coro)
    File "/home/diogo/anaconda3/envs/chem/lib/python3.10/site-packages/IPython/core/async_helpers.py", line 129, in _pseudo_sync_runner
      coro.send(None)
    File "/home/diogo/anaconda3/envs/chem/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3269, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "/home/diogo/anaconda3/envs/chem/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3448, in run_ast_nodes
      if await self.run_code(code, result, async_=asy):
    File "/home/diogo/anaconda3/envs/chem/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3508, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "/tmp/ipykernel_4574/550852192.py", line 1, in <module>
      hist = model.fit(train[:10_000,:,:], train[:10_000, :, :], validation_split = 0.2, batch_size=4, epochs=10)
    File "/home/diogo/anaconda3/envs/chem/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "/home/diogo/anaconda3/envs/chem/lib/python3.10/site-packages/keras/engine/training.py", line 1650, in fit
      tmp_logs = self.train_function(iterator)
    File "/home/diogo/anaconda3/envs/chem/lib/python3.10/site-packages/keras/engine/training.py", line 1249, in train_function
      return step_function(self, iterator)
    File "/home/diogo/anaconda3/envs/chem/lib/python3.10/site-packages/keras/engine/training.py", line 1233, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/home/diogo/anaconda3/envs/chem/lib/python3.10/site-packages/keras/engine/training.py", line 1222, in run_step
      outputs = model.train_step(data)
    File "/home/diogo/anaconda3/envs/chem/lib/python3.10/site-packages/keras/engine/training.py", line 1027, in train_step
      self.optimizer.minimize(loss, self.trainable_variables, tape=tape)
    File "/home/diogo/anaconda3/envs/chem/lib/python3.10/site-packages/keras/optimizers/optimizer_experimental/optimizer.py", line 526, in minimize
      grads_and_vars = self.compute_gradients(loss, var_list, tape)
    File "/home/diogo/anaconda3/envs/chem/lib/python3.10/site-packages/keras/optimizers/optimizer_experimental/optimizer.py", line 259, in compute_gradients
      grads = tape.gradient(loss, var_list)
Node: 'gradient_tape/dense_28/kernel/Regularizer/mul'
failed to allocate memory
	 [[{{node gradient_tape/dense_28/kernel/Regularizer/mul}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.
 [Op:__inference_train_function_30646]

In [None]:
# Accuracy per epoch for the training and validation splits, read from the
# history object returned by model.fit.
# NOTE(review): `plt` is not imported in any cell shown in this notebook —
# presumably `matplotlib.pyplot`; confirm it is imported before this cell runs.
plt.plot(hist.history['accuracy'])
plt.plot(hist.history['val_accuracy'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
# Legend entries follow the order of the two plot calls above.
plt.legend(['train', 'val'], loc='upper left')
plt.show()

In [None]:
# Loss per epoch for the training and validation splits, read from the
# history object returned by model.fit.
# NOTE(review): `plt` is not imported in any cell shown in this notebook —
# presumably `matplotlib.pyplot`; confirm it is imported before this cell runs.
plt.plot(hist.history['loss'])
plt.plot(hist.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
# Legend entries follow the order of the two plot calls above.
plt.legend(['train', 'val'], loc='upper left')
plt.show()