In [91]:
import tensorflow as tf
from tqdm import tqdm
import pickle

import pandas as pd

In [1]:
import pydot

In [3]:
BATCH_SIZE = 64
BUFFER_SIZE = 1000
embedding_dim = 256
units = 512
# Shape of the vector extracted from InceptionV3 is (64, 2048)
# These two variables represent that vector shape
features_shape = 2048
attention_features_shape = 64
image_shape_1 = 100
image_shape_2 = image_shape_1

# Make a dataset

In [4]:
# loading
with open('data/tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

In [5]:
vocab_size = max(tokenizer.index_word.keys())+1
vocab_size

40

In [6]:
# loading
with open('data/train_preprocessed.pickle', 'rb') as handle:
    data = pickle.load(handle)
    paths = data["paths"]
    train_token = data["train_token"]
train_token.shape

(2424186, 396)

In [7]:
max_len = train_token.shape[1]
max_len

396

## Create DS

In [8]:
image_count = len(paths)
image_count

2424186

In [11]:
list_ds = tf.data.Dataset.from_tensor_slices((tf.convert_to_tensor(paths, name="img_path"), tf.convert_to_tensor(train_token, name="token")))
list_ds = list_ds.shuffle(image_count, reshuffle_each_iteration=False)

#for f in list_ds.take(5):
#    print(f)
list_ds.element_spec

(TensorSpec(shape=(), dtype=tf.string, name=None),
 TensorSpec(shape=(396,), dtype=tf.int32, name=None))

## Split DS

In [12]:
val_size = int(image_count * 0.1)
train_ds = list_ds.skip(val_size)
val_ds = list_ds.take(val_size)

print(tf.data.experimental.cardinality(train_ds).numpy())
print(tf.data.experimental.cardinality(val_ds).numpy())

2181768
242418


In [13]:
num_steps = tf.data.experimental.cardinality(train_ds).numpy() // BATCH_SIZE
num_steps

34090

In [14]:
autotune = tf.data.AUTOTUNE

In [15]:
def map_func(image_path, label):
    img = tf.io.read_file(image_path)
    img = tf.image.decode_png(img, channels=1)
    
    img = tf.image.convert_image_dtype(img, tf.float32)
    img = tf.image.resize(img, (image_shape_1, image_shape_2))
    
    #img.set_shape([image_shape_1, image_shape_2, 1])
    #label.set_shape([max_len])
    return img, label

In [16]:
train_ds = train_ds.map(map_func, num_parallel_calls=tf.data.AUTOTUNE)
val_ds = val_ds.map(map_func, num_parallel_calls=tf.data.AUTOTUNE)
list_ds = list_ds.map(map_func, num_parallel_calls=tf.data.AUTOTUNE)

In [17]:
train_ds.element_spec

(TensorSpec(shape=(100, 100, 1), dtype=tf.float32, name=None),
 TensorSpec(shape=(396,), dtype=tf.int32, name=None))

In [18]:
def _set_shapes(img, labels):
    img.set_shape([None, image_shape_1, image_shape_2, 1])
    labels.set_shape([None, 396, 1])
    return img, labels

In [19]:
def configure_for_performance(ds, autotune=tf.data.AUTOTUNE):
    ds = ds.cache()
    ds = ds.shuffle(buffer_size=BUFFER_SIZE)
    ds = ds.batch(BATCH_SIZE)
    ds = ds.prefetch(buffer_size=autotune)
    return ds

In [20]:
train_ds = configure_for_performance(train_ds, autotune=autotune)
val_ds = configure_for_performance(val_ds, autotune=autotune)
list_ds = configure_for_performance(list_ds, autotune=autotune)

# Model definition

# ***** Encoder goes here ******

In [21]:
model = tf.keras.Sequential()
model.add(tf.keras.layers.Conv2D(64, 3, input_shape=(image_shape_1, image_shape_2, 1), activation='relu'))
model.add(tf.keras.layers.Conv2D(64, 5, activation='relu'))
model.add(tf.keras.layers.Conv2D(5, 5, activation='relu'))
model.add(tf.keras.layers.MaxPooling2D())
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(features_shape, activation="relu"))
model.add(tf.keras.layers.RepeatVector(max_len))
model.add(tf.keras.layers.GRU(units, return_sequences=True, return_state=False, recurrent_initializer="glorot_uniform"))
model.add(tf.keras.layers.Dense(units))
model.add(tf.keras.layers.Dense(vocab_size, activation="softmax"))

model.output_shape

(None, 396, 40)

In [22]:
tf.keras.utils.plot_model(model, show_shapes=True)

('You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) ', 'for plot_model/model_to_dot to work.')


In [23]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 98, 98, 64)        640       
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 94, 94, 64)        102464    
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 90, 90, 5)         8005      
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 45, 45, 5)         0         
_________________________________________________________________
flatten (Flatten)            (None, 10125)             0         
_________________________________________________________________
dense (Dense)                (None, 2048)              20738048  
_________________________________________________________________
repeat_vector (RepeatVector) (None, 396, 2048)         0

## Training

In [24]:
epochs = 1

callbacks = [
    tf.keras.callbacks.ModelCheckpoint("checkpoints/cnn_rnn_baseline_{epoch}.h5"),
]
model.compile(
    optimizer=tf.keras.optimizers.Adam(1e-3),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics=["accuracy"],
)
model.fit(
    train_ds, epochs=epochs, callbacks=callbacks, validation_data=val_ds,
)

    7/34091 [..............................] - ETA: 153:12:09 - loss: 3.2696 - accuracy: 0.3319

KeyboardInterrupt: 

In [69]:
def postproc(x):
    return [a.replace(" ", "") for a in x]

def convert_res(res):
    res_np = res.numpy()
    res_idx = np.argmax(res_np, axis=2)
    return postproc(tokenizer.sequences_to_texts(res_idx))
    

In [75]:
pred = []
y_true = []
for imgs, targets in tqdm(val_ds):
    res = model(imgs)
    pred.extend(convert_res(res))
    y_true.extend(postproc(tokenizer.sequences_to_texts(targets.numpy())))

  1%|          | 40/3788 [12:41<16:57:29, 16.29s/it]  Exception ignored in: <function IteratorResourceDeleter.__del__ at 0x19d92c160>
Traceback (most recent call last):
  File "/Users/lautenschlager/Playground/kaggle-bms-molecular-translation/venv/lib/python3.8/site-packages/tensorflow/python/data/ops/iterator_ops.py", line 534, in __del__
    gen_dataset_ops.delete_iterator(
  File "/Users/lautenschlager/Playground/kaggle-bms-molecular-translation/venv/lib/python3.8/site-packages/tensorflow/python/ops/gen_dataset_ops.py", line 1263, in delete_iterator
    _result = pywrap_tfe.TFE_Py_FastPathExecute(
KeyboardInterrupt: 
  1%|          | 40/3788 [18:09<28:21:45, 27.24s/it]


KeyboardInterrupt: 

In [76]:
pred

['qqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqq',
 'qqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqq',
 'qqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqq

# process test data

In [78]:
rootpath = "data/bms-molecular-translation/"
test = pd.read_csv(rootpath + "sample_submission.csv")

def get_test_file_path(image_id):
    return rootpath + "test/{}/{}/{}/{}.png".format(
        image_id[0], image_id[1], image_id[2], image_id 
    )

test['file_path'] = test['image_id'].apply(get_test_file_path)

print(f'test.shape: {test.shape}')
display(test.head())

test.shape: (1616107, 3)


Unnamed: 0,image_id,InChI,file_path
0,00000d2a601c,InChI=1S/H2O/h1H2,data/bms-molecular-translation/test/0/0/0/0000...
1,00001f7fc849,InChI=1S/H2O/h1H2,data/bms-molecular-translation/test/0/0/0/0000...
2,000037687605,InChI=1S/H2O/h1H2,data/bms-molecular-translation/test/0/0/0/0000...
3,00004b6d55b6,InChI=1S/H2O/h1H2,data/bms-molecular-translation/test/0/0/0/0000...
4,00004df0fe53,InChI=1S/H2O/h1H2,data/bms-molecular-translation/test/0/0/0/0000...


In [79]:
test_ds = tf.data.Dataset.from_tensor_slices((test["file_path"], test["image_id"]))

In [82]:
def map_func_test(image_path, img_id):
    img = tf.io.read_file(image_path)
    img = tf.image.decode_png(img, channels=1)
    
    img = tf.image.convert_image_dtype(img, tf.float32)
    img = tf.image.resize(img, (image_shape_1, image_shape_2))
    
    #img.set_shape([image_shape_1, image_shape_2, 1])
    #label.set_shape([max_len])
    return img, img_id

In [83]:
test_ds = test_ds.map(map_func_test, num_parallel_calls=tf.data.AUTOTUNE)

In [84]:
test_ds = configure_for_performance(test_ds, autotune=autotune)

In [85]:
def postproc(x):
    return ["InChI=1S/" + a.replace(" ", "").replace("<", "").replace(">", "").replace("q", "") for a in x]

def convert_res(res):
    res_np = res.numpy()
    res_idx = np.argmax(res_np, axis=2)
    return postproc(tokenizer.sequences_to_texts(res_idx))

In [89]:
pred_test = []
id_test = []
for imgs, ids in tqdm(test_ds):
    res = model(imgs)
    pred_test.extend(convert_res(res))
    id_test.extend(ids.numpy())
    break

  0%|          | 0/25252 [00:27<?, ?it/s]


In [94]:
id_test[0].decode("utf8")

'0023dd05f78f'

In [95]:
test[test.image_id=='0023dd05f78f']

Unnamed: 0,image_id,InChI,file_path
829,0023dd05f78f,InChI=1S/H2O/h1H2,data/bms-molecular-translation/test/0/0/2/0023...


In [99]:
test_result = pd.DataFrame(zip([a.decode("utf8") for a in id_test], pred_test), columns=("image_id", "InChI"))

In [None]:
test_result.to_csv("submission.csv")