In [1]:
import tensorflow as tf

# You'll generate plots of attention in order to see which parts of an image
# our model focuses on during captioning
import matplotlib.pyplot as plt

# Scikit-learn includes many helpful utilities
#from sklearn.model_selection import train_test_split
#from sklearn.utils import shuffle

import collections
import random
import re
import numpy as np
import pandas as pd
import os
import time
import json
from glob import glob
from PIL import Image
import pickle

from tqdm.auto import tqdm
tqdm.pandas()

from pathlib import Path

#import Levenshtein

import csv

In [2]:
# Batch size and shuffle-buffer size used by the tf.data pipeline
BATCH_SIZE = 64
BUFFER_SIZE = 1000
# Decoder hyperparameters: token embedding width and GRU state size
embedding_dim = 256
units = 512
# Width of the image feature vector produced by the encoder's final Dense layer.
# NOTE(review): the previous comment described InceptionV3 features of shape
# (64, 2048); the encoder in this notebook is a small custom CNN instead.
features_shape = 2048
# NOTE(review): not referenced anywhere in the visible notebook; looks like a
# leftover from an attention-based variant. Confirm before removing.
attention_features_shape = 64
# Images are resized to image_shape_1 x image_shape_2 before being encoded
image_shape_1 = 100
image_shape_2 = image_shape_1

# Make a dataset

In [3]:
# Dataset root, relative to the working directory; every data path is built from it
rootpath = "data/bms-molecular-translation/"

In [4]:
# Label table: one row per training image (columns image_id and InChI)
train = pd.read_csv(rootpath + "train_labels.csv")

In [5]:
def get_train_file_path(image_id):
    """Return the on-disk path of the training image with the given id.

    Images are sharded into nested directories named after the first three
    characters of the id: ``train/<c0>/<c1>/<c2>/<image_id>.png`` under
    ``rootpath``.
    """
    shard = "/".join((image_id[0], image_id[1], image_id[2]))
    return rootpath + "train/" + shard + "/" + "{}.png".format(image_id)

# Attach the image path for every row of the label table
train['file_path'] = train['image_id'].apply(get_train_file_path)

# Sanity check: table size and a preview of the first rows
print(f'train.shape: {train.shape}')
display(train.head())

train.shape: (2424186, 3)


Unnamed: 0,image_id,InChI,file_path
0,000011a64c74,InChI=1S/C13H20OS/c1-9(2)8-15-13-6-5-10(3)7-12...,data/bms-molecular-translation/train/0/0/0/000...
1,000019cc0cd2,InChI=1S/C21H30O4/c1-12(22)25-14-6-8-20(2)13(1...,data/bms-molecular-translation/train/0/0/0/000...
2,0000252b6d2b,InChI=1S/C24H23N5O4/c1-14-13-15(7-8-17(14)28-1...,data/bms-molecular-translation/train/0/0/0/000...
3,000026b49b7e,InChI=1S/C17H24N2O4S/c1-12(20)18-13(14-7-6-10-...,data/bms-molecular-translation/train/0/0/0/000...
4,000026fc6c36,InChI=1S/C10H19N3O2S/c1-15-10(14)12-8-4-6-13(7...,data/bms-molecular-translation/train/0/0/0/000...


## Preprocess and tokenize the labels

In [6]:
def preprocess_label(InChI):
    """Convert a raw InChI string into the text the decoder is trained on.

    Every occurrence of the constant ``InChI=1S/`` prefix is removed and the
    remainder is wrapped in ``<`` and ``>``. The leading ``<`` is the token
    the decoder is primed with during training.
    """
    stripped = InChI.replace("InChI=1S/", "")
    return "<" + stripped + ">"

In [7]:
# Cleaned label text (prefix stripped, wrapped in '<' ... '>') used for tokenization
train["InChI_clean"] = train["InChI"].apply(preprocess_label)

In [8]:
train.head()

Unnamed: 0,image_id,InChI,file_path,InChI_clean
0,000011a64c74,InChI=1S/C13H20OS/c1-9(2)8-15-13-6-5-10(3)7-12...,data/bms-molecular-translation/train/0/0/0/000...,<C13H20OS/c1-9(2)8-15-13-6-5-10(3)7-12(13)11(4...
1,000019cc0cd2,InChI=1S/C21H30O4/c1-12(22)25-14-6-8-20(2)13(1...,data/bms-molecular-translation/train/0/0/0/000...,<C21H30O4/c1-12(22)25-14-6-8-20(2)13(10-14)11-...
2,0000252b6d2b,InChI=1S/C24H23N5O4/c1-14-13-15(7-8-17(14)28-1...,data/bms-molecular-translation/train/0/0/0/000...,<C24H23N5O4/c1-14-13-15(7-8-17(14)28-12-10-20(...
3,000026b49b7e,InChI=1S/C17H24N2O4S/c1-12(20)18-13(14-7-6-10-...,data/bms-molecular-translation/train/0/0/0/000...,<C17H24N2O4S/c1-12(20)18-13(14-7-6-10-24-14)11...
4,000026fc6c36,InChI=1S/C10H19N3O2S/c1-15-10(14)12-8-4-6-13(7...,data/bms-molecular-translation/train/0/0/0/000...,<C10H19N3O2S/c1-15-10(14)12-8-4-6-13(7-8)5-2-3...


In [9]:
# Character-level tokenizer fitted on the cleaned labels; case is preserved and
# only the space character is filtered out.
tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token="q", filters=' ', lower=False, char_level=True)
tokenizer.fit_on_texts(train["InChI_clean"].values)
# Give id 0 an entry in both lookup tables so padded positions decode to 'q'.
# NOTE(review): Keras normally assigns the OOV token its own id (1) when
# fitting, so after this override that id presumably still decodes to 'q' while
# word_index['q'] points at 0. Confirm that sharing the OOV and padding symbol
# is intended.
tokenizer.word_index['q'] = 0
tokenizer.index_word[0] = 'q'

In [10]:
# Size of the id space: largest token id plus one (id 0 included)
vocab_size = max(tokenizer.index_word.keys())+1
vocab_size

40

In [11]:
# Encode every label as a sequence of character ids and pad at the end
# (padding='post') so all rows share the length of the longest label.
train_token = tf.keras.preprocessing.sequence.pad_sequences(tokenizer.texts_to_sequences(train["InChI_clean"].values), padding='post')
train_token.shape

(2424186, 396)

In [None]:
# Padded sequence length (second dimension of the token matrix)
max_len = train_token.shape[1]
max_len

## Create the dataset

In [12]:
# Image paths, in the same row order as train_token
paths = train["file_path"]

In [13]:
# Total number of labelled images; used as shuffle buffer size and for the split
image_count = len(paths)
image_count

2424186

In [14]:
# Pair each image path with its token sequence.
list_ds = tf.data.Dataset.from_tensor_slices((paths, train_token))
# Shuffle once over the whole dataset. reshuffle_each_iteration=False keeps the
# order fixed between iterations, which the take/skip split below relies on to
# keep the training and validation sets disjoint.
list_ds = list_ds.shuffle(image_count, reshuffle_each_iteration=False)

#for f in list_ds.take(5):
#    print(f)

## Split into training and validation sets

In [15]:
# Hold out the first 20% of the shuffled dataset for validation;
# the remaining 80% is used for training.
val_size = int(image_count * 0.2)
train_ds = list_ds.skip(val_size)
val_ds = list_ds.take(val_size)

# Report the number of examples in each split
print(tf.data.experimental.cardinality(train_ds).numpy())
print(tf.data.experimental.cardinality(val_ds).numpy())

1939349
484837


In [16]:
# Number of full batches per training epoch (a trailing partial batch is not counted)
num_steps = tf.data.experimental.cardinality(train_ds).numpy() // BATCH_SIZE
num_steps

30302

In [17]:
# Let tf.data choose the prefetch buffer size dynamically
autotune = tf.data.AUTOTUNE

In [18]:
def map_func(image_path, label):
    """Load one example: read, decode and resize the image at ``image_path``.

    Args:
        image_path: path of a PNG file.
        label: token-id sequence for the image; returned unchanged.

    Returns:
        ``(img, label)`` where ``img`` is a float32 image of shape
        ``(image_shape_1, image_shape_2, 1)``.
    """
    img = tf.io.read_file(image_path)
    # Force a single channel: the encoder is declared with a 1-channel input,
    # whereas the default (channels=0) keeps whatever the file happens to store.
    img = tf.image.decode_png(img, channels=1)
    img = tf.image.resize(img, (image_shape_1, image_shape_2))
    return img, label

In [19]:
def _load_examples(ds):
    """Turn a dataset of (image_path, tokens) into (image, tokens).

    map_func is built only from TensorFlow ops, so it is mapped directly
    instead of going through tf.numpy_function. This lets tf.data run the
    loading in parallel inside the graph and keeps the code in one place
    instead of three copy-pasted map calls.
    """
    return ds.map(map_func, num_parallel_calls=tf.data.AUTOTUNE)


train_ds = _load_examples(train_ds)
val_ds = _load_examples(val_ds)
list_ds = _load_examples(list_ds)

In [20]:
def configure_for_performance(ds, autotune=tf.data.AUTOTUNE):
    """Cache, shuffle, batch and prefetch a dataset, in that order.

    Args:
        ds: dataset of individual (image, tokens) examples.
        autotune: prefetch buffer size (defaults to tf.data.AUTOTUNE).

    Returns:
        The batched dataset, shuffled with a BUFFER_SIZE buffer and grouped
        into batches of BATCH_SIZE.

    NOTE(review): cache() is called without a filename, so decoded images are
    kept in memory; confirm that this fits for the full dataset.
    """
    return (
        ds.cache()
        .shuffle(buffer_size=BUFFER_SIZE)
        .batch(BATCH_SIZE)
        .prefetch(buffer_size=autotune)
    )

In [21]:
# Apply the same cache/shuffle/batch/prefetch pipeline to every dataset.
# NOTE(review): list_ds (the unsplit dataset) is not used again in the visible
# notebook; confirm it still needs to be configured here.
train_ds = configure_for_performance(train_ds, autotune=autotune)
val_ds = configure_for_performance(val_ds, autotune=autotune)
list_ds = configure_for_performance(list_ds, autotune=autotune)

# Model definition

## Encoder: CNN that maps an image to a feature vector

In [22]:
# Image encoder: two convolutions followed by a dense projection to a
# features_shape-wide vector.
# NOTE(review): with the default 'valid' padding the 100x100 input shrinks to
# 94x94x64 before Flatten, so the Dense layer has roughly 565k x 2048 weights;
# confirm this size is intended.
encoder = tf.keras.Sequential([
    tf.keras.layers.Conv2D(
        64, 3,
        input_shape=(image_shape_1, image_shape_2, 1),
        activation='relu',
    ),
    tf.keras.layers.Conv2D(64, 5, activation='relu'),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(features_shape, activation="relu"),
])

encoder.output_shape

(None, 2048)

In [23]:
class RNN_Decoder(tf.keras.Model):
    """Single-step GRU decoder conditioned on a global image feature vector.

    Each call consumes one token per example together with the image features
    and the previous GRU state, and produces logits for the next token plus
    the updated state. There is no attention mechanism in this model.
    """

    def __init__(self, embedding_dim, units, vocab_size):
        """Create the embedding, GRU and the two output projections.

        Args:
            embedding_dim: width of the token embedding.
            units: size of the GRU state and of the first dense layer.
            vocab_size: number of token ids (size of the output logits).
        """
        super().__init__()
        self.units = units

        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        # return_sequences=False: only the output of the (single) time step
        # is returned, together with the final state.
        self.gru = tf.keras.layers.GRU(
            self.units,
            return_sequences=False,
            return_state=True,
            recurrent_initializer='glorot_uniform',
        )
        self.fc1 = tf.keras.layers.Dense(self.units)
        self.fc2 = tf.keras.layers.Dense(vocab_size)

    def call(self, x, features, hidden):
        """Run one decoding step.

        Args:
            x: token ids, shape (batch_size, 1).
            features: encoder output, shape (batch_size, feature_width).
            hidden: previous GRU state, shape (batch_size, units).

        Returns:
            (logits, state): logits of shape (batch_size, vocab_size) and the
            new GRU state of shape (batch_size, units).
        """
        # (batch_size, 1) -> (batch_size, 1, embedding_dim)
        token_emb = self.embedding(x)

        # Give the features a time axis so they line up with the embedding:
        # (batch_size, feature_width) -> (batch_size, 1, feature_width)
        image_step = tf.expand_dims(features, 1)

        # (batch_size, 1, feature_width + embedding_dim)
        gru_input = tf.concat([image_step, token_emb], axis=-1)

        # gru_output and state are both (batch_size, units)
        gru_output, state = self.gru(gru_input, initial_state=hidden)

        # Two dense layers: (batch_size, units) -> (batch_size, vocab_size)
        logits = self.fc2(self.fc1(gru_output))

        return logits, state

    def reset_state(self, batch_size):
        """Return an all-zero initial GRU state of shape (batch_size, units)."""
        return tf.zeros([batch_size, self.units])

In [24]:
# Instantiate the decoder; its weights are created on the first call
decoder = RNN_Decoder(embedding_dim, units, vocab_size)
#decoder.output_shape

## Training

In [25]:
optimizer = tf.keras.optimizers.Adam()
# reduction='none' keeps one loss value per example so that padded positions
# can be masked out before averaging (see loss_function).
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
    """Masked cross-entropy for one decoding step.

    Args:
        real: target token ids for this step, one per example.
        pred: logits for this step, one row per example.

    Returns:
        Scalar loss. Examples whose target id is 0 (padding) contribute zero,
        but the mean is still taken over every example in the batch.
    """
    per_example = loss_object(real, pred)

    is_padding = tf.math.equal(real, 0)
    keep = tf.cast(tf.math.logical_not(is_padding), dtype=per_example.dtype)

    return tf.reduce_mean(per_example * keep)

In [26]:
# Checkpoints track encoder, decoder and optimizer state together; only the
# five most recent ones are kept on disk.
checkpoint_path = "checkpoints/attention_rnn_train"
ckpt = tf.train.Checkpoint(encoder=encoder,
                           decoder=decoder,
                           optimizer = optimizer)
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

In [27]:
# Resume from the latest checkpoint if one exists.
start_epoch = 0
if ckpt_manager.latest_checkpoint:
    # The numeric suffix of the checkpoint name is the save counter. It matches
    # the number of finished epochs only because the training loop saves exactly
    # once per epoch.
    start_epoch = int(ckpt_manager.latest_checkpoint.split('-')[-1])
    # restoring the latest checkpoint in checkpoint_path
    ckpt.restore(ckpt_manager.latest_checkpoint)

In [28]:
# Kept in its own cell on purpose: re-running the training cell then appends
# to loss_plot instead of resetting the recorded per-epoch losses.
loss_plot = []

In [29]:
@tf.function
def train_step(img_tensor, target):
    """Run one teacher-forced training step on a batch and update the weights.

    Args:
        img_tensor: batch of images fed to the encoder.
        target: batch of padded token-id sequences, indexed as
            (batch, position).

    Returns:
        (loss, total_loss): the loss summed over all decoding steps, and that
        sum divided by the sequence length target.shape[1].
    """
    loss = 0

    # initializing the hidden state for each batch
    # because the labels are not related from image to image
    hidden = decoder.reset_state(batch_size=target.shape[0])

    # Every sequence is primed with the start token '<'
    dec_input = tf.expand_dims([tokenizer.word_index['<']] * target.shape[0], 1)

    with tf.GradientTape() as tape:
        features = encoder(img_tensor)

        # Position 0 holds the start token, so prediction starts at position 1
        for i in range(1, target.shape[1]):
            # passing the features through the decoder
            predictions, hidden = decoder(dec_input, features, hidden)

            loss += loss_function(target[:, i], predictions)
            
            # using teacher forcing: the true token is the next input
            dec_input = tf.expand_dims(target[:, i], 1)

    # Per-position average, used for reporting only
    total_loss = (loss / int(target.shape[1]))

    trainable_variables = encoder.trainable_variables + decoder.trainable_variables

    # Gradients are taken of the summed loss, not of the per-position average
    gradients = tape.gradient(loss, trainable_variables)

    optimizer.apply_gradients(zip(gradients, trainable_variables))

    return loss, total_loss

In [None]:
EPOCHS = 1
# Debug cap: leave an epoch early once this batch index has been processed.
# Set to None to train on every batch.
MAX_BATCH = 500

for epoch in range(start_epoch, EPOCHS):
    start = time.time()
    total_loss = 0
    batches_run = 0

    for (batch, (img_tensor, target)) in enumerate(train_ds):
        batch_loss, t_loss = train_step(img_tensor, target)
        total_loss += t_loss
        batches_run += 1

        if batch % 100 == 0:
            print('Epoch {} Batch {}/{} Loss {:.4f}'.format(
                epoch + 1, batch, num_steps, batch_loss.numpy() / int(target.shape[1])))

        # The previous test `batch % 500 == 0 & batch > 0` never fired: `&`
        # binds tighter than the comparisons, so it was evaluated as the chain
        # `batch % 500 == (0 & batch) > 0`, which always ends in `0 > 0`.
        if MAX_BATCH is not None and batch >= MAX_BATCH:
            break

    # Average over the batches that actually ran, so an early exit (or a
    # trailing partial batch) does not skew the reported epoch loss.
    epoch_loss = total_loss / max(batches_run, 1)

    # storing the epoch end loss value to plot later
    loss_plot.append(epoch_loss)

    # One checkpoint per epoch; the resume logic relies on this cadence
    ckpt_manager.save()

    print('Epoch {} Loss {:.6f}'.format(epoch + 1, epoch_loss))
    print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

Epoch 1 Batch 0/30302 Loss 0.9994
Epoch 1 Batch 100/30302 Loss 0.8416
Epoch 1 Batch 200/30302 Loss 0.8015


In [1]:
# NOTE(review): scratch cell. val_ds is already batched by
# configure_for_performance, so this would add a second batch dimension, and the
# returned dataset is discarded. The recorded NameError shows it was executed
# before the cells that define val_ds.
val_ds.batch(1)

NameError: name 'val_ds' is not defined

In [2]:
# Shows the list that the validation loop below appends to; run that loop
# first, otherwise the name does not exist yet (as in the recorded NameError).
val_predictions

NameError: name 'val_predictions' is not defined

In [None]:
# Greedy decoding over the validation set.
# Collects one (predicted_ids, target_ids) pair of numpy arrays per batch, both
# indexed as (batch, position).
val_predictions = []

for img_tensor, target in val_ds:
    batch_size = target.shape[0]
    hidden = decoder.reset_state(batch_size=batch_size)

    # Every sequence is primed with the start token '<' (position 0)
    dec_input = tf.expand_dims([tokenizer.word_index['<']] * batch_size, 1)

    features = encoder(img_tensor)

    # Tensors do not support item assignment, so collect one (batch, 1) column
    # of token ids per position and join them at the end.
    id_columns = [dec_input]

    for i in range(1, target.shape[1]):
        # passing the features through the decoder
        logits, hidden = decoder(dec_input, features, hidden)

        # No teacher forcing here: the most likely token id (not the raw
        # logits) becomes the next decoder input.
        next_ids = tf.argmax(logits, axis=-1, output_type=tf.int32)
        dec_input = tf.expand_dims(next_ids, 1)
        id_columns.append(dec_input)

    pred_tens = tf.concat(id_columns, axis=1)

    val_predictions.append((pred_tens.numpy(), target.numpy()))