# Inference using GUI

Libraries and global paths.


In [1]:
!pip install gradio
!pip install pickle5

import os
import json
import pickle5 as pickle
import numpy as np
import tensorflow as tf
import gradio as gr
import tensorflow.keras as keras

from PIL import Image, ImageFont, ImageDraw, ImageOps
from tensorflow.keras import Model
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
from tensorflow.keras.layers import Layer, RepeatVector, TimeDistributed, Bidirectional, Concatenate, Reshape, Input, Embedding, LSTM, Dense, Dropout, Add, LayerNormalization
from tensorflow.keras.metrics import Mean
from tensorflow.keras.applications import Xception
from tensorflow.keras.applications.xception import preprocess_input
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization


# Locations of the saved models: everything lives under <root_dir>/weights
root_dir = os.path.join('drive', 'MyDrive')
weights_dir = os.path.join(root_dir, 'weights')

# One sub-directory of weights_dir per saved model
vision_model_path, text_model_path, decoder_model_path = (
    os.path.join(weights_dir, model_name)
    for model_name in ('vision_model', 'text_model', 'decoder_model')
)


# Generation limits shared by the inference code
words_per_caption = 23  # upper bound on words generated per sentence
images_per_story = 5    # number of sentences generated per story


Collecting gradio
[?25l  Downloading https://files.pythonhosted.org/packages/a2/31/9fc0bfcfb5e3be94350917640a709daca53ab3b35440d4ed67e60bf05567/gradio-2.1.2-py3-none-any.whl (2.5MB)
[K     |▏                               | 10kB 15.8MB/s eta 0:00:01[K     |▎                               | 20kB 19.5MB/s eta 0:00:01[K     |▍                               | 30kB 22.2MB/s eta 0:00:01[K     |▌                               | 40kB 24.4MB/s eta 0:00:01[K     |▋                               | 51kB 27.0MB/s eta 0:00:01[K     |▉                               | 61kB 28.4MB/s eta 0:00:01[K     |█                               | 71kB 28.8MB/s eta 0:00:01[K     |█                               | 81kB 29.5MB/s eta 0:00:01[K     |█▏                              | 92kB 30.1MB/s eta 0:00:01[K     |█▎                              | 102kB 26.5MB/s eta 0:00:01[K     |█▍                              | 112kB 26.5MB/s eta 0:00:01[K     |█▋                              | 122kB 26.5MB

GUI renderer and model inference callback methods.

In [2]:
def retrieve_tokenizer():
    """Load the pickled tokenizer stored under ``root_dir``.

    Returns:
        The object unpickled from ``<root_dir>/tokenizer.pickle``.

    Raises:
        FileNotFoundError: if no tokenizer dump exists at that path.
    """
    tokenizer_dump_path = os.path.join(root_dir, 'tokenizer.pickle')
    if not os.path.isfile(tokenizer_dump_path):
        # Fail here with a clear message instead of returning None, which
        # would only surface later as an AttributeError in the caller.
        raise FileNotFoundError(
            f'No tokenizer found at {tokenizer_dump_path}! Aborting...')

    # NOTE(security): unpickling can execute arbitrary code; only load a
    # tokenizer dump that comes from a trusted source.
    with open(tokenizer_dump_path, 'rb') as handle:
        return pickle.load(handle)

def run_visually():
    """Build and launch the Gradio interface around ``predict_using_gui``.

    The interface exposes five image inputs and a single textbox output, and
    is launched with ``share=True`` and ``debug=True``.
    """
    # One upload widget per positional argument of predict_using_gui
    image_inputs = [gr.inputs.Image() for _ in range(5)]
    story_output = gr.outputs.Textbox()

    demo = gr.Interface(fn=predict_using_gui, inputs=image_inputs, outputs=story_output)
    demo.launch(share=True, debug=True)


def predict_using_gui(image1, image2, image3, image4, image5):
    """Gradio callback: generate one sentence per step for five uploaded images.

    Every image is resized to 299x299 and the stack is passed through
    ``image_encoder``; the two arrays it returns seed the decoder state.
    Words are then generated one at a time with ``text_model`` and
    ``decoder_model``, always taking the highest-scoring word (or the
    runner-up when the top word is ``[UNK]``). The outer loop runs
    ``images_per_story`` times and each sentence is capped at
    ``words_per_caption`` words.

    Args:
        image1, image2, image3, image4, image5: the uploaded images; each is
            converted with ``np.array`` before being resized.

    Returns:
        str: the generated sentences, joined with newlines.
    """
    # Load the vocabulary and drop every word seen four times or fewer
    tokenizer = retrieve_tokenizer()
    vocab = tokenizer.word_index
    rare_words = [token for token, count in tokenizer.word_counts.items() if count <= 4]
    for token in rare_words:
        del vocab[token]

    # Resize each upload to 299x299, stack the five results, then add a
    # leading axis of size one in front of the stack
    resized_images = [
        tf.image.resize(keras.backend.constant(np.array(image)), size=(299, 299)).numpy()
        for image in (image1, image2, image3, image4, image5)
    ]
    image_batch = np.expand_dims(np.stack(resized_images, axis=0), axis=0)

    # Encode the images; the two returned arrays become the decoder's state
    h_vision, c_vision = image_encoder.predict(image_batch)

    # The very first decoder input is the [start] token, shaped (1, 1)
    decoder_input = np.expand_dims(np.array([vocab['[start]']]), axis=0)

    # Decoder state starts from the image encoder's state
    h, c = h_vision, c_vision

    # Text encoder state and the decoder's second-stack state start at zero
    h_text, c_text = np.zeros_like(h), np.zeros_like(c)
    h_second_stack, c_second_stack = np.zeros_like(h), np.zeros_like(c)

    story = []
    for _ in range(images_per_story):
        sentence = []

        for _step in range(words_per_caption):
            # Feed the previous word through the text encoder, carrying
            # its state over from the previous step
            decoder_input, h_text, c_text = text_model.predict([decoder_input, h_text, c_text])

            # Advance the decoder and obtain the scores for the next word
            softmax_activations, h, c, h_second_stack, c_second_stack = decoder_model.predict(
                [h, c, h_second_stack, c_second_stack, decoder_input])

            # Remember the second-highest-scoring index as an [UNK] fallback
            runner_up = np.squeeze(softmax_activations).argsort()[-2]
            decoder_input = np.argmax(softmax_activations, axis=-1)
            word_idx = decoder_input.item()

            # [end] closes the current sentence
            if word_idx == vocab['[end]']:
                break

            # Never emit [UNK]: substitute the runner-up and feed that onward
            if word_idx == vocab['[UNK]']:
                word_idx = runner_up
                decoder_input = np.expand_dims(runner_up, axis=0)

            sentence.append(word_idx)

        # Once a sentence is finished, fold the text encoder's state into
        # the decoder's state before starting the next sentence
        h = h + h_text
        c = c + c_text

        # Keep the sentence, without any [end] or index-0 entries
        story.append([idx for idx in sentence if not (idx == vocab['[end]'] or idx == 0)])

    # Convert index sequences back to text, one sentence per line
    return '\n'.join(tokenizer.sequences_to_texts(story))

In [4]:
# Restore the three saved models (text encoder, image encoder, decoder) without
# compiling them, then start the GUI. The launch uses debug=True, so this cell
# keeps running until it is interrupted.
text_model, image_encoder, decoder_model = (
    keras.models.load_model(saved_model_path, compile=False)
    for saved_model_path in (text_model_path, vision_model_path, decoder_model_path)
)
print('Restored model!')
run_visually()

Restored model!
Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
This share link will expire in 24 hours. If you need a permanent link, visit: https://gradio.app/introducing-hosted (NEW!)
Running on External URL: https://39124.gradio.app
Interface loading below...


KeyboardInterrupt: ignored