In [1]:
from numpy import argmax
from pickle import dump, load
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.applications.vgg16 import preprocess_input
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.backend import sum

In [2]:
# Load doc into memory
def load_doc(filename):
    """Read a text file and return its entire contents as one string.

    Args:
        filename: Path of the file to read (opened in text mode, 'r').

    Returns:
        The full file contents as a str.
    """
    # The context manager closes the handle even if read() raises, which the
    # manual open()/close() pairing did not guarantee.
    with open(filename, 'r') as file:
        return file.read()

# Load the list of photo identifiers
def load_set(filename):
    """Build the set of identifiers listed in a file, one entry per line.

    Each non-empty line contributes the text before its first '.'; empty
    lines are ignored.

    Args:
        filename: Path of the listing file, read via load_doc.

    Returns:
        A set of identifier strings.
    """
    entries = load_doc(filename).split('\n')

    # Keep only the part before the first dot of every non-empty line.
    return {
        entry.split('.')[0]
        for entry in entries
        if len(entry) >= 1
    }

# Load clean descriptions
def load_clean_descriptions(filename, dataset):
    """Load descriptions for the identifiers in `dataset`, wrapped in sequence markers.

    Every line of the file is split on whitespace: the first token is taken
    as the image identifier and the remaining tokens as the description.
    Descriptions are re-joined with single spaces and wrapped as
    'startseq <description> endseq'.

    Args:
        filename: Path of the descriptions file, read via load_doc.
        dataset: Container of identifiers to keep (tested with `in`).

    Returns:
        A dict mapping each kept identifier to the list of its wrapped
        descriptions, in file order.
    """
    doc = load_doc(filename)
    descriptions = dict()

    for line in doc.split('\n'):

        tokens = line.split()

        # A blank or whitespace-only line (e.g. the one produced by a trailing
        # newline) yields no tokens; indexing tokens[0] would raise IndexError.
        if not tokens:
            continue

        image_id, image_desc = tokens[0], tokens[1:]

        if image_id not in dataset:
            continue

        # wrap description in tokens
        desc = 'startseq ' + ' '.join(image_desc) + ' endseq'

        descriptions.setdefault(image_id, list()).append(desc)

    return descriptions

# Creating a list of descriptions
def to_lines(descriptions):
    """Flatten a mapping of description lists into one flat list.

    Args:
        descriptions: Dict whose values are iterables of description strings.

    Returns:
        A new list containing every description, in the dict's iteration
        order and each value's own order.
    """
    all_desc = list()

    # extend() replaces the former side-effect list comprehension, which
    # built and discarded a list of None values on every key.
    for captions in descriptions.values():
        all_desc.extend(captions)

    return all_desc

# Fit a tokenizer given caption descriptions
def create_tokenizer(descriptions):
    """Create a Tokenizer and fit it on every description in `descriptions`.

    Args:
        descriptions: Dict of description lists, flattened with to_lines.

    Returns:
        The fitted Tokenizer instance.
    """
    caption_tokenizer = Tokenizer()
    caption_tokenizer.fit_on_texts(to_lines(descriptions))
    return caption_tokenizer

# Training dataset (6K)
filename = 'Flickr8k_text/Flickr_8k.trainImages.txt'
train = load_set(filename)
print('Dataset: %d' % len(train))

# Descriptions
train_descriptions = load_clean_descriptions('descriptions.txt', train)
print('Descriptions: train=%d' % len(train_descriptions))

# Prepare tokenizer
tokenizer = create_tokenizer(train_descriptions)

# Save the tokenizer. The context manager closes (and therefore flushes) the
# file as soon as the dump finishes, instead of leaving an open handle behind.
with open('tokenizer.pkl', 'wb') as tokenizer_file:
    dump(tokenizer, tokenizer_file)

Dataset: 6000
Descriptions: train=6000


In [3]:
# Extract features from each photo in the directory
def extract_features(filename):
    """Run one image through a truncated VGG16 and return the prediction.

    A fresh VGG16 is instantiated on every call and re-wired so that its
    output is the output of the second-to-last layer. The image is loaded
    at 224x224, given a leading batch axis of size 1, passed through
    preprocess_input and then through the truncated model.

    Args:
        filename: Path of the image file to load.

    Returns:
        The array returned by the truncated model's predict() for the
        single-image batch.
    """
    # Build the network and cut it off one layer before the end.
    vgg = VGG16()
    feature_model = Model(inputs=vgg.inputs, outputs=vgg.layers[-2].output)

    # Load the photo and convert it to an array.
    pixels = img_to_array(load_img(filename, target_size=(224, 224)))

    # Prepend a batch dimension of 1, then apply the VGG16 preprocessing.
    dim0, dim1, dim2 = pixels.shape[0], pixels.shape[1], pixels.shape[2]
    batch = preprocess_input(pixels.reshape((1, dim0, dim1, dim2)))

    return feature_model.predict(batch, verbose=0)

# Map an integer to a word
def word_for_id(integer, tokenizer):
    """Look up the word whose index in tokenizer.word_index equals `integer`.

    Args:
        integer: Index to search for.
        tokenizer: Object exposing a `word_index` mapping of word -> index.

    Returns:
        The first matching word in the mapping's iteration order, or None
        when no entry has that index.
    """
    matches = (
        candidate
        for candidate, position in tokenizer.word_index.items()
        if position == integer
    )
    # next() with a default yields the first hit, or None if there is none.
    return next(matches, None)

# Caption generation
def generate_desc(model, tokenizer, photo, max_length):
    """Generate a caption word by word, starting from 'startseq'.

    On each of at most `max_length` steps the text so far is converted to a
    sequence with the tokenizer, padded to `max_length`, and fed to the model
    together with `photo`. The argmax of the prediction is mapped back to a
    word with word_for_id and appended. Generation stops when no word maps
    to the predicted index, or right after 'endseq' has been appended.

    Args:
        model: Object whose predict([photo, sequence], verbose=0) returns scores.
        tokenizer: Fitted tokenizer providing texts_to_sequences and word_index.
        photo: Image input passed unchanged to the model on every step.
        max_length: Padding length and maximum number of generation steps.

    Returns:
        The generated text, space-separated and beginning with 'startseq'.
    """
    words = ['startseq']

    for _ in range(max_length):
        caption_so_far = ' '.join(words)

        # Encode the partial caption and pad it to the fixed input length.
        encoded = tokenizer.texts_to_sequences([caption_so_far])[0]
        padded = pad_sequences([encoded], maxlen=max_length)

        # Pick the highest-scoring index and translate it back to a word.
        scores = model.predict([photo,padded], verbose=0)
        next_word = word_for_id(argmax(scores), tokenizer)

        if next_word is None:
            break

        words.append(next_word)

        if next_word == 'endseq':
            break

    return ' '.join(words)

In [4]:
# Tokenizer
# NOTE: unpickling executes arbitrary code from the file; only load a
# tokenizer.pkl that comes from a trusted source.
# The context manager closes the file handle once loading is done.
with open('tokenizer.pkl', 'rb') as tokenizer_file:
    tokenizer = load(tokenizer_file)

# Max sequence length
max_length = 34

# Loading the model
model = load_model('model_19.h5')

In [5]:
# Compute the image features fed to the captioning model
photo = extract_features('example.jpg')

# Produce the caption text for that image
description = generate_desc(
    model=model,
    tokenizer=tokenizer,
    photo=photo,
    max_length=max_length,
)

print(description)

startseq black dog is running through the water endseq
