# Image Captioning with TensorFlow

# Imports

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.resnet50 import ResNet50
from tensorflow.keras.applications.resnet50 import preprocess_input
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, RepeatVector, Dense, LSTM
from tensorflow.keras.layers import Embedding, Dropout, TimeDistributed, Concatenate
from tensorflow.keras.layers import Activation
from tensorflow.keras.optimizers import Adam
from keras.layers.merge import add
from tensorflow.keras.callbacks import ModelCheckpoint
import os
import numpy as np
from PIL import Image

# Image Feature Extraction

In [None]:
%%time

# Every entry of the image directory is treated as an image to process.
folder = "../input/flickr8k/Flickr_Data/Flickr_Data/Images/"
images = os.listdir(folder)

# ResNet50 initialised from a local weights file; the feature extractor
# exposes the output of its second-to-last layer instead of the final one.
image_model = ResNet50(weights='../input/resnet50/resnet50_weights_tf_dim_ordering_tf_kernels.h5')
model_new = tf.keras.Model(image_model.input, image_model.layers[-2].output)

# Map each image filename to its flattened feature vector.
img_features = {}
for img in images:
    # Load at 224x224, add a leading batch axis, apply ResNet50 preprocessing.
    pixels = image.img_to_array(image.load_img(folder + img, target_size=(224, 224)))
    batch = preprocess_input(np.expand_dims(pixels, axis=0))
    features = model_new.predict(batch)
    # Drop the batch axis so a 1-D vector is stored per image.
    img_features[img] = np.reshape(features, features.shape[1])

# Caption Pre-Processing

In [None]:
# Read the raw caption file. Each usable line is split on a tab into an image
# reference (filename, optionally followed by '#...') and the caption text.
fn = "../input/flickr8k/Flickr_Data/Flickr_Data/Flickr_TextData/Flickr8k.token.txt"
with open(fn, 'r') as f:
    capts = f.read()

# Group all captions by image filename, for reference.
captions = dict()
for line in capts.split("\n"):
    fields = line.split('\t')
    if len(fields) < 2:
        # Blank or malformed line (e.g. the empty string after the final
        # newline): skip it instead of silently abandoning the rest of the file.
        continue
    image_name = fields[0].split('#')[0]
    captions.setdefault(image_name, []).append(fields[1])
    

def getCaptions(path):
    """Return the cleaned captions for every image listed in *path*.

    The file at *path* is read as one image id per line. For each id found in
    the module-level ``captions`` dict, every caption is reduced to its purely
    alphabetic words and wrapped as ``"startseq ... endseq"``.

    Ids that are blank or have no entry in ``captions`` are skipped
    individually, so one unknown id no longer discards all ids after it.

    Args:
        path: Path of the text file listing image ids.

    Returns:
        dict mapping image id -> list of wrapped caption strings.
    """
    # Context manager guarantees the handle is closed even if reading fails.
    with open(path, 'r') as f:
        lines = f.read().split("\n")

    desc = dict()
    for line in lines:
        image_id = line.strip()
        image_descs = captions.get(image_id)
        if not image_descs:
            continue

        cleaned = []
        for des in image_descs:
            # Keep alphabetic tokens only (drops punctuation and numbers).
            words = [word for word in des.split(" ") if word.isalpha()]
            cleaned.append("startseq " + " ".join(words) + " endseq")
        # extend (not assign) so an id listed twice accumulates its captions.
        desc.setdefault(image_id, []).extend(cleaned)

    return desc

# Captions for the training and validation (dev) image lists, built with
# getCaptions in that order.
train_caps, val_caps = (
    getCaptions(split_file)
    for split_file in (
        "../input/flickr8k/Flickr_Data/Flickr_Data/Flickr_TextData/Flickr_8k.trainImages.txt",
        "../input/flickr8k/Flickr_Data/Flickr_Data/Flickr_TextData/Flickr_8k.devImages.txt",
    )
)

## Tokenization

In [None]:
# Flatten every training caption into one list; the tokenizer is fitted on it
# to produce the word-index and index-word mappings.
train_captions = [
    caption for desc_list in train_caps.values() for caption in desc_list
]

# Tokenize top 5000 words in Train Captions
vocab_size = 5000
# The backslash in the filter set is written as '\\' so the literal holds the
# same characters as before without relying on the invalid escape '\]'.
tokenizer = Tokenizer(num_words=vocab_size,
                      oov_token="<unk>",
                      filters='!"#$%&()*+.,-/:;=?@[\\]^_`{|}~ ')
tokenizer.fit_on_texts(train_captions)
word_index = tokenizer.word_index
index_word = tokenizer.index_word

## Images + Image Features

In [None]:
%%time

# Feature vectors for the training and validation images, keyed by filename.
train_fns = list(train_caps.keys())
train_set = {k: img_features[k] for k in train_fns}
val_fns = list(val_caps.keys())
val_set = {k: img_features[k] for k in val_fns}

# The test split is taken straight from its listing file, one filename per line.
fn_test = "../input/flickr8k/Flickr_Data/Flickr_Data/Flickr_TextData/Flickr_8k.testImages.txt"
with open(fn_test, 'r') as f:
    t = f.read()

test_fns = t.split("\n")
# Skip blank entries instead of assuming exactly one trailing newline: this
# keeps the last filename when the file does not end with "\n".
test_set = {k: img_features[k] for k in test_fns if k}

# Final Training & Validation Data

In [None]:
def create_sequences(tokenizer, max_length, desc_list, photo, vocab_size):
    """Expand one image's captions into next-word training samples.

    Each caption is encoded with *tokenizer*; for every position after the
    first token, the preceding tokens (padded to *max_length*) become the text
    input and the token at that position, one-hot encoded over *vocab_size*
    classes, becomes the target. *photo* is repeated once per sample.

    Returns:
        Tuple of three numpy arrays: image inputs, padded text inputs, targets.
    """
    image_inputs, text_inputs, targets = [], [], []
    for caption in desc_list:
        token_ids = tokenizer.texts_to_sequences([caption])[0]
        for split_at in range(1, len(token_ids)):
            # Everything before split_at is context; the token at split_at is
            # the word to predict.
            context = pad_sequences([token_ids[:split_at]], maxlen=max_length)[0]
            target = to_categorical([token_ids[split_at]], num_classes=vocab_size)[0]
            image_inputs.append(photo)
            text_inputs.append(context)
            targets.append(target)
    return np.array(image_inputs), np.array(text_inputs), np.array(targets)

In [None]:
# Length (in tokens) that caption inputs are padded to via pad_sequences; the
# same value is used for the model's RepeatVector / Embedding input length and
# as the iteration cap of the caption generation loop.
# NOTE(review): presumably derived from the longest training caption — confirm.
max_length = 34

In [None]:
def data_generator(descriptions, photos, tokenizer, max_length, vocab_size):
    """Endlessly yield one batch of training samples per image.

    Iterates over *descriptions* (image key -> list of captions) forever. For
    each image it looks up the feature vector in *photos* and expands the
    captions with ``create_sequences``. The training loop in this file passes
    the generator to ``model.fit``.

    Yields:
        ``([image_inputs, text_inputs], targets)`` for a single image.
    """
    while True:
        for image_key, image_captions in descriptions.items():
            feature_vector = photos[image_key]
            batch = create_sequences(
                tokenizer, max_length, image_captions, feature_vector, vocab_size
            )
            image_inputs, text_inputs, targets = batch
            yield [image_inputs, text_inputs], targets

In [None]:
# Pull a single batch from the generator and print the shape of each of its
# three arrays (image inputs, text inputs, targets).
generator = data_generator(train_caps, train_set, tokenizer, max_length, vocab_size)
inputs, outputs = next(generator)
for batch_part in (inputs[0], inputs[1], outputs):
    print(batch_part.shape)

# Word Embedding Matrix

In [None]:
%%time

# Load the GloVe vectors: each line is split on whitespace into a word
# followed by its coefficients.
embeddings_index = {}
# The context manager closes the file even if a line fails to parse.
with open("../input/glove-global-vectors-for-word-representation/glove.6B.200d.txt", encoding="utf-8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

# One 200-dim row per tokenizer word index.
# NOTE: this rebinds vocab_size (previously 5000) to the full size of
# word_index plus one; every later cell uses this new value.
vocab_size = len(word_index) + 1
embedding_dim = 200
embedding_matrix = np.zeros((vocab_size, embedding_dim))

for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words not found in the embedding index keep their all-zero row.
        embedding_matrix[i] = embedding_vector

# Model Architecture

In [None]:
# Image branch: project the 2048-dim feature vector to embedding_dim and repeat
# it max_length times so it can be concatenated with the text branch.
# NOTE: this rebinds `image_model`, which earlier named the ResNet50 network.
image_model = tf.keras.models.Sequential()
image_model.add(Dense(embedding_dim, input_shape=(2048,), activation='relu'))
image_model.add(RepeatVector(max_length))

# Text branch: embedding -> LSTM -> per-time-step projection to embedding_dim.
# A direct reference to the embedding layer is kept so its weights can be set
# without relying on the layer's position inside model_1.layers.
embedding_layer = Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length)
language_model = tf.keras.models.Sequential()
language_model.add(embedding_layer)
language_model.add(LSTM(256, return_sequences=True))
language_model.add(TimeDistributed(Dense(embedding_dim)))

# Merge both branches and decode to a softmax over the vocabulary.
conca = Concatenate()([image_model.output, language_model.output])
x = LSTM(128, return_sequences=True)(conca)
x = LSTM(512, return_sequences=False)(x)
x = Dense(vocab_size)(x)
out = Activation('softmax')(x)
model_1 = Model(inputs=[image_model.input, language_model.input], outputs = out)

# Load the pre-computed embedding matrix and freeze it before compiling.
embedding_layer.set_weights([embedding_matrix])
embedding_layer.trainable = False

model_1.compile(loss='categorical_crossentropy', optimizer = Adam(learning_rate = 0.0001), metrics=['accuracy'])

# Training

In [None]:
%%time

# Train one epoch at a time for `epochs` rounds, with one generator step per
# training image. Nothing is written to disk by this loop.
epochs = 50
steps = len(train_caps)
for epoch in range(epochs):
    # A fresh generator is built for every epoch.
    generator = data_generator(train_caps, train_set, tokenizer, max_length, vocab_size)
    model_1.fit(generator, epochs=1, steps_per_epoch=steps, verbose=1)

# Inference

In [None]:
def word_for_id(integer, tokenizer):
    """Return the word stored under *integer* in ``tokenizer.index_word``.

    Returns None when the index has no entry.
    """
    vocabulary = tokenizer.index_word
    return vocabulary.get(integer)

def generate_desc(model, tokenizer, photo, max_length):
    """Generate a caption for *photo* one word at a time.

    Starting from ``'startseq'``, the text so far is encoded and padded to
    *max_length*, the model is asked for the next word, and the word with the
    highest score is appended. Generation stops after *max_length* steps, when
    the predicted index has no word, or once ``'endseq'`` has been appended.

    Returns:
        The generated text, including the leading ``'startseq'`` (and the
        trailing ``'endseq'`` if it was predicted).
    """
    # The model expects a leading batch axis on the image features.
    photo_batch = np.expand_dims(photo, axis=0)
    words = ['startseq']
    for _ in range(max_length):
        encoded = tokenizer.texts_to_sequences([' '.join(words)])[0]
        padded = pad_sequences([encoded], maxlen=max_length)
        scores = model.predict([photo_batch, padded], verbose=0)
        next_word = word_for_id(np.argmax(scores), tokenizer)
        # An unmapped index ends generation without appending anything.
        if next_word is None:
            break
        words.append(next_word)
        if next_word == 'endseq':
            break
    return ' '.join(words)

In [None]:
# Smoke-test model_1.predict on a single sample and print the shapes involved.
# A training-set feature vector is used so this cell does not depend on
# `photo`, which is only assigned in a later cell (NameError when the notebook
# is run top to bottom).
sample_features = next(iter(train_set.values()))
tmpimg1 = np.expand_dims(np.array(sample_features), axis=0)
print(tmpimg1.shape)
# NOTE(review): 3 looks like an arbitrary token id standing in for a one-word
# caption — confirm.
tmpcap1 = pad_sequences([[3]], maxlen=max_length)
print(tmpcap1.shape)
tmpout1 = model_1.predict([tmpimg1, tmpcap1], verbose=0)
print(tmpout1.shape)
print(tmpout1)

In [None]:
def image_to_feat_vec(imagePath):
    """Return the flattened feature vector that ``model_new`` predicts for an image.

    The image at *imagePath* is loaded at 224x224, converted to an array, given
    a leading batch axis and passed through ``preprocess_input`` before
    prediction. The batch axis is removed from the result.
    """
    pixels = image.img_to_array(image.load_img(imagePath, target_size=(224, 224)))
    batch = preprocess_input(np.expand_dims(pixels, axis=0))
    features = model_new.predict(batch)
    return np.reshape(features, features.shape[1])

# Caption a single image and print the result.
imagePath = "../input/garage-detection-unofficial-ssl-challenge/GarageImages/GarageImages/image1086.jpg"
photo = image_to_feat_vec(imagePath)
caption = generate_desc(model_1, tokenizer, photo, max_length)
print("Predicted Caption:", caption)
# Kept as the final expression of the cell, as in the original.
Image.open(imagePath)