# Image Captioning Tutorial Using the CNN-LSTM Encoder-Decoder Model ### 


### Motivation ###
As humans, we find it really easy to describe an image that we see, so a natural question to ask is if we can teach a computer to describe an image that we give it. In order to achieve this, we will be using a **CNN-LSTM Encoder Decoder Model**. 

### Understanding the CNN-LSTM Encoder Decoder Model ###

In order to teach a computer to describe an image that we give, we need to use a fairly complex deep learning model. This deep learning model is called a CNN-LSTM Encoder Decoder Model. It consists of 2 major components that do the following:
    1. A Convolutional Neural Network (CNN) is a type of deep learning algorithm that, given an image, is able to give specific importance to certain features of the image (


In [1]:
import string
import numpy as np

from PIL import Image
import os
from pickle import dump, load
import numpy as np
from keras.applications.inception_v3 import InceptionV3, preprocess_input
from keras.preprocessing.image import load_img, img_to_array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from keras.layers.merge import add
from keras.models import Model, load_model
from keras.utils.vis_utils import plot_model
from keras.layers import Input, Dense, LSTM, Embedding, Dropout
# small library for seeing the progress of loops.
from tqdm.notebook import tqdm
from collections import defaultdict
import re
TOKEN_FILE = 'C:\\Users\\Andrey\\Desktop\\CS-390-Neural-Image-Processing\\Final Project\\text\\Flickr8k.token.txt'
TEXT_DATASET = 'C:\\Users\\Andrey\\Desktop\\CS-390-Neural-Image-Processing\\Final Project\\text'
IMAGE_DATASET ='C:\\Users\\Andrey\\Desktop\\CS-390-Neural-Image-Processing\\Final Project\\images'

In [28]:
# Extracting feature vector for all images

def extract_features(img_directory):
    model = InceptionV3(include_top = False, pooling='avg')
    img_files = os.listdir(img_directory)
    print(len(img_files))
    features = dict()
    for img in tqdm(img_files):
        img_file = img_directory + '/' + img # specify the exact image
        image = Image.open(img_file) # open it 
        image = image.resize((299,299)) # resize it to be an image of 299, 299
        
        image = np.expand_dims(image, axis=0) / 255 # add an extra dimension and convert pixel RGB values to decimals
        
        feature = model.predict(image)
        features[img] = feature
    
    return features
        

In [29]:
features = extract_features(IMAGE_DATASET)
dump(features, open('features.p', 'wb'))

8091


  0%|          | 0/8091 [00:00<?, ?it/s]

In [31]:
# Cleaning the text
# opens the file and returns it.
def load_doc(filename):
    with open(filename) as fp:
        text = fp.read()
        fp.close()
        return text


def all_img_captions(filename):
    image_captions = load_doc(TOKEN_FILE)
    #print(image_captions)
    image_captions = image_captions.split('\n')
    
    img_captions_dict = defaultdict(list)
    for img_caption in image_captions[:-1]:
        img_caption = img_caption.strip() # remove new line
        img, caption = img_caption.split('\t') # split on tab
        # print(img_caption)
        img = img[:-2] # remove last 2 characters in the string
        
        img_captions_dict[img].append(caption)
    
    return img_captions_dict

def clean_text(img_caption_dict):
    for img, captions in img_caption_dict.items():
        for i, caption in enumerate(captions):
            caption = caption.lower() # lower case the words
            caption = re.sub('[^A-Za-z0-9]+', ' ', caption) # removes all characters that are not a letter or number
            caption = [word for word in caption.split() if (len(word) > 1 and word.isalpha())] # only include words of length 1 or greater, and alphabetic in nature.
            captions[i] = ' '.join(caption) # convert back to string
        
        img_caption_dict[img] = captions
    
    return img_caption_dict
    
def text_vocabulary(img_captions_dict):
    vocab_list = list()
    for captions in img_captions_dict.values(): # take all captions
        for caption in captions:
            #print(caption)
            caption = caption.split() # # make string into a word list
            vocab_list.extend(caption) # add all words into vocab_list
        
    vocab_list = list(set(vocab_list)) # remove duplicates

    return vocab_list # return list

def save_captions(img_captions_dict, filename):
    with open(filename, 'a') as fp:
        lines = []
        for img, captions in img_captions_dict.items():
            for caption in captions:
                line = img + '\t' + caption + '\n'
                lines.append(line)
        lines[-1] = lines[-1].strip() # remove new line from last string
        for line in lines:
            fp.write(line)
        
        
img_captions_dict = all_img_captions(TOKEN_FILE)
img_captions_dict = clean_text(img_captions_dict)

vocabulary = text_vocabulary(img_captions_dict)
save_captions(img_captions_dict, 'img_caption.txt')

In [32]:
features = load(open('features.p', 'rb'))

In [33]:
# loading dataset for training the model
def load_images(filename): 
    file = load_doc(filename) # open file
    imgs = file.split('\n')[:-1] # convert to list and ignore last newline char
    return imgs # return

def load_clean_img_caption(filename, images):
    subset_img_caption_dict = defaultdict(list) # create a dictionary containing lists as values
    file = load_doc(filename).split('\n') # load file given to us and split on newline
    for line in file:
        
        
        line = line.split('\t') # remove tabs
        if len(line) <= 1: # remove any empty lines 
            continue
        
        img, caption = line
        if img in images: # if image is in our set
            caption = '<start> ' + caption + ' <end>' # append identifiers
            subset_img_caption_dict[img].append(caption) # append to captions list

    return subset_img_caption_dict # return

def load_subset_features(images, all_features):
    select_features = {} # dictionary
    for image in images:
        select_features[image] = all_features[image] # set value to be feature vector corresponding to image in list
    return select_features # return

filename = TEXT_DATASET +'\\Flickr_8k.trainImages.txt'
train_imgs = load_images(filename)

train_img_captions = load_clean_img_caption('img_caption.txt', train_imgs)
train_features = load_subset_features(train_imgs, features)


In [34]:
# tokenizer

def dict_to_list(img_caption_dict):
    captions_list = []
    for captions in img_caption_dict.values():
        for caption in captions:
            captions_list.append(caption) # converts our dictionary into a list containing just the captions
    
    return captions_list 


def tokenize(captions_list):
    
    tokenizer = Tokenizer() # using keras tokenizer
    tokenizer.fit_on_texts(captions_list) # update internal vocabulary based on our list of captions
    return tokenizer # return the tokenizer

def max_length(captions_list):
    
    return max(len(caption.split()) for caption in captions_list) # return the length of the longest caption


captions_list = dict_to_list(train_img_captions)
tokenizer = tokenize(captions_list)
dump(tokenizer, open('tokenizer.p', 'wb')) # dumps our tokenizer into a file called tokenizer.p
vocab_size = len(tokenizer.word_index) + 1 # computes our total vocab size. Word index is a dictionary mapping the word to the number of occurences of that word. By taking the length of the dictionary and adding 1, we get the overall vocabulary length.
print(vocab_size)
max_length = max_length(captions_list)
print(max_length)

7317
35


In [45]:
def data_generator(img_captions_dict, features, tokenizer, max_length):
    while 1:
        for img, captions in img_captions_dict.items():
            feature = features[img][0] # get corresponding extracted image feature vector
            #print(feature)
            input_image, input_sequence, output_word = create_sequences(tokenizer, max_length, captions, feature) 
            yield [input_image, input_sequence], output_word # generates current batch of data

def create_sequences(captions, max_length, tokenizer, feature):
    X1, X2, y = list(), list(), list()
    
    for caption in captions:
        seq = tokenizer.texts_to_sequences([caption])[0] # takes each word in the text and replaces it with word_index frequency
        print(seq)
        for i in range(1, len(seq)):
            in_seq, out_seq = seq[:i], seq[i] # creates the x2 (in_seq) and y (out_seq) values. x2 value is a list that contains all words up and not including current word. y is the word to predict
            
            in_seq = pad_sequences([in_seq], maxlen=max_length)[0] # standardizes the length of the list for each in seq. This is done in order for our model to work.
            out_seq = to_categorical([out_seq], num_classes=vocab_size)[0] # converts our output array, which contains a number, to a specific categorical value. This value is dependent on the size of our vocabulary.
            
            # append to our dataset
            X1.append(feature)
            X2.append(in_seq)
            y.append(out_seq)
        
        # print(len(X1))
    return np.array(X1), np.array(X2), np.array(y) # return


[2, 42, 3, 87, 170, 6, 116, 52, 387, 11, 394, 3, 27, 4415, 625, 1]
15
[2, 18, 313, 64, 195, 119, 1]
21
[2, 39, 18, 116, 64, 195, 2062, 1]
28
[2, 39, 18, 116, 4, 394, 19, 60, 2062, 1]
37
[2, 39, 18, 3, 87, 170, 313, 64, 195, 2901, 1]
47


((47, 2048), (47, 35), (47, 7317))

In [42]:

def image_captioning_model(vocab_size, max_length):
    imageInput = Input(shape=(2048,)) # Image Feature Vector input
    fe1 = Dropout(.5)(imageInput) # dropout layer
    fe2 = Dense(256, activation='relu')(fe1) # First Fully connected layer that uses a RELU activation Function
    
    #LSTM sequence model
    textInput = Input(shape=(max_length,)) # caption input
    se1 = Embedding(vocab_size, 256, mask_zero=True)(textInput) # convert our caption input into a word embedding of input size 7317, output size 256, input length 33
    # The mask_zero parameter is a boolean paramter that is used to mask out an input value of 0. Useful for RNN Layers that take variable input (for example in our case)
    se2 = Dropout(.5)(se1) # Dropout layer to avoid overfitting
    se3 = LSTM(256)(se2) # LSTM Layer used to learn the word Embedding and output a layer with 256 neurons
    
    # Merge both of the models
    decoder1 = add([fe2,se3]) # combines the 2 functions together
    decoder2 = Dense(256, activation='relu')(decoder1) # adds another dense layer for learning the combined function
    outputs = Dense(vocab_size, activation='softmax')(decoder2) # output layer containing 7317 outputs, determined by the probability assigned by softmax
    
    img_cap_model = Model(inputs=[imageInput, textInput], outputs=outputs) # Define the model with the inputs and outputs discussed
    img_cap_model.compile(loss='categorical_crossentropy', optimizer='adam') # compile model and give it a loss of categoriacal crossentropy and use the adam optimzer
    
    print(img_cap_model.summary()) # prints summary of how the model looks like 
    plot_model(img_cap_model, to_file='model.png', show_shapes=True) # used to obtain graphical image of the model
    
    return img_cap_model # return

In [26]:
model = image_captioning_model(vocab_size, max_length) # call model
epochs=10 # initialize number of epochs
steps=len(train_img_captions) # number of train image captions there are (6000)
os.mkdir('models') # save the models

for i in range(epochs):
    generator = data_generator(train_img_captions, train_features, tokenizer, max_length) # generate the dataset
    model.fit_generator(generator, epochs=1, steps_per_epoch=steps, verbose=1) # function used to allow us to learn the model
    model.save('models/model_'+ str(i) + '.h5') # save the model

Model: "model_7"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_17 (InputLayer)           [(None, 33)]         0                                            
__________________________________________________________________________________________________
input_16 (InputLayer)           [(None, 2048)]       0                                            
__________________________________________________________________________________________________
embedding_7 (Embedding)         (None, 33, 256)      1873152     input_17[0][0]                   
__________________________________________________________________________________________________
dropout_14 (Dropout)            (None, 2048)         0           input_16[0][0]                   
____________________________________________________________________________________________



