In [1]:
import tensorflow
import keras

In [2]:
from os import listdir
from keras.applications.vgg19 import VGG19
from keras.preprocessing.image import load_img
from keras.preprocessing.image import img_to_array
from keras.applications.vgg19 import preprocess_input
from keras.models import Model

In [None]:
def getFeatures(directory):
    """Extract a VGG19 'fc2' feature vector for every file in a directory.

    Parameters
    ----------
    directory : str
        Folder whose entries are all passed to ``load_img``
        (assumes it contains only image files -- TODO confirm).

    Returns
    -------
    dict
        Maps each image id (the file name up to its first '.') to the array
        returned by ``model.predict`` for that single image
        (presumably shape (1, 4096) -- verify against the fc2 layer).
    """
    # The complete ImageNet-trained VGG19 is loaded (dense layers included)
    # and then cut at 'fc2', so the final classification layer is dropped
    # and the fc2 activations serve as the image features.
    model = VGG19(weights = 'imagenet')
    model = Model(inputs = model.input, outputs = model.get_layer('fc2').output)
    # summary() prints the table itself; wrapping it in print() would add a
    # stray "None" line.
    model.summary()

    features_dict = {}

    # listdir() order is arbitrary; sorting keeps the dict order stable
    # between runs.
    for count, file in enumerate(sorted(listdir(directory)), start = 1):
        # remove the extension of the file name
        image_id = file.split('.')[0]
        filename = directory + '/' + file
        # count is the 1-based number of files handled so far
        print(f"Loading File Name: {file}, Total Files: {count}")
        # load image 224 x 224 with 3 channels
        image = load_img(filename, target_size = (224, 224))
        # convert image to 3D numpy array and add a leading batch axis (4D)
        image = img_to_array(image)
        image = image.reshape((1,) + tuple(image.shape))
        # apply the VGG19-specific input preprocessing
        image = preprocess_input(image)
        features_dict[image_id] = model.predict(image, verbose = 0)

    return features_dict

# Run the (slow) VGG19 feature extraction over every file in the dataset
# folder and report how many feature entries were produced.
directory = 'Flickr8k_Dataset'
features_dict = getFeatures(directory)
print(f'Total Extracted Features: {len(features_dict)}')

In [None]:
from pickle import dump
# Cache the extracted features on disk; the context manager guarantees the
# file is flushed and closed even if pickling fails.
with open('features.pkl', 'wb') as features_file:
    dump(features_dict, features_file)

In [3]:
from pickle import load
# Reload the cached features. NOTE: unpickling can execute arbitrary code,
# so only load a features.pkl that this notebook itself produced.
with open('features.pkl', 'rb') as features_file:
    features_dict = load(features_file)

In [4]:
from random import shuffle
from random import Random

# Fixed seed so the train/validation split (and the vocabulary later built
# from the training captions) is the same after every kernel restart.
SPLIT_SEED = 42
TRAIN_FRACTION = 0.85

images = list(features_dict.keys())
index_shuf = list(range(len(images)))
# A dedicated generator keeps the global random state untouched.
Random(SPLIT_SEED).shuffle(index_shuf)

# Image ids re-ordered according to the shuffled index list.
shuffled_images = [images[position] for position in index_shuf]

train_split = int(round(len(images) * TRAIN_FRACTION, 0))

train_images = shuffled_images[ : train_split]
print(f"Training Set Size: {len(train_images)}")
val_images = shuffled_images[train_split : ]
print(f"Validation Set Size: {len(val_images)}")

Training Set Size: 6877
Validation Set Size: 1214


In [5]:
import string

def load_captions (file):
    """Read a caption file and group its captions by image id.

    Every usable line is split on whitespace: the first token identifies the
    image and the remaining tokens, re-joined with single spaces, form the
    caption. The image id is the first token up to its first '.'.

    Parameters
    ----------
    file : str
        Path of the caption text file to read.

    Returns
    -------
    dict
        Maps image id to the list of its captions, in file order.
    """
    # Read from the path given as argument, not from a notebook-level
    # variable; the context manager closes the file even on errors.
    with open(file, 'r') as handle:
        captions = handle.read()

    captions_dict = {}

    for caption in captions.split('\n'):
        tokens = caption.split()
        # Skip empty / one-character lines, and whitespace-only lines that
        # yield no tokens (indexing tokens[0] would fail on those).
        if len(caption) < 2 or not tokens:
            continue
        image_id = tokens[0].split('.')[0]
        image_caption = ' '.join(tokens[1:])
        captions_dict.setdefault(image_id, []).append(image_caption)

    return captions_dict

def clean_captions (captions_dict):
    """Normalise every caption in place and wrap it in start/end markers.

    Each caption is split on whitespace and lower-cased; words that are
    punctuation marks, shorter than two characters, or not purely alphabetic
    are dropped. The surviving words are joined with single spaces between
    '<START> ' and ' <END>'. The lists inside ``captions_dict`` are modified
    directly and nothing is returned.
    """
    excluded = list(string.punctuation) + ['', ""]

    for caption_list in captions_dict.values():
        for position, raw_caption in enumerate(caption_list):
            kept_words = []
            for word in raw_caption.split():
                word = word.lower()
                if word in excluded:
                    continue
                if len(word) <= 1:
                    continue
                if not word.isalpha():
                    continue
                kept_words.append(word)
            caption_list[position] = '<START> ' + ' '.join(kept_words) + ' <END>'

def load_dataset (captions_dict, features_dict, images):
    """Pick the captions and features belonging to the given image ids.

    Returns a ``(selected_captions, selected_features)`` pair of dicts keyed
    by the ids in ``images``. An id missing from either source dict raises
    ``KeyError``.
    """
    selected_captions, selected_features = {}, {}

    for image_id in images:
        # Features are looked up before captions for every id.
        feature = features_dict[image_id]
        caption_list = captions_dict[image_id]
        selected_features[image_id] = feature
        selected_captions[image_id] = caption_list

    return selected_captions, selected_features

In [6]:
# Caption file to read.
filename = 'Flickr8k.token.txt'

# Load all captions grouped by image id, then clean them in place.
captions_dict = load_captions(filename)
print(f"Total Images With Captions: {len(captions_dict)}")
clean_captions(captions_dict)

# Restrict captions and features to the training image ids.
train_captions, train_features = load_dataset(captions_dict, features_dict, train_images)
print(f"Training Images with Captions: {len(train_captions)}")
print(f"Training Images Features: {len(train_features)}")

# Same selection for the validation image ids.
val_captions, val_features = load_dataset(captions_dict, features_dict, val_images)
print(f"Validation Images with Captions: {len(val_captions)}")
print(f"Validation Images Features: {len(val_features)}")

Total Images With Captions: 8092
Training Images with Captions: 6877
Training Images Features: 6877
Validation Images with Captions: 1214
Validation Images Features: 1214


In [7]:
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Model
from keras.layers import Input
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding
from keras.layers import Dropout
from keras.layers.merge import add

In [8]:
def flatten_captions (captions_dict):
    """Collect every caption of every image into one flat list.

    Parameters
    ----------
    captions_dict : dict
        Maps image id to an iterable of captions.

    Returns
    -------
    list
        All captions, in dict order and then per-image order.
    """
    captions = []

    # extend() appends in place without building a throwaway list, which a
    # comprehension used only for its side effects would do.
    for image_captions in captions_dict.values():
        captions.extend(image_captions)

    return captions

def create_tokenizer (captions):
    """Build a Keras ``Tokenizer`` with default settings and fit it on ``captions``."""
    fitted = Tokenizer()
    fitted.fit_on_texts(captions)
    return fitted

def max_length (captions):
    """Return the largest whitespace-separated word count among ``captions``.

    Raises ``ValueError`` when ``captions`` is empty.
    """
    word_counts = [len(text.split()) for text in captions]
    return max(word_counts)

def generate_sequences (tokenizer, max_len, captions_dict, features_dict, vocab_size):
    """Expand every caption into (image feature, word prefix, next word) samples.

    Each caption is encoded with ``tokenizer``; for every split point the
    prefix is padded to ``max_len`` and the following word id is one-hot
    encoded over ``vocab_size`` classes. The image input of a sample is the
    first element of that image's entry in ``features_dict``.

    Returns three arrays: image inputs, padded prefixes and one-hot targets.
    """
    image_inputs, text_inputs, targets = [], [], []

    for image_id in captions_dict:
        for caption in captions_dict[image_id]:
            encoded = tokenizer.texts_to_sequences([caption])[0]
            # One training sample per position after the first word.
            for cut in range(1, len(encoded)):
                prefix = pad_sequences([encoded[:cut]], maxlen = max_len)[0]
                next_word = to_categorical([encoded[cut]], num_classes = vocab_size)[0]
                image_inputs.append(features_dict[image_id][0])
                text_inputs.append(prefix)
                targets.append(next_word)

    return array(image_inputs), array(text_inputs), array(targets)

In [9]:
# Fit the tokenizer on the training captions only; +1 because index 0 is
# not assigned to any word in word_index (it is the padding value used by
# pad_sequences).
captions_list = flatten_captions(train_captions)
tokenizer = create_tokenizer(captions_list)
vocabulary_size = len(tokenizer.word_index) + 1
print(f"Training Vocabulary Size: {vocabulary_size}")

# Longest training caption, measured in whitespace-separated words.
max_len = max_length(captions_list)
print(f"Maximum Caption Length: {max_len}")

# Build the model inputs/targets for both splits with the same tokenizer.
X1_train, X2_train, y_train = generate_sequences(tokenizer, max_len, train_captions, train_features, vocabulary_size)
X1_val, X2_val, y_val = generate_sequences(tokenizer, max_len, val_captions, val_features, vocabulary_size)

Training Vocabulary Size: 7780
Maximum Caption Length: 33


In [10]:
# define the captioning model
def generate_model (vocab_size, max_len, feature_size = 4096, units = 512, dropout_rate = 0.5):
    """Build and compile the two-input captioning model.

    Parameters
    ----------
    vocab_size : int
        Number of classes of the softmax output and size of the embedding
        vocabulary.
    max_len : int
        Length of the (padded) caption input.
    feature_size : int, optional
        Length of the image feature vector input (default 4096).
    units : int, optional
        Width of the dense, embedding and LSTM layers (default 512).
    dropout_rate : float, optional
        Rate of both dropout layers (default 0.5).

    Returns
    -------
    The compiled Keras ``Model`` taking ``[image_features, captions]``.
    """
    # Image branch: dropout followed by a dense projection to `units`.
    image_features = Input(shape = (feature_size,))
    tune_dropout = Dropout(dropout_rate)(image_features)
    tune_layer = Dense(units, activation = 'relu')(tune_dropout)

    # Text branch: embedding (index 0 masked), dropout, then an LSTM.
    captions = Input(shape = (max_len,))
    seq_enc_embed = Embedding(vocab_size, units, mask_zero = True)(captions)
    seq_enc_dropout = Dropout(dropout_rate)(seq_enc_embed)
    seq_enc_layer = LSTM(units)(seq_enc_dropout)

    # Both branches have width `units`, so they are merged by element-wise add.
    dec_combine = add([tune_layer, seq_enc_layer])
    dec_layer = Dense(units, activation = 'relu')(dec_combine)

    output_caption = Dense(vocab_size, activation = 'softmax')(dec_layer)

    model = Model(inputs = [image_features, captions], outputs = output_caption)
    model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

    # summary() prints the table itself; wrapping it in print() would add a
    # stray "None" line.
    model.summary()

    return model

In [11]:
# define the model
# Build the captioning model and train it for 5 epochs, evaluating on the
# validation arrays after every epoch; `history` keeps the per-epoch metrics.
model = generate_model(vocabulary_size, max_len)
history = model.fit([X1_train, X2_train], y_train, epochs = 5, verbose=1, validation_data=([X1_val, X2_val], y_val))

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 33)]         0                                            
__________________________________________________________________________________________________
input_1 (InputLayer)            [(None, 4096)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 33, 512)      3983360     input_2[0][0]                    
__________________________________________________________________________________________________
dropout (Dropout)               (None, 4096)         0           input_1[0][0]                    
______________________________________________________________________________________________

KeyboardInterrupt: 

In [None]:
# Per-epoch training and validation loss recorded by model.fit
# (requires the training cell to have run to completion).
print(history.history['loss'])
print(history.history['val_loss'])

In [None]:
from numpy import argmax
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from nltk.translate.bleu_score import corpus_bleu

In [None]:
# map an integer to a word
def word_for_id (integer, tokenizer):
    """Return the word whose index in ``tokenizer.word_index`` equals ``integer``.

    Falls back to the string '<UNK>' when no word has that index.
    """
    matches = (
        word
        for word, index in tokenizer.word_index.items()
        if index == integer
    )
    return next(matches, '<UNK>')

# generate a description for an image
def generate_caption (model, tokenizer, image, max_len):
    """Greedily generate a caption for one image feature entry.

    Starting from '<START>', the text so far is encoded, padded to
    ``max_len`` and fed to ``model`` together with ``image``; the word with
    the highest predicted score is appended. Generation stops after the word
    'end' has been appended or after ``max_len`` steps, and the whole text
    (including the leading '<START>') is returned.
    """
    caption_so_far = '<START>'

    for _ in range(max_len):
        encoded = tokenizer.texts_to_sequences([caption_so_far])[0]
        padded = pad_sequences([encoded], maxlen = max_len)
        scores = model.predict([image, padded], verbose=0)
        word = word_for_id(argmax(scores), tokenizer)

        # The predicted word is always appended, the end word included.
        caption_so_far = caption_so_far + ' ' + word
        if word == 'end':
            break

    return caption_so_far

# evaluate the skill of the model
def evaluate_model (model, tokenizer, max_len, captions_dict, features_dict):
    """Generate a caption per image and print corpus BLEU-1..4 scores.

    Parameters
    ----------
    model, tokenizer, max_len
        Passed through to ``generate_caption``.
    captions_dict : dict
        Maps image id to its reference captions.
    features_dict : dict
        Maps image id to the feature entry handed to the model.

    Returns
    -------
    (actual, predicted)
        ``actual`` holds, per image, the list of tokenised reference
        captions; ``predicted`` holds the tokenised generated caption.
    """
    actual, predicted = [], []

    for image_id, captions in captions_dict.items():
        yhat = generate_caption(model, tokenizer, features_dict[image_id], max_len)
        actual.append([caption.split() for caption in captions])
        predicted.append(yhat.split())

    # NOTE(review): generate_caption emits the lowercase word 'end' while
    # clean_captions writes '<END>' into the references, so the end marker
    # can never match -- confirm whether markers should be stripped before
    # scoring.

    # Cumulative BLEU-n uses uniform weights 1/n over the first n n-gram
    # orders, so each weight tuple sums to 1 (BLEU-3 is 1/3 each, not 0.3).
    bleu_weights = (
        ('BLEU-1', (1.0, 0, 0, 0)),
        ('BLEU-2', (0.5, 0.5, 0, 0)),
        ('BLEU-3', (1 / 3, 1 / 3, 1 / 3, 0)),
        ('BLEU-4', (0.25, 0.25, 0.25, 0.25)),
    )
    for label, weights in bleu_weights:
        print(f"{label}: {corpus_bleu(actual, predicted, weights=weights)}")

    return actual, predicted

# Score the trained model on the validation split; keeps the tokenised
# reference and generated captions for later inspection.
actual_captions, predicted_captions = evaluate_model(model, tokenizer, max_len, val_captions, val_features)