In [1]:
import numpy as np
import cv2 
import os
from glob import glob
import matplotlib.pyplot as plt

from keras.applications import ResNet50
from keras.models import Model
import copy

In [2]:
# Dataset locations, relative to the notebook's working directory.
images_path = 'Flickr_Data/Images/'
# glob() gives no ordering guarantee, so sort the matches: the feature-extraction
# loop stops after a fixed number of images, and that subset should be the same
# on every run and every machine.
images = sorted(glob(images_path + '*.jpg'))
captions_path = "Flickr_Data/Flickr_TextData/FLickr8k.token.txt"

In [3]:
# Instantiate ResNet50 including its classification head (include_top=True); the
# following cell rebuilds the model so that an earlier layer becomes the output.
# NOTE(review): no `weights` argument is passed, so the library default applies —
# presumably pretrained ImageNet weights; confirm.
resnet_model = ResNet50(include_top=True)

In [4]:
# restructure model: expose the pooled features (the layer just before the
# classifier) as the model output.
# The layer is selected by name rather than by position so that re-running this
# cell is a no-op; with `layers[-2]`, each re-run would strip one more layer from
# the already-truncated model because `resnet_model` is reassigned in place.
resnet_model = Model(
    inputs=resnet_model.input,
    outputs=resnet_model.get_layer('avg_pool').output,
)

In [None]:
# preprocess the images: run each image through the truncated ResNet50 and store
# the resulting 2048-element feature vector under the image's file name
img_feature_vectors = {}

# cap on the number of images encoded by this cell
MAX_IMAGES = 1499

count = 0
for item in images:
    img = cv2.imread(item)
    if img is None:
        # cv2.imread returns None (it does not raise) when a file cannot be read
        print('skipping unreadable image:', item)
        continue
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img = cv2.resize(img, (224, 224))
    img = img.reshape(1, 224, 224, 3)

    # NOTE(review): pixels go into the network as raw RGB values; the
    # ResNet50-specific `preprocess_input` is not applied — confirm this is intended.
    feature_vector = resnet_model.predict(img, verbose=0).reshape(2048,)

    # os.path.basename understands the platform's path separator(s). Splitting on
    # '\\' only isolated the file name on Windows; elsewhere the whole path became
    # the key, so the caption file names could never match it.
    img_filename = os.path.basename(item)
    img_feature_vectors[img_filename] = feature_vector

    count += 1

    # progress indicator
    if count % 50 == 0:
        print(count)

    if count == MAX_IMAGES:
        break

In [None]:
# Persist the file-name -> feature-vector dictionary to disk so it can be loaded
# back later instead of being recomputed. The object saved is a plain dict, which
# is why the matching np.load call needs allow_pickle and .item().
np.save('image_feature_vectors.npy', img_feature_vectors)

In [5]:
# load in dictionary of image feature vectors
# allow_pickle expects a boolean; the string 'TRUE' only worked because any
# non-empty string is truthy (the string 'FALSE' would have enabled it as well).
# NOTE: unpickling can execute arbitrary code — only load .npy files you created.
# .item() unwraps the 0-d object array back into the original dict.
img_feature_vectors = np.load('image_feature_vectors.npy', allow_pickle=True).item()

In [6]:
# lowercases a caption and wraps it in the marker words 'sos' (front) and 'eos' (back)
def process_string(s):
    """Return ``s`` lowercased, prefixed with 'sos ' and suffixed with ' eos'."""
    return f"sos {s.lower()} eos"

In [7]:
# preprocess the captions data
captions_dict = {}

# Read all lines up front. The context manager closes the file handle, which the
# previous bare open(...).read() never did.
with open(captions_path, 'r') as captions_file:
    f = captions_file.read().split('\n')

# generate a dictionary of filenames to a list of captions
for line in f:
    filename_caption = line.split('\t')
    if len(filename_caption) < 2:
        # Blank or malformed line (no tab separator): skip it explicitly. This is
        # the only case the old bare `except: pass` was needed for, and checking
        # it directly no longer hides unrelated errors.
        continue

    # the last two characters of the first field are dropped
    # (presumably a '#<digit>' caption index — confirm against the data file)
    filename = filename_caption[0][:-2]
    caption = process_string(filename_caption[1])

    # keep captions only for images that have a feature vector
    if filename in img_feature_vectors:
        captions_dict.setdefault(filename, []).append(caption)

In [8]:
# build `vocab`: every distinct word found in the captions mapped to a unique
# integer, numbered from 1 in order of first appearance
vocab = {}

count = 1  # next integer to hand out
for caption_list in captions_dict.values():
    for sentence in caption_list:
        for token in sentence.split():
            if token in vocab:
                continue
            vocab[token] = count
            count += 1

In [9]:
# converts a string into the list of integers that `adict` assigns to its words
def encode_string(s, adict):
    """Return the ``adict`` value of each whitespace-separated word of ``s``, in order.

    Raises KeyError if a word is not a key of ``adict``.
    """
    return [adict[token] for token in s.split()]

In [25]:
# encoded counterpart of captions_dict: same file names, but every caption string
# is replaced by the list of integers its words map to in `vocab`
captions_dict_encoded = {
    image_name: [encode_string(text, vocab) for text in caption_list]
    for image_name, caption_list in captions_dict.items()
}

In [26]:
# length of the longest encoded caption in the data set (0 when there are none)
max_len = max(
    (
        len(encoded)
        for caption_list in captions_dict_encoded.values()
        for encoded in caption_list
    ),
    default=0,
)

In [27]:
# Reuse the cached encoded captions when the file exists; otherwise write the
# dictionary built earlier in the notebook, so that a fresh Restart & Run All does
# not fail on a file that no cell has created yet.
# NOTE(review): a cache produced from a different vocabulary would disagree with
# `vocab` / `max_len` — delete the .npy file whenever the captions change.
# allow_pickle expects a boolean (the string 'TRUE' only worked by being truthy);
# unpickling can execute arbitrary code, so only load files you created.
if os.path.exists('captions_dict_encoded.npy'):
    captions_dict_encoded = np.load('captions_dict_encoded.npy', allow_pickle=True).item()
else:
    np.save('captions_dict_encoded.npy', captions_dict_encoded)

In [28]:
def generate_training_data(image_feature_vectors, captions_dict_encoded,
                           max_caption_len=None, vocab_size=None):
    """Expand every encoded caption into one training sample per word.

    For the word at position ``i`` of a caption, one sample is appended:
      * X     - the feature vector stored for the caption's file name,
      * y_in  - the ``i`` word ids preceding it, right-padded with 0 to
                ``max_caption_len`` entries,
      * y_out - a one-hot list of ``vocab_size`` entries marking that word
                (word id ``k`` sets index ``k - 1``).

    Args:
        image_feature_vectors: dict mapping file name -> feature vector.
        captions_dict_encoded: dict mapping file name -> list of captions, each
            caption being a list of integer word ids.
        max_caption_len: padded length of every ``y_in`` entry. Defaults to the
            notebook-level ``max_len``.
        vocab_size: length of every ``y_out`` entry. Defaults to ``len(vocab)``.

    Returns:
        Tuple ``(X, y_in, y_out)`` of three parallel lists.

    Raises:
        KeyError: a captioned file name is missing from ``image_feature_vectors``.
        IndexError: a word id is larger than ``vocab_size``.
    """
    # Fall back to the notebook globals only when the caller did not pass the
    # sizes explicitly; resolving them once also keeps len(vocab) out of the loop.
    if max_caption_len is None:
        max_caption_len = max_len
    if vocab_size is None:
        vocab_size = len(vocab)

    X = []
    y_in = []
    y_out = []

    for filename, captions in captions_dict_encoded.items():
        for caption in captions:
            for i, word in enumerate(caption):
                # input: the words seen so far, padded/truncated to a fixed length
                prefix = caption[:i]
                y_in.append((prefix + [0] * max_caption_len)[:max_caption_len])

                # target: one-hot encoding of the current word
                one_hot = [0] * vocab_size
                one_hot[word - 1] = 1
                y_out.append(one_hot)

                X.append(image_feature_vectors[filename])
    return X, y_in, y_out

In [29]:
# Build the three parallel training lists (image feature vectors, padded input
# prefixes, one-hot target words) from the loaded features and encoded captions.
X, y_in, y_out = generate_training_data(img_feature_vectors, captions_dict_encoded)

In [30]:
# unbind the three training lists so the memory they hold can be reclaimed
del X, y_in, y_out