In [9]:
# import required libraries
import numpy as np
import pandas as pd
import os
#from glob import glob
import pickle
from tqdm.notebook import tqdm

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.applications.vgg16 import VGG16,preprocess_input
from keras.preprocessing import image
from keras.utils import load_img, img_to_array, pad_sequences, to_categorical,plot_model
from keras.models import Model
from keras.layers import Input,Dense,LSTM,Embedding,Dropout,add
from keras.preprocessing.text import Tokenizer
from keras.optimizers import Adam

from PIL import Image
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

from nltk.translate.bleu_score import sentence_bleu

# Image data

In [15]:
# All locations are resolved relative to the directory the notebook is started from.
working_dir = os.getcwd()
# dataset root: <working_dir>/Data
data_path = os.path.join(working_dir, 'Data')
# image folder: <working_dir>/Data/Images
image_dir = os.path.join(data_path, 'Images')

In [11]:
# Echo the value of working_dir as the cell output (sanity check of the resolved path).
working_dir

'C:\\Users\\Binh_Hong_Ngoc\\Meine Daten\\Programming\\My Projects\\Image Caption Generator'

In [12]:
# Echo the value of data_path as the cell output (sanity check of the resolved path).
data_path

'C:\\Users\\Binh_Hong_Ngoc\\Meine Daten\\Programming\\My Projects\\Image Caption Generator\\Data'

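We use the pre-trained VGG16 model as the image encoder. It is worth noting that VGG16 ends with a Dense layer, which is responsible for the classification task. That layer is not needed for feature extraction, so the last layer is removed and the output of the second-to-last layer serves as the image feature vector.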

In [13]:
# Instantiate the stock VGG16 network with its default constructor arguments.
vgg16_full = VGG16()
# Build the feature extractor: same inputs, but the output is taken from the
# second-to-last layer, i.e. the final layer of the network is dropped.
model = Model(inputs=vgg16_full.inputs, outputs=vgg16_full.layers[-2].output)

In [14]:
# Print the layer-by-layer summary (layer type, output shape, parameter count)
# of the truncated model built in the previous cell.
model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 224, 224, 3)]     0         
                                                                 
 block1_conv1 (Conv2D)       (None, 224, 224, 64)      1792      
                                                                 
 block1_conv2 (Conv2D)       (None, 224, 224, 64)      36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, 112, 112, 64)      0         
                                                                 
 block2_conv1 (Conv2D)       (None, 112, 112, 128)     73856     
                                                                 
 block2_conv2 (Conv2D)       (None, 112, 112, 128)     147584    
                                                                 
 block2_pool (MaxPooling2D)  (None, 56, 56, 128)       0   

In the next step, I extract the image features with the truncated VGG16 model.

In [16]:
# Maps image ID (file name up to the first '.') -> array returned by model.predict.
features = {}

# os.listdir also returns sub-directories (e.g. Jupyter's .ipynb_checkpoints) and
# hidden files, which would make load_img fail; keep regular, non-hidden files only.
# Sorting makes the processing order reproducible (os.listdir order is unspecified).
image_files = sorted(
    name for name in os.listdir(image_dir)
    if not name.startswith('.') and os.path.isfile(os.path.join(image_dir, name))
)

# extract image features
for img_name in tqdm(image_files):  # tqdm renders a progress bar
    img_path = os.path.join(image_dir, img_name)
    # The default input size for this model is 224 x 224, so resize while loading.
    img = load_img(img_path, target_size=(224, 224))
    # Named img_array (not `image`) so the `image` module imported from
    # keras.preprocessing at the top of the notebook is not shadowed.
    img_array = img_to_array(img)
    # Add a leading batch axis: (H, W, C) -> (1, H, W, C).
    img_batch = img_array.reshape((1, img_array.shape[0], img_array.shape[1], img_array.shape[2]))
    # VGG-specific preprocessing of the pixel values
    img_batch = preprocess_input(img_batch)
    # extract features
    feature = model.predict(img_batch, verbose=0)
    # image ID = file name up to the first '.'; the caption mapping derives its keys
    # the same way, so both dictionaries can be joined on this ID.
    image_id = img_name.split('.')[0]
    # save feature
    features[image_id] = feature

  0%|          | 0/8091 [00:00<?, ?it/s]

In [23]:
# Save the image features in the "Data" folder using pickle.
# The `with` block closes the file handle as soon as the dump finishes, so the
# data is flushed to disk and no open handle is left behind in the kernel.
with open(os.path.join(data_path, 'features.pkl'), 'wb') as features_file:
    pickle.dump(features, features_file)

# Text data

In [24]:
# Read the captions file into one string.
# The path is built component-wise: a backslash inside a normal string literal is an
# escape character ('\c' is an invalid escape sequence) and is only a path separator
# on Windows, so a hard-coded 'Captions\...' literal is neither clean nor portable.
captions_path = os.path.join(data_path, 'Captions', 'captions.txt')
with open(captions_path, 'r') as f:
    # The first line of captions.txt is the header "image,caption" -> skip it.
    # The default argument avoids a StopIteration if the file is empty.
    next(f, None)
    captions_doc = f.read()

If we take a closer look at `captions_doc`, we can see that each image has five corresponding captions, one per line in the form `<image file name>,<caption>`, which look like this:

`1000268201_693b08cb0e.jpg,A child in a pink dress is climbing up a set of stairs in an entry way .\n1000268201_693b08cb0e.jpg,A girl going into a wooden building .\n1000268201_693b08cb0e.jpg,A little girl climbing into a wooden playhouse .\n1000268201_693b08cb0e.jpg,A little girl climbing the stairs to her playhouse .\n1000268201_693b08cb0e.jpg,A little girl in a pink dress going into a wooden cabin .`

In [None]:
# dictionary with keys=image_id and values=list of captions for that image
mapping = {}

for line in tqdm(captions_doc.split('\n')):
    # Split at the FIRST comma only: everything before it is the file name, everything
    # after it is the caption. Splitting on every comma and re-joining with spaces
    # would strip commas out of the caption text and leave double spaces behind.
    file_name, separator, caption = line.partition(',')
    # Skip lines without a comma (e.g. the trailing empty line): they carry no caption.
    if not separator:
        continue
    # Remove the extension from the image ID (file name up to the first '.')
    image_id = file_name.split('.')[0]
    # Create the caption list on first sight of the image, then add the caption
    mapping.setdefault(image_id, []).append(caption)

In [None]:
# Preview: the first `npic` images (left column) next to their captions (right column).
npic = 5
target_size = (299, 299)  # display size only; unrelated to the 224 x 224 network input
path = os.path.join(data_path, 'Images/')
fig = plt.figure(figsize=(10,20))

# Sorted for a reproducible selection (os.listdir order is unspecified). Entries that
# have no captions (stray files, checkpoint folders) are skipped so the mapping lookup
# below cannot raise a KeyError.
sample_files = [
    name for name in sorted(os.listdir(path))
    if name.split(".")[0] in mapping
][:npic]

for row, img in enumerate(sample_files):
    filename = os.path.join(path, img)
    captions = list(mapping[img.split(".")[0]])
    image_load = load_img(filename, target_size=target_size)

    # left cell of this row: the image itself, without ticks
    ax = fig.add_subplot(npic, 2, 2 * row + 1, xticks=[], yticks=[])
    ax.imshow(image_load)

    # right cell of this row: one text line per caption on an otherwise empty axes
    ax = fig.add_subplot(npic, 2, 2 * row + 2)
    ax.axis('off')  # explicit Axes call instead of the pyplot "current axes" state
    ax.set_xlim(0, 1)
    ax.set_ylim(0, len(captions))
    for i, caption in enumerate(captions):
        ax.text(0, i, caption, fontsize=20)
plt.show()