#Image Caption Generator 
###Author: Atta Ali 
<br>

####Approach: 
- Dataset Used: Flickr 8k 
- Encoder - Decoder Model 
- Extract features from each image using Xception (a pre-trained CNN model) 
- CNN for Image Processing
- RNN/LSTM for Text Processing 



In [19]:
import numpy as np
from PIL import Image
import os
import string
from pickle import dump
from pickle import load
from keras.applications.xception import Xception 
from keras.applications.xception import preprocess_input
import keras.preprocessing
from tensorflow.keras.utils import load_img
from tensorflow.keras.utils import img_to_array
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Model, load_model
from keras.layers import Input, Dense
from keras.layers import LSTM, Embedding, Dropout
from tqdm import tqdm_notebook as tqdm #to check loop progress
tqdm().pandas()

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  tqdm().pandas()


0it [00:00, ?it/s]

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [12]:
# Load the document file into memory
def load_fp(filename):
  # Open file to read
  file = open(filename, 'r')
  text = file.read()
  file.close()
  return text

# get all images with their captions
def img_capt(filename):
  data = open(filename, 'r')
  file = data.read() 
  captions = file.split('\n')
  descriptions ={}
  for caption in captions[:-1]:
        img, caption = caption.split('\t')
        if img[:-2] not in descriptions:
            descriptions[img[:-2]] = [caption]
        else:
            descriptions[img[:-2]].append(caption)
  return descriptions


#Data cleaning function will convert all upper case alphabets to lowercase
#removing punctuations and words containing numbers
def txt_clean(captions):
  table = str.maketrans('','',string.punctuation)
  for img,caps in captions.items():
    for i,img_caption in enumerate(caps):
          img_caption.replace("-"," ")
          descp = img_caption.split()
          #uppercase to lowercase
          descp = [wrd.lower() for wrd in descp]
          #remove punctuation from each token
          descp = [wrd.translate(table) for wrd in descp]
          #remove hanging 's and a
          descp = [wrd for wrd in descp if(len(wrd)>1)]
          #remove words containing numbers with them
          descp = [wrd for wrd in descp if(wrd.isalpha())]
          #converting back to string
          img_caption = ' '.join(descp)
          captions[img][i]= img_caption
  return captions

# To build vocab of all unique words
def txt_vocab(descriptions):
  vocab = set()
  for key in descriptions.keys():
      [vocab.update(d.split()) for d in descriptions[key]]
  return vocab

#To save all descriptions in one file
def save_descriptions(descriptions, filename):
  lines = list()
  for key, desc_list in descriptions.items():
    for desc in desc_list:
          lines.append(key + 't' + desc )
    data = "\n".join(lines)
  file = open(filename,"w")
  file.write(data)
  file.close()

dataset_text = "/content/drive/MyDrive/Flickr8k_text"
dataset_images = "/content/drive/MyDrive/Flicker8k_Dataset"
filename = dataset_text + "/" + "Flickr8k.token.txt"

descriptions = img_capt(filename)
print("Length of descriptions =" ,len(descriptions))

clean_descriptions = txt_clean(descriptions)
vocabulary = txt_vocab(clean_descriptions)
print("Length of vocabulary = ", len(vocabulary))

save_descriptions(clean_descriptions, "descriptions.txt")

Length of descriptions = 8092
Length of vocabulary =  8763


In [25]:
def extract_features(dir):
       model = Xception(include_top=False, pooling='avg')
       features = {}
       for pic in tqdm(os.listdir(dir)):
           file = dir + "/" + pic
           image = Image.open(file)
           image = image.resize((299,299))
           image = np.expand_dims(image, axis=0)
           image = image/127.5
           image = image - 1.0
           feature = model.predict(image)
           features[pic] = feature
       return features

#2048 feature vector
features = extract_features(dataset_images)
dump(features, open("features.p","wb"))
#to directly load the features from the pickle file.
features = load(open("features.p","rb"))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for pic in tqdm(os.listdir(dir)):


  0%|          | 0/8091 [00:00<?, ?it/s]

[1;30;43mStreaming output truncated to the last 5000 lines.[0m


In [80]:
#load the data
def load_photos(filename):
  data = open(filename, 'r')
  file = data.read() 
  photos = file.split("\n")[:-1]
  return photos
   
def load_clean_descriptions(filename, photos):
  #loading clean_descriptions
   data = open(filename, 'r')
   file = data.read() 
   trained_descriptions = {} 
   for line in file.split("\n"):
       words = line.split('t', 1)
       if len(words)<1 :
           continue
       image, image_caption = words[0], words[1].split()
       if image in photos:
          if image not in trained_descriptions:
               trained_descriptions[image] = []
          desc = ' ' + " ".join(image_caption) + ' '
          trained_descriptions[image].append(desc)
   return trained_descriptions

def load_features(photos):
   all_features = load(open("features.p","rb"))
   features = {k:all_features[k] for k in photos}
   return features

filename = dataset_text + "/" + "Flickr_8k.trainImages.txt"
#train = loading_data(filename)
train_imgs = load_photos(filename)
train_descriptions = load_clean_descriptions("descriptions.txt", train_imgs)
train_features = load_features(train_imgs)

In [81]:
#print(train_imgs['1022454332_6af2c1449a.jpg'])
if '1022454332_6af2c1449a.jpg' in train_imgs: 
  print('yes')
else: 
  print('not in trained images')

if '1022454332_6af2c1449a.jpg' in train_descriptions:
  print('yes')  
else: print('no')

print(len(train_imgs))
print(len(train_descriptions))

not in trained images
no
6000
6000


In [28]:
from keras.preprocessing.text import Tokenizer

#convert dictionary to list of descriptions
def dict_to_list(descriptions):
    all_desc = []
    for key in descriptions.keys():
       [all_desc.append(d) for d in descriptions[key]]
    return all_desc

def create_tokenizer(descriptions):
    desc_list = dict_to_list(descriptions)
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(desc_list)
    return tokenizer

#calculate maximum length of descriptions
def max_length(descriptions):
    desc_list = dict_to_list(descriptions)
    return max(len(d.split()) for d in desc_list)

# give each word an index, and store that into tokenizer.p pickle file
tokenizer = create_tokenizer(train_descriptions)
dump(tokenizer, open('tokenizer.p', 'wb'))
vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)

max_length = max_length(descriptions)
print(max_length) 

8764
32


In [29]:
def data_generator(descriptions, features, tokenizer, max_length):
  while 1:
    for key, description_list in descriptions.items(): 
              #retrieve photo features
              feature = features[key][0]
              inp_image, inp_seq, op_word = create_sequences(tokenizer, max_length, description_list, feature)
              yield [[inp_image, inp_seq], op_word]

def create_sequences(tokenizer, max_length, desc_list, feature):
   x_1, x_2, y = list(), list(), list()
   # move through each description for the image
   for desc in desc_list:
      # encode the sequence
       seq = tokenizer.texts_to_sequences([desc])[0]
      # divide one sequence into various X,y pairs
       for i in range(1, len(seq)):
          # divide into input and output pair
           in_seq, out_seq = seq[:i], seq[i]
          # pad input sequence
           in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
          # encode output sequence
           out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
          # store
           x_1.append(feature)
           x_2.append(in_seq)
           y.append(out_seq)
   return np.array(x_1), np.array(x_2), np.array(y)

#Checking the shape of x1, x2 and y
[a,b],c = next(data_generator(train_descriptions, features, tokenizer, max_length))
a.shape, b.shape, c.shape
#((47, 2048), (47, 32), (47, 7577))

((37, 2048), (37, 32), (37, 8764))

In [47]:
from keras.layers import *
from numpy.core.multiarray import concatenate
from keras.utils import plot_model

# define the captioning model
def define_model(vocab_size, max_length):
  # features from the CNN model compressed from 2048 to 256 nodes
   inputs1 = Input(shape=(2048,))
   fe1 = Dropout(0.5)(inputs1)
   fe2 = Dense(256, activation='relu')(fe1)

  # LSTM sequence model
   inputs2 = Input(shape=(max_length,))
   se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
   se2 = Dropout(0.5)(se1)
   se3 = LSTM(256)(se2)

  # Merging both models
   decoder1 = Add()([fe2, se3])
   decoder2 = Dense(256, activation='relu')(decoder1)
   outputs = Dense(vocab_size, activation='softmax')(decoder2)

   # merge it [image, seq] [word]
   model = Model(inputs=[inputs1, inputs2], outputs=outputs)
   model.compile(loss='categorical_crossentropy', optimizer='adam')

   # summarize model
   print(model.summary())
   plot_model(model, to_file='model.png', show_shapes=True)
   return model

In [None]:
# train our model
print('Dataset: ', len(train_imgs))
print('Descriptions: train=', len(train_descriptions))
print('Photos: train=', len(train_features))
print('Vocabulary Size:', vocab_size)
print('Description Length: ', max_length)
model = define_model(vocab_size, max_length)
epochs = 10
steps = len(train_descriptions)
# creating a directory named models to save our models
os.mkdir("models")
for i in range(epochs):
   generator = data_generator(train_descriptions, train_features, tokenizer, max_length)
   model.fit(generator, epochs=1, steps_per_epoch= steps, verbose=1)
   model.save("models/model_" + str(i) + ".h5")

Dataset:  6000
Descriptions: train= 6000
Photos: train= 6000
Vocabulary Size: 8764
Description Length:  32
Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_31 (InputLayer)          [(None, 32)]         0           []                               
                                                                                                  
 input_30 (InputLayer)          [(None, 2048)]       0           []                               
                                                                                                  
 embedding_11 (Embedding)       (None, 32, 256)      2243584     ['input_31[0][0]']               
                                                                                                  
 dropout_22 (Dropout)           (None, 2048)         0           ['input_30[0][0]'] 

In [26]:
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
import argparse
ap = argparse.ArgumentParser()
ap.add_argument('-i', '--image', required=True, help="Image Path")
args = vars(ap.parse_args())
img_path = args['image']

def extract_features(filename, model):
  try:
           image = Image.open(filename)
  except:
       print("ERROR: Can't open image! Ensure that image path and extension is correct")
  image = image.resize((299,299))
  image = np.array(image)
  # for 4 channels images, we need to convert them into 3 channels
  if image.shape[2] == 4:
      image = image[..., :3]
  image = np.expand_dims(image, axis=0)
  image = image/127.5
  image = image - 1.0
  feature = model.predict(image)
  return feature

def word_for_id(integer, tokenizer):
  for word, index in tokenizer.word_index.items():
    if index == integer:
      return word
    return None

def generate_desc(model, tokenizer, photo, max_length):
   in_text = 'start'
   for i in range(max_length):
       sequence = tokenizer.texts_to_sequences([in_text])[0]
       sequence = pad_sequences([sequence], maxlen=max_length)
       pred = model.predict([photo,sequence], verbose=0)
       pred = np.argmax(pred)
       word = word_for_id(pred, tokenizer)
       if word is None:
           break
       in_text += ' ' + word
       if word == 'end':
           break
   return in_text

max_length = 32
tokenizer = load(open("tokenizer.p","rb"))
model = load_model('models/model_9.h5')
xception_model = Xception(include_top=False, pooling="avg")
photo = extract_features(img_path, xception_model)
img = Image.open(img_path)
description = generate_desc(model, tokenizer, photo, max_length)
print("\n\n")
print(description)
plt.imshow(img)

usage: ipykernel_launcher.py [-h] -i IMAGE
ipykernel_launcher.py: error: the following arguments are required: -i/--image


SystemExit: ignored

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
