In [1]:
!wget https://github.com/jbrownlee/Datasets/releases/download/Flickr8k/Flickr8k_Dataset.zip /content
!wget https://github.com/jbrownlee/Datasets/releases/download/Flickr8k/Flickr8k_text.zip /content

--2020-07-16 11:49:28--  https://github.com/jbrownlee/Datasets/releases/download/Flickr8k/Flickr8k_Dataset.zip
Resolving github.com (github.com)... 140.82.112.4
Connecting to github.com (github.com)|140.82.112.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://github-production-release-asset-2e65be.s3.amazonaws.com/124585957/47f52b80-3501-11e9-8f49-4515a2a3339b?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20200716%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20200716T114928Z&X-Amz-Expires=300&X-Amz-Signature=06dfbd4a4891e4ef6a7e575c90ec6cb5d4b1bfc3a3a06b9f5fc52c54f1726de8&X-Amz-SignedHeaders=host&actor_id=0&repo_id=124585957&response-content-disposition=attachment%3B%20filename%3DFlickr8k_Dataset.zip&response-content-type=application%2Foctet-stream [following]
--2020-07-16 11:49:28--  https://github-production-release-asset-2e65be.s3.amazonaws.com/124585957/47f52b80-3501-11e9-8f49-4515a2a3339b?X-Amz-Algorithm=AWS4-HMAC-SHA

In [2]:
from os import listdir
from pickle import dump
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.models import Model
import numpy as np

In [3]:
#Unzip files
import zipfile
with zipfile.ZipFile('/content/Flickr8k_Dataset.zip', 'r') as zip_ref:
    zip_ref.extractall('/content')

with zipfile.ZipFile('/content/Flickr8k_text.zip', 'r') as zip_ref:
    zip_ref.extractall('/content/Flickr8k_text')

In [4]:
def extract_features(directory):
  #Load the VGG16 Model. Remove the last classification layer
  model = VGG16(include_top=False)
  #Take in input format as the VGG16 input format and the output to be the output of the last VGG16 layer
  model = Model(inputs=model.inputs, outputs=model.layers[-1].output)
  features = {}
  #Names of all the .jpg files in the directory
  for name in listdir(directory):
    #Get the full path of the .jpg image
    filename = directory + '/' + name
    #Load the image, resize to the VGG input shape
    image = load_img(filename,target_size=(224,224))
    image = img_to_array(image)
    #Transform from 3D to 4D array
    image = np.expand_dims(image, axis=0)
    image = preprocess_input(image)
    feature = model.predict(image,verbose=0)
    image_id = name.split('.')[0]
    features[image_id] = feature
    print(f'{name} file initialized...')
  return features

In [None]:
directory = '/content/Flicker8k_Dataset'
features = extract_features(directory)
print(f'Extracted {len(features)} features')
dump(features,open('features.pki','wb'))

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
3677329561_fa3e1fdcf9.jpg file initialized...
2077079696_03380d218b.jpg file initialized...
3201666946_04fe837aff.jpg file initialized...
465994762_1760e83c5d.jpg file initialized...
3659686168_49c3abcee1.jpg file initialized...
145721496_687af9bb18.jpg file initialized...
3582914905_f58db879ae.jpg file initialized...
228949397_9e63bfa775.jpg file initialized...
349889354_4b2889a9bd.jpg file initialized...
534313000_4ad39c7ee0.jpg file initialized...
3666169738_a8c74cf745.jpg file initialized...
1525153022_06c48dbe52.jpg file initialized...
2952320230_26601173be.jpg file initialized...
2855417531_521bf47b50.jpg file initialized...
2367317953_503317493e.jpg file initialized...
473220329_819a913bbb.jpg file initialized...
3481859121_3d3e566ec0.jpg file initialized...
3270691950_88583c3524.jpg file initialized...
3439414478_8038ba9409.jpg file initialized...
3425127583_611200619a.jpg file initialized...
2938120171_970564e3d8

In [5]:
#Extract the text from files

def load_doc(filename):
  file = open(filename,'r')
  text = file.read()
  file.close()
  return text

filename = '/content/Flickr8k_text/Flickr8k.token.txt'

doc = load_doc(filename)

In [6]:
print(doc[:200])

1000268201_693b08cb0e.jpg#0	A child in a pink dress is climbing up a set of stairs in an entry way .
1000268201_693b08cb0e.jpg#1	A girl going into a wooden building .
1000268201_693b08cb0e.jpg#2	A lit


In [7]:
#This will read the text from the text file and return a dictionary with the image_id and it's corresponding descriptions (About 5 descriptions for one image)

def load_descriptions(doc):
  mapping = {}
  #Gets the line by line for the whole .txt file
  for line in doc.split('\n'):
    #retuns a list where the elements are the individual words in the line
    tokens = line.split()
    if len(line) < 2:
      continue
    image_id, image_desc = tokens[0], tokens[1:]
    #Just get the image_id without the ,jpg extention behind
    image_id = image_id.split('.')[0]
    #Combine all the individual elements in the list to form a single sentence
    image_desc = ' '.join(image_desc)
    #There will be some image_ids that are the same (Becuase there are multiple descriptions for the same image, hence you want to append them to the list of that image_id (Which will be same for the descriptions describing the same image))
    if image_id not in mapping:
      mapping[image_id] = []
    mapping[image_id].append(image_desc)
  return mapping

In [8]:
descriptions = load_descriptions(doc)
print(f'Loaded {len(descriptions)} descriptions!')
print(descriptions)

Output hidden; open in https://colab.research.google.com to view.

In [None]:
import string

#For every single description, remove all numbers, punctuation and words with length = 1. Convert all to lower case

def clean_descriptions(descriptions):
  #string.punctuation is a list of all the possible punctuations
  #Only need to specify the 3rd parameter, which is basically the list of characters which must be replaced (You want to remove all punctuations)
  #You apply this 'table' transformation using the translate method
  table = str.maketrans('','',string.punctuation)
  #Get the individual key:value pairs in the dictionary
  for id, desc_list in descriptions.items():
    for i in range(len(desc_list)):
      #Get the individual description in the list of descriptions
      desc = desc_list[i]
      #Get the individual words in the description
      desc = desc.split()
      #Convert all words to lower case
      desc = [word.lower() for word in desc]
      #Remove all punctuation in the token
      desc = [w.translate(table) for w in desc]
      #Only return the words which are not a single character
      desc = [word for word in desc if len(word)> 1]
      #Only return the words which are not a single character
      desc = [word for word in desc if word.isalpha()]
      #Combine all the elements in the list to form a sentence
      desc_list[i] = ' '.join(desc)

clean_descriptions(descriptions)

In [None]:
# return all the unique words in all the descriptions

def to_vocab(descriptions):
  #A set cannot have duplicate values. Hence, all elements in the set contains unique values
  all_desc = set()
  #access the ids in the descriptions dictionary
  for key in descriptions.keys():
    #for d in descriptions[key] will access all the individual description
    #d.split() will extract all the individual words in the description
    #Since sets will contain only unique elements, all_desc() set will contain all the unique words in all the description

    [all_desc.update(d.split()) for d in descriptions[key]]
  return all_desc

vocab = to_vocab(descriptions)
print(f'Vocab size is {len(vocab)}')

Vocab size is 8763


In [None]:
print(vocab)



In [None]:
def save_descriptions(descriptions, filename):
  lines = []
  for key, desc_list in descriptions.items():
    for desc in desc_list:
      lines.append(key +' ' + desc)
  data = '\n'.join(lines)
  file = open(filename, 'w')
  file.write(data)
  file.close()

save_descriptions(descriptions,'/content/descriptions.txt')

In [None]:
def load_set(filename):
  doc = load_doc(filename)
  dataset = []
  for line in doc.split('\n'):
    #If it's an empty line, skip
    if len(line) < 1:
      continue
    identifier = line.split('.')[0]
    dataset.append(identifier)
  return set(dataset)

In [None]:
# Return a dictionary that maps the image_id to it's descriptions

def load_clean_descriptions(filename,dataset):
  #load the document
  doc = load_doc(filename)
  descriptions = {}
  for line in doc.split('\n'):
    #Get the individual words in the sentence
    tokens = line.split()
    #Separate the image id and its description
    image_id , image_desc = tokens[0], tokens[1:]
    if image_id in dataset:
      if image_id not in descriptions:
        descriptions[image_id] = []
      desc = 'startseq' + ' '.join(image_desc) + ' endseq'
      descriptions[image_id].append(desc)
  return descriptions

In [None]:
def load_photo_features(filename, dataset):
  all_features = load(open(filename, 'rb'))
  features = {k:all_features[k] for k in dataset}
  return features

In [None]:
from pickle import load

filename = '/content/Flickr8k_text/Flickr_8k.trainImages.txt'
#This contains all the training image_ids (Without the descriptions)
train = load_set(filename)
print(f'Loaded {len(train)} features')
#Descriptions.txt is a file containing all the pic_ids and their corresponding descriptions
train_descriptions = load_clean_descriptions('descriptions.txt',train)
print(f'Loaded {len(train_descriptions)} training descriptions')
#features.pki is the features (Of all the images) extracted by the VGG16 model 
train_features = load_photo_features('features.pki',train)
print(f'Loaded {len(train_features)} training features')

Loaded 6000 features
Loaded 6000 training descriptions
Loaded 6000 training features


In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.utils import plot_model
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import add
from tensorflow.keras.callbacks import ModelCheckpoint

In [None]:
def to_lines(descriptions):
  all_desc = []
  for key in descriptions.keys():
    [all_desc.append(d) for d in descriptions[key]]
  return all_desc

In [None]:
def create_tokenizer(descriptions):
  lines = to_lines(descriptions)
  tokenizer = Tokenizer()
  tokenizer.fit_on_texts(lines)
  return tokenizer

In [None]:
#Create the arrays of x1, x2, and y given the descriptions and photos

def create_sequences(tokenizer, max_len, descriptions, photos, vocab_size):
  x1, x2, y = [], [], []
  for key, desc_list in descriptions.items():
    for desc in desc_list:
      # Encode the descriptions
      seq = tokenizer.texts_to_sequences([desc])[0]
      # Loop through the sequence (the line of description) to generate the x2 and y values (where x2 is the current sentence and y is the next word of that sentence which is to be predicted)
      for i in range(1,len(seq)):
        in_seq, out_seq = seq[:i], seq[i]
        in_seq = pad_sequences([in_seq],maxlen=max_len)[0]
        out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
        x1.append(photos[key][0])
        x2.append(in_seq)
        y.append(out_seq)
  return np.array(x1), np.array(x2), np.array(y)

In [None]:
def max_length(descriptions):
  #Get the lines of descriptions
  lines = to_lines(descriptions)
  #Loop through all the lines to find the max length among all the lines
  return max(len(d.split()) for d in lines)

In [None]:
def define_model(vocab_size,max_length):

  #Create the feature extractor model (This is the model which deals with the images)
  #Define the input to the Photo Feature Extractor (This is the extracted features using the VGG16 model, which has shape 4096)
  #This model will use a 4096 element representation of an image(Taken from the model) and turn it to a 256 element representation
  inputs1 = Input(shape=(4096,))
  fe1 = Dropout(0.5)(inputs1)
  fe2 = Dense(256, activations='relu')(fe1)

  #Create the sequence processor model (This is the model which deals with the word descriptions)
  inputs2 = Input(shape=(max_length,))
  se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
  se2 = Dropout(0.5)(se1)
  se3 = LSTM(256)(se2)

  #Create the decoder model
  decoder1 = add([fe2, se3])
  decoder2 = Dense(256, activation='relu')(decoder1)
  #Since there are vocab_size possibilities (there are vocab_size words that can be predicted)
  outputs = Dense(vocab_size,activation='softmax')(decoder2)

  model = Model(inputs=[inputs1, inputs2], outputs=outputs)
  model.compile(loss='categorical_crossentropy',optimizer='adam')

  print(model.summary())
  plot_model(model, to_file='model.png', show_shapes=True)
  return model

In [None]:
#Train dataset

#Load training dataset

filename = '/content/Flickr8k_text/Flickr_8k.trainImages.txt'
#This contains all the training image_ids (Without the descriptions)
train = load_set(filename)
print(f'Loaded {len(train)} features')
#Descriptions.txt is a file containing all the pic_ids and their corresponding descriptions
train_descriptions = load_clean_descriptions('descriptions.txt',train)
print(f'Loaded {len(train_descriptions)} training descriptions')
#features.pki is the features (Of all the images) extracted by the VGG16 model 
train_features = load_photo_features('features.pki',train)
print(f'Loaded {len(train_features)} training features')


#Prepare tokenizer
tokenizer = create_tokenizer(train_descriptions)
vocab_size = len(tokenizer.word_index) + 1
print(f'Vocabulary size: {vocab_size}')
max_len = max_length(train_descriptions)
print(f'Max description length is {max_len}')
x1_train , x2_train, y_train = create_sequences(tokenizer, max_len, train_descriptions,train_features, vocab_size)


#Load dev Dataset

filename = '/content/Flickr8k_text/Flickr_8k.devImages.txt'
test = load_set(filename)
test_descriptions = load_clean_descriptions('descriptions.txt', test)
test_features = load_photo_features('features.pkl', test)
x1_test, x2_test, y_test = create_sequences(tokenizer, max_len, test_descriptions, test_features, vocab_size)


#Define the model
model = define_model(vocab_size, max_len)

#Define checkpoint callback
filepath = 'model-ep{epoch:03d}-loss{loss:.3f}-val_loss{val_loss:.3f}.h5'
#You are looking at the val_loss and since you want to save_best_only, it will save the model that 'minimizes' this loss
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

Loaded 6000 features
Loaded 6000 training descriptions
Loaded 6000 training features
Vocabulary size: 8306
Max description length is 33


In [None]:
#Fit model
model.fit([x1_train, x2_train], y_train, epochs=20, verbose=1, callbacks=[checkpoint],validation_data=([x1_test, x2_test], y_test))