<a href="https://colab.research.google.com/github/CocoTheAussieCat/dl_at3/blob/master/Colab_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Mount Google Drive Data Source

In [0]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

In [0]:
# Direct the workbook into the project folder

%cd drive/Shared\ drives/DL_AT3/Cloud_Folder

/content/drive/Shared drives/DL_AT3/Cloud_Folder


In [0]:
import os 
dataset_dir = os.getcwd() 

In [0]:
#Check the directory
dataset_dir

'/content/drive/Shared drives/DL_AT3/Cloud_Folder'

In [0]:
#
!pwd

/content/drive/Shared drives/DL_AT3/Cloud_Folder


In [0]:
#Check the files inside the directory
!ls

Colab_Model.ipynb	    Flickr8k_Dataset.zip      Flickr_8k.trainImages.txt
CrowdFlowerAnnotations.txt  Flickr_8k.devImages.txt   __MACOSX
descriptions.txt	    Flickr8k.lemma.token.txt  model.png
ExpertAnnotations.txt	    Flickr_8k.testImages.txt  readme.txt
features.pkl		    Flickr8k_text.zip	      saved_model
Flicker8k_Dataset	    Flickr8k.token.txt


## Unzip File from Google Colab

In [0]:
#Unzip the Image Dataset
#!unzip Flickr8k_Dataset.zip

In [0]:
#Unzip the Text Dataset
#!unzip Flickr8k_text.zip

# Model

# Setup

Import libraries, set working directory and relative paths

In [0]:
import pandas as pd
import numpy as np
import array as arr
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import pickle
from pickle import dump
import string
import os
import time

import tensorflow as tf
from tensorflow.keras.preprocessing.image import img_to_array
from tensorflow.keras.preprocessing.image import load_img
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.applications.vgg16 import preprocess_input

from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.utils import plot_model
from tensorflow.keras.layers import Input, Dense, Flatten, LSTM, Embedding, Dropout, Add

from tensorflow.keras.callbacks import ModelCheckpoint

In [0]:
# Set seeds for numpy and tensorflow
tf.random.set_seed(12)
np.random.seed(12)

In [0]:
# Check the folder's content after unziping 
!ls

Colab_Model.ipynb	    Flickr8k_Dataset.zip      Flickr_8k.trainImages.txt
CrowdFlowerAnnotations.txt  Flickr_8k.devImages.txt   __MACOSX
descriptions.txt	    Flickr8k.lemma.token.txt  model.png
ExpertAnnotations.txt	    Flickr_8k.testImages.txt  readme.txt
features.pkl		    Flickr8k_text.zip	      saved_model
Flicker8k_Dataset	    Flickr8k.token.txt


In [0]:
# Set the dataset directory and relative directories
image_dir = dataset_dir + '/Flicker8k_Dataset'
caption_dir = dataset_dir + '/Flickr8k.token.txt'
train_dir = dataset_dir + '/Flickr_8k.trainImages.txt'
test_dir = dataset_dir + '/Flickr_8k.testImages.txt'
val_dir = dataset_dir + '/Flickr_8k.devImages.txt'


# Prepare image data

Extract image features

Code source: https://machinelearningmastery.com/develop-a-deep-learning-caption-generation-model-in-python/

Used VGG16 pre-trained model to extract image features by:

Loading VGG16 pre-trained model.
Removing top layer (because this layer is used for classification, which is not what is required)
Extract features from each image by using predict function of VGG16 model.
Create image_id by extracting the characters before .jpg in the file name.
Store these features as vector of length 4096 in dictionary with image_id as key.

# extract features from each photo in the directory
# Use VGG16 model, without top layer, add flatten and dense layer to get output of 4096
# which is the required shape for LSTM model
def extract_features(directory):
	# load the model
	model = VGG16(input_shape=(224, 224, 3), weights='imagenet', include_top=False)

	model_new = tf.keras.Sequential([
  	model,
  	Flatten(),
	Dense(4096)])
	# remove top layer from model
	# model.layers.pop()
	# model = Model(inputs=model.inputs, outputs=model.layers[-1].output)
	# print summary
	print(model_new.summary())
	# extract features from each photo
	features = dict() # create empty dictionary to store features in
	for name in os.listdir(directory):
		# load an image from file
		filename = directory + '/' + name
		image = load_img(filename, target_size=(224, 224))
		# convert the image pixels to a numpy array
		image = img_to_array(image)
		# reshape data for model
		image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
		# prepare image for VGG model
		image = preprocess_input(image)
		# get features
		feature = model_new.predict(image, verbose=0)
		# get image id
		image_id = name.split('.')[0]
		# store feature in dictionary using image_id as key
		features[image_id] = feature
		print('>%s' % name)
	return features

In [0]:
### ONLY RUN IF YOU DON'T HAVE features.pkl IN YOUR ENVIRONMENT
### TAKES >  1HOUR TO RUN
# Extract features from all images
#features = extract_features(image_dir)
#print('Extracted Features: %d' % len(features))

In [0]:
# Save feature as pickle file
#dump(features, open('features.pkl', 'wb'))

# Prepare text data

Code source: https://machinelearningmastery.com/develop-a-deep-learning-caption-generation-model-in-python/

Get cleaned caption for each image by:

Loading captions from text file.
Creating dictionary of captions using image_id as key.
Clean all captions by removing digits, single letter words (eg: a), punctuation and converting to lower case

In [0]:
# Load and read image description file
def load_doc(filename):
	"""
	Reads all captions from txt file as single string
	Inputs		- filename = filename of .txt with image captions
	Outputs		- text = string
	"""
	# open the file as read only
	file = open(filename, 'r')
	# read all text
	text = file.read()
	# close the file
	file.close()
	return text

In [0]:
# Extract descriptions for images
def load_descriptions(doc):
	"""
    Inputs      - doc = string, output from load_doc()
    Outputs     - mapping = dictionary-list of image_id and captions 
    """
	caption_dict = {}
	# process lines
	for line in doc.split('\n'):
		# split line by white space
		tokens = line.split()
		if len(line) < 2:
			continue
		# take first token as the image id, the rest as the description
		image_id, image_desc = tokens[0], tokens[1:]
		# remove filename from image id
		image_id = image_id.split('.')[0]
		# convert description tokens back to string
		image_desc = ' '.join(image_desc)
		# create the list if needed
		if image_id not in caption_dict:
			caption_dict[image_id] = list()
			# store description
			caption_dict [image_id].append(image_desc)
	return caption_dict 

In [0]:
def clean_descriptions(descriptions):
	# prepare translation table for removing punctuation
	table = str.maketrans('', '', string.punctuation)
	for key, desc_list in descriptions.items():
		for i in range(len(desc_list)):
			desc = desc_list[i]
			# tokenize
			desc = desc.split()
			# convert to lower case
			desc = [word.lower() for word in desc]
			# remove punctuation from each token
			desc = [w.translate(table) for w in desc]
			# remove hanging 's' and 'a'
			desc = [word for word in desc if len(word)>1]
			# remove tokens with numbers in them
			desc = [word for word in desc if word.isalpha()]
			# store as string
			desc_list[i] =  ' '.join(desc)

In [0]:
# Convert descriptions into vocabulary of words
def to_vocabulary(descriptions):
	# build list of all description strings
	all_desc = set()
	for key in descriptions.keys():
		[all_desc.update(d.split()) for d in descriptions[key]]
	return all_desc

In [0]:
# Save descriptions to file, one image_id and description per line
def save_descriptions(descriptions, filename):
	lines = list()
	for key, desc_list in descriptions.items():
		for desc in desc_list:
			lines.append(key + ' ' + desc)
	data = '\n'.join(lines)
	file = open(filename, 'w')
	file.write(data)
	file.close()

In [0]:
# Load descriptions from tokenised text file
doc = load_doc(caption_dir)

# Create dictionary of image_id and descriptions
descriptions = load_descriptions(doc)
print('Loaded: %d ' % len(descriptions))

# Clean descriptions by stripping digits, punctuation, single letter words and converting to lowercase
clean_descriptions(descriptions)

# Create vocab from descriptions and get vocab length
vocabulary = to_vocabulary(descriptions)
print('Vocabulary Size: %d' % len(vocabulary))

# Save descriptions to file, one image_id and description per line
save_descriptions(descriptions, 'descriptions.txt')

Loaded: 8092 
Vocabulary Size: 4473


# Load pre-processed training data for modelling

In [0]:
def load_doc(filename):
	# open file as read only
	file = open(filename, 'r')
	# read all text
	text = file.read()
	# close file
	file.close()
	return text

In [0]:
# Load pre-defined list of photo identifiers
def load_set(filename):
	doc = load_doc(filename)
	dataset = list()
	# process line by line
	for line in doc.split('\n'):
		# skip empty lines
		if len(line) < 1:
			continue
		# get the image identifier
		identifier = line.split('.')[0]
		dataset.append(identifier)
	return set(dataset)

In [0]:
# Load clean descriptions into memory
def load_clean_descriptions(filename, dataset):
	# load document
	doc = load_doc(filename)
	descriptions = dict()
	for line in doc.split('\n'):
		# split line by white space
		tokens = line.split()
		# split id from description
		image_id, image_desc = tokens[0], tokens[1:]
		# skip images not in the set
		if image_id in dataset:
			# create list
			if image_id not in descriptions:
				descriptions[image_id] = list()
			# wrap description in tokens
			desc = 'startseq ' + ' '.join(image_desc) + ' endseq'
			# store
			descriptions[image_id].append(desc)
	return descriptions

In [0]:
# Load photo features
def load_photo_features(filename, dataset):
	# load all features
	all_features = pickle.load(open(filename, 'rb'))
	# filter features
	features = {k: all_features[k] for k in dataset}
	return features

## Tokenise descriptions
Map unique words to integers using tf.keras tokenizer

In [0]:
# Convert dictionary of clean descriptions to list of descriptions
def to_lines(descriptions):
	all_desc = list()
	for key in descriptions.keys():
		[all_desc.append(d) for d in descriptions[key]]
	return all_desc

In [0]:
# Fit tokenizer given caption descriptions
def create_tokenizer(descriptions):
	lines = to_lines(descriptions)
	tokenizer = tf.keras.preprocessing.text.Tokenizer()
	tokenizer.fit_on_texts(lines)
	return tokenizer

In [0]:
# Create sequences of images, input sequences and output words for an image
def create_sequences(tokenizer, max_length, descriptions, photos, vocab_size):
	X1, X2, y = list(), list(), list()
	# walk through each image identifier
	for key, desc_list in descriptions.items():
		# walk through each description for the image
		for desc in desc_list:
			# encode the sequence
			seq = tokenizer.texts_to_sequences([desc])[0]
			# split one sequence into multiple X,y pairs
			for i in range(1, len(seq)):
				# split into input and output pair
				in_seq, out_seq = seq[:i], seq[i]
				# pad input sequence
				in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
				# encode output sequence
				out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
				# store
				X1.append(photos[key][0])
				X2.append(in_seq)
				y.append(out_seq)
	return np.array(X1), np.array(X2), np.array(y)

In [0]:
# Helper function to calculate length of description with most words
def max_length(descriptions):
	lines = to_lines(descriptions)
	return max(len(d.split()) for d in lines)

## Load train and validation data for modelling
Images loaded as numpy arrays, descriptions tokenised

In [0]:
# Load training set
train = load_set(train_dir)
print('Dataset: %d' % len(train))

# Load training set descriptions
train_descriptions = load_clean_descriptions('descriptions.txt', train)
print('Descriptions: train = %d' % len(train_descriptions))

# Extract training set image features from features.pkl
train_features = load_photo_features('features.pkl', train)
print('Photos: train = %d' % len(train_features))

# Prepare sequences of descriptions for train, test and validation sets
tokenizer = create_tokenizer(train_descriptions)
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)

# Determine max sequence length
max_length = max_length(train_descriptions)
print('Description Length: %d' % max_length)

Dataset: 6000
Descriptions: train = 6000
Photos: train = 6000
Vocabulary Size: 3848
Description Length: 30


In [0]:
# Create data for modelling
X1train, X2train, ytrain = create_sequences(tokenizer, max_length, train_descriptions, train_features, vocab_size)

In [0]:
# Load validation set (using devImages)
val = load_set(val_dir)
print('Dataset: %d' % len(val))

# Load training set descriptions
val_descriptions = load_clean_descriptions('descriptions.txt', val)
print('Descriptions: val = %d' % len(val_descriptions))

# Extract training set image features from features.pkl
val_features = load_photo_features('features.pkl', val)
print('Photos: val = %d' % len(val_features))

Dataset: 1000
Descriptions: val = 1000
Photos: val = 1000


In [0]:
# Create data for modelling
X1val, X2val, yval = create_sequences(tokenizer, max_length, val_descriptions, val_features, vocab_size)

# Define model
Based on merge-model described by Tanti et al. in *Where to put the Image in an Image Caption Generator*

source: <https://arxiv.org/abs/1703.09137>

code source: https://machinelearningmastery.com/develop-a-deep-learning-caption-generation-model-in-python/

In [0]:
def define_model(vocab_size, max_length):
	# feature extractor model
	inputs1 = Input(shape=(4096,))
	fe1 = Dropout(0.5)(inputs1)
	fe2 = Dense(256, activation='relu')(fe1)
	# sequence model
	inputs2 = Input(shape=(max_length,))
	se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
	se2 = Dropout(0.5)(se1)
	se3 = LSTM(256)(se2)
	# decoder model
	decoder1 = Add()([fe2, se3])
	decoder2 = Dense(256, activation='relu')(decoder1)
	outputs = Dense(vocab_size, activation='softmax')(decoder2)
	# tie it together [image, seq] [word]
	model = Model(inputs=[inputs1, inputs2], outputs=outputs)
	model.compile(loss='categorical_crossentropy', optimizer='adam')
	# summary
	print(model.summary())
	plot_model(model, to_file='model.png', show_shapes=True)
	return model

# Train model
Use checkpoint callbacks to save training informatoin

In [0]:
# Define where to save checkpoints
check_dir = dataset_dir + '/checkpoints/model-ep{epoch:03d}-loss{loss:.3f}-val_loss{val_loss:.3f}.h5'

# Monitor validation loss, saving only the best
checkpoint = ModelCheckpoint(check_dir, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

In [34]:
# Create base model
base_model = define_model(vocab_size, max_length)

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 30)]         0                                            
__________________________________________________________________________________________________
input_1 (InputLayer)            [(None, 4096)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 30, 256)      985088      input_2[0][0]                    
__________________________________________________________________________________________________
dropout (Dropout)               (None, 4096)         0           input_1[0][0]                    
______________________________________________________________________________________________

In [0]:
### DON'T RUN THIS LOCALLY ON YOUR LAPTOP! PROBABLY WON'T HAVE ENOUGH RAM
# Fit model
tic = time.perf_counter()
base_history = base_model.fit([X1train, X2train], ytrain, epochs=20, verbose=2, validation_data=([X1val, X2val], yval))
toc = time.perf_counter()
print(f”Model Runned in {toc - tic:0.4f} seconds”)

Epoch 1/20
1916/1916 - 219s - loss: 5.3237 - val_loss: 5.0856
Epoch 2/20
1916/1916 - 218s - loss: 5.1318 - val_loss: 4.9999
Epoch 3/20
1916/1916 - 220s - loss: 5.0254 - val_loss: 4.9403
Epoch 4/20
1916/1916 - 222s - loss: 4.9443 - val_loss: 4.9121
Epoch 5/20
1916/1916 - 221s - loss: 4.8810 - val_loss: 4.8994
Epoch 6/20
1916/1916 - 221s - loss: 4.8105 - val_loss: 4.8616
Epoch 7/20
1916/1916 - 220s - loss: 4.7540 - val_loss: 4.8653
Epoch 8/20
1916/1916 - 219s - loss: 4.7138 - val_loss: 4.8367
Epoch 9/20
1916/1916 - 220s - loss: 4.6595 - val_loss: 4.7724
Epoch 10/20
1916/1916 - 221s - loss: 4.5222 - val_loss: 4.6942
Epoch 11/20
1916/1916 - 218s - loss: 4.4269 - val_loss: 4.6249
Epoch 12/20
1916/1916 - 217s - loss: 4.2606 - val_loss: 4.5801
Epoch 13/20
1916/1916 - 215s - loss: 4.2520 - val_loss: 4.5730
Epoch 14/20
1916/1916 - 214s - loss: 4.2493 - val_loss: 4.5790
Epoch 15/20
1916/1916 - 216s - loss: 4.1475 - val_loss: 4.6136
Epoch 16/20
1916/1916 - 220s - loss: 3.9772 - val_loss: 4.6364
E

In [0]:
check_dir

'/content/drive/Shared drives/DL_AT3/Cloud_Folder/checkpoints/model-ep{epoch:03d}-loss{loss:.3f}-val_loss{val_loss:.3f}.h5'

In [0]:
#Check the model has been saved
# my_model directory
!ls checkpoints

ls: cannot access 'checkpoints': No such file or directory


In [0]:
!ls

Colab_Model.ipynb	    Flickr8k_Dataset.zip      Flickr_8k.trainImages.txt
CrowdFlowerAnnotations.txt  Flickr_8k.devImages.txt   __MACOSX
descriptions.txt	    Flickr8k.lemma.token.txt  model.png
ExpertAnnotations.txt	    Flickr_8k.testImages.txt  readme.txt
features.pkl		    Flickr8k_text.zip
Flicker8k_Dataset	    Flickr8k.token.txt


In [0]:
# SAVE MODEL IN CASE CHECKPOINT DIDNT WORK
!mkdir -p saved_model
base_model.save('saved_model/base_model') 

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
INFO:tensorflow:Assets written to: saved_model/base_model/assets


In [30]:
#Check the model has been saved
# my_model directory
!ls saved_model

# Contains an assets folder, saved_model.pb, and variables folder.
!ls saved_model/base_model

base_model
assets	saved_model.pb	variables


In [36]:
!git init

Initialized empty Git repository in /content/drive/Shared drives/DL_AT3/Cloud_Folder/.git/


In [0]:
!git config — global user.email “You@Your.com”