In [4]:
# Importation des bibliothèques
import pandas as pd
import nltk
from keras.applications.vgg16 import VGG16
from keras.models import Model
from keras.utils import load_img, img_to_array, to_categorical, plot_model
from keras.models import Model
from keras.layers import Input
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding
from keras.layers import Dropout
from keras.layers import add
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.applications.vgg16 import preprocess_input
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from os import listdir
from joblib import dump, load
import string
from numpy import array

In [2]:
# Téléchargement des données de NLTK if required
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

# Préparer les données photo

In [5]:
# Extraction des features des photos
def extract_features(directory):
	# Chargement du modèle VGG (Visual Geometry Group) depuis Keras
	model = VGG16()
	# re-structure the model
	model = Model(inputs=model.inputs, outputs=model.layers[-2].output)
	# summarize
	print(model.summary())
	# extract features from each photo
	features = dict()
	for name in listdir(directory):
		# load an image from file
		filename = directory + '/' + name
		img = load_img(filename, target_size=(224, 224))
		# convert the image pixels to a numpy array
		img = img_to_array(img)
		# reshape data for the model
		img = img.reshape((1, img.shape[0], img.shape[1], img.shape[2]))
		# prepare the image for the VGG model
		img = preprocess_input(img)
		# get features
		feature = model.predict(img, verbose=0)
		# get image id
		img_id = name.split('.')[0]
		# store feature
		features[img_id] = feature
		print('>%s' % name)
	return features

In [3]:
# extract features from all images
directory = '../Flicker8k_Dataset'
features = extract_features(directory)
print('Extracted Features: %d' % len(features))
# save to file
dump(features, 'features.joblib')

# Last execution time: 93m 22.5s

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 224, 224, 3)]     0         
                                                                 
 block1_conv1 (Conv2D)       (None, 224, 224, 64)      1792      
                                                                 
 block1_conv2 (Conv2D)       (None, 224, 224, 64)      36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, 112, 112, 64)      0         
                                                                 
 block2_conv1 (Conv2D)       (None, 112, 112, 128)     73856     
                                                                 
 block2_conv2 (Conv2D)       (None, 112, 112, 128)     147584    
                                                                 
 block2_pool (MaxPooling2D)  (None, 56, 56, 128)       0     

KeyboardInterrupt: 

# Préparer les données texte

In [6]:
# load doc into memory
def load_doc(filename):
    # open the file as read only
    file = open(filename, 'r')
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text


filename = '../Flickr8k_text/Flickr8k.token.txt'
# load descriptions
doc = load_doc(filename)

In [7]:
# extract descriptions for images
def load_descriptions(doc):
	mapping = dict()
	# process lines
	for line in doc.split('\n'):
		# split line by white space
		tokens = line.split()
		if len(line) < 2:
			continue
		# take the first token as the image id, the rest as the description
		image_id, image_desc = tokens[0], tokens[1:]
		# remove filename from image id
		image_id = image_id.split('.')[0]
		# convert description tokens back to string
		image_desc = ' '.join(image_desc)
		# create the list if needed
		if image_id not in mapping:
			mapping[image_id] = list()
		# store description
		mapping[image_id].append(image_desc)
	return mapping

# parse descriptions
descriptions = load_descriptions(doc)
print('Loaded: %d ' % len(descriptions))

Loaded: 8092 


In [8]:
def clean_descriptions(descriptions):
	# prepare translation table for removing punctuation
	table = str.maketrans('', '', string.punctuation)
	for key, desc_list in descriptions.items():
		for i in range(len(desc_list)):
			desc = desc_list[i]
			# tokenize
			desc = desc.split()
			# convert to lower case
			desc = [word.lower() for word in desc]
			# remove punctuation from each token
			desc = [w.translate(table) for w in desc]
			# remove hanging 's' and 'a'
			desc = [word for word in desc if len(word)>1]
			# remove tokens with numbers in them
			desc = [word for word in desc if word.isalpha()]
			# store as string
			desc_list[i] =  ' '.join(desc)

# clean descriptions
clean_descriptions(descriptions)

In [9]:
# convert the loaded descriptions into a vocabulary of words
def to_vocabulary(descriptions):
    # build a list of all description strings
    all_desc = set()
    for key in descriptions.keys():
        [all_desc.update(d.split()) for d in descriptions[key]]
    return all_desc
 
# summarize vocabulary
vocabulary = to_vocabulary(descriptions)
print('Vocabulary Size: %d' % len(vocabulary))

Vocabulary Size: 8763


In [10]:
# save descriptions to file, one per line
def save_descriptions(descriptions, filename):
	lines = list()
	for key, desc_list in descriptions.items():
		for desc in desc_list:
			lines.append(key + ' ' + desc)
	data = '\n'.join(lines)
	file = open(filename, 'w')
	file.write(data)
	file.close()

# save descriptions
save_descriptions(descriptions, 'descriptions.txt')

# Développer un modèle d'apprentissage en profondeur

### Loading Data

In [11]:
# load doc into memory
def load_doc(filename):
    # open the file as read only
    file = open(filename, 'r')
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text

In [12]:
# load a pre-defined list of photo identifiers
def load_set(filename):
    doc = load_doc(filename)
    dataset = list()
    # process line by line
    for line in doc.split('\n'):
        # skip empty lines
        if len(line) < 1:
            continue
        # get the image identifier
        identifier = line.split('.')[0]
        dataset.append(identifier)
    return set(dataset)

In [13]:
# load clean descriptions into memory
def load_clean_descriptions(filename, dataset):
    # load document
    doc = load_doc(filename)
    descriptions = dict()
    for line in doc.split('\n'):
        # split line by white space
        tokens = line.split()
        # split id from description
        image_id, image_desc = tokens[0], tokens[1:]
        # skip images not in the set
        if image_id in dataset:
            # create list
            if image_id not in descriptions:
                descriptions[image_id] = list()
            # wrap description in tokens
            desc = 'startseq ' + ' '.join(image_desc) + ' endseq'
            # store
            descriptions[image_id].append(desc)
    return descriptions

In [14]:
# load photo features
def load_photo_features(filename, dataset):
    # load all features
    all_features = load(filename)
    # filter features
    features = {k: all_features[k] for k in dataset}
    return features

In [15]:
# load training dataset (6K)
filename = '../Flickr8k_text/Flickr_8k.trainImages.txt'
train = load_set(filename)
print('Dataset: %d' % len(train))

# descriptions
train_descriptions = load_clean_descriptions('descriptions.txt', train)
print('Descriptions: train=%d' % len(train_descriptions))

# photo features
train_features = load_photo_features('features.joblib', train)
print('Photos: train=%d' % len(train_features))

Dataset: 6000
Descriptions: train=6000
Photos: train=6000


In [16]:
# convert a dictionary of clean descriptions to a list of descriptions
def to_lines(descriptions):
    all_desc = list()
    for key in descriptions.keys():
        [all_desc.append(d) for d in descriptions[key]]
    return all_desc

In [17]:
# fit a tokenizer given caption descriptions
def create_tokenizer(descriptions):
    lines = to_lines(descriptions)
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

In [18]:
# prepare tokenizer
tokenizer = create_tokenizer(train_descriptions)
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)

Vocabulary Size: 7579


In [19]:
# create sequences of images, input sequences and output words for an image
def create_sequences(tokenizer, max_length, descriptions, photos, vocab_size):
	X1, X2, y = list(), list(), list()
	# walk through each image identifier
	for key, desc_list in descriptions.items():
		# walk through each description for the image
		for desc in desc_list:
			# encode the sequence
			seq = tokenizer.texts_to_sequences([desc])[0]
			# split one sequence into multiple X,y pairs
			for i in range(1, len(seq)):
				# split into input and output pair
				in_seq, out_seq = seq[:i], seq[i]
				# pad input sequence
				in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
				# encode output sequence
				out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
				# store
				X1.append(photos[key][0])
				X2.append(in_seq)
				y.append(out_seq)
	return array(X1), array(X2), array(y)

In [20]:
# calculate the length of the description with the most words
def max_length(descriptions):
	lines = to_lines(descriptions)
	return max(len(d.split()) for d in lines)

In [21]:
# train dataset

# load training dataset (6K)
filename = '../Flickr8k_text/Flickr_8k.trainImages.txt'
train = load_set(filename)
print('Dataset: %d' % len(train))

# descriptions
train_descriptions = load_clean_descriptions('descriptions.txt', train)
print('Descriptions: train=%d' % len(train_descriptions))

# photo features
train_features = load_photo_features('features.joblib', train)
print('Photos: train=%d' % len(train_features))

# prepare tokenizer
tokenizer = create_tokenizer(train_descriptions)
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)

# determine the maximum sequence length
max_length = max_length(train_descriptions)
print('Description Length: %d' % max_length)

# prepare sequences
X1train, X2train, ytrain = create_sequences(tokenizer, max_length, train_descriptions, train_features, vocab_size)

# Execution time = 6m 11.8s

Dataset: 6000
Descriptions: train=6000
Photos: train=6000
Vocabulary Size: 7579
Description Length: 34


In [22]:
# load test set
filename = '../Flickr8k_text/Flickr_8k.devImages.txt'
test = load_set(filename)
print('Dataset: %d' % len(test))

# descriptions
test_descriptions = load_clean_descriptions('descriptions.txt', test)
print('Descriptions: test=%d' % len(test_descriptions))

# photo features
test_features = load_photo_features('features.joblib', test)
print('Photos: test=%d' % len(test_features))

# prepare sequences
X1test, X2test, ytest = create_sequences(tokenizer, max_length, test_descriptions, test_features, vocab_size)

# Execution time = 17.6s

Dataset: 1000
Descriptions: test=1000
Photos: test=1000


### Defining the Model

In [23]:
# define the captioning model
def define_model(vocab_size, max_length):
	# feature extractor model
	inputs1 = Input(shape=(4096,))
	fe1 = Dropout(0.5)(inputs1)
	fe2 = Dense(256, activation='relu')(fe1)
	# sequence model
	inputs2 = Input(shape=(max_length,))
	se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
	se2 = Dropout(0.5)(se1)
	se3 = LSTM(256)(se2)
	# decoder model
	decoder1 = add([fe2, se3])
	decoder2 = Dense(256, activation='relu')(decoder1)
	outputs = Dense(vocab_size, activation='softmax')(decoder2)
	# tie it together [image, seq] [word]
	model = Model(inputs=[inputs1, inputs2], outputs=outputs)
	model.compile(loss='categorical_crossentropy', optimizer='adam')
	# summarize model
	print(model.summary())
	plot_model(model, to_file='model.png', show_shapes=True)
	return model

### Fitting the Model

In [24]:
# define the model
model = define_model(vocab_size, max_length)

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_3 (InputLayer)           [(None, 34)]         0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 4096)]       0           []                               
                                                                                                  
 embedding (Embedding)          (None, 34, 256)      1940224     ['input_3[0][0]']                
                                                                                                  
 dropout (Dropout)              (None, 4096)         0           ['input_2[0][0]']                
                                                                                            

In [25]:
# define checkpoint callback
filepath = './models/model-ep{epoch:03d}-loss{loss:.3f}-val_loss{val_loss:.3f}.h5'
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
early_stopping = EarlyStopping(monitor='val_loss', patience=3)

In [26]:
# initial fit model
model.fit([X1train, X2train], ytrain, epochs=20, verbose=2, callbacks=[checkpoint, early_stopping], validation_data=([X1test, X2test], ytest))

Epoch 1/20


In [30]:
# Fit model from checkpoint file
from keras.models import load_model

# Loading last saved checkpoint file (.h5 files)
model = load_model('./model-ep004-loss3.690-val_loss3.911.h5')

# Compiling model
model.compile(loss='categorical_crossentropy', optimizer='adam')

# Loading weights
model.load_weights('./model-ep004-loss3.690-val_loss3.911.h5')

# Restart fit model
model.fit([X1train, X2train], ytrain, epochs=16, verbose=2, callbacks=[checkpoint, early_stopping], validation_data=([X1test, X2test], ytest))

Epoch 1/16

Epoch 1: val_loss improved from inf to 3.93679, saving model to model-ep001-loss3.703-val_loss3.937.h5
9576/9576 - 1634s - loss: 3.7029 - val_loss: 3.9368 - 1634s/epoch - 171ms/step
Epoch 2/16

Epoch 2: val_loss improved from 3.93679 to 3.93383, saving model to model-ep002-loss3.610-val_loss3.934.h5
9576/9576 - 1400s - loss: 3.6096 - val_loss: 3.9338 - 1400s/epoch - 146ms/step
Epoch 3/16

Epoch 3: val_loss improved from 3.93383 to 3.93289, saving model to model-ep003-loss3.546-val_loss3.933.h5
9576/9576 - 2145s - loss: 3.5457 - val_loss: 3.9329 - 2145s/epoch - 224ms/step
Epoch 4/16

Epoch 4: val_loss did not improve from 3.93289
9576/9576 - 1625s - loss: 3.4956 - val_loss: 3.9394 - 1625s/epoch - 170ms/step
Epoch 5/16

Epoch 5: val_loss did not improve from 3.93289
9576/9576 - 2148s - loss: 3.4559 - val_loss: 3.9350 - 2148s/epoch - 224ms/step
Epoch 6/16

Epoch 6: val_loss did not improve from 3.93289
9576/9576 - 1496s - loss: 3.4240 - val_loss: 3.9764 - 1496s/epoch - 156ms/s

KeyboardInterrupt: 

Stop fitting/training model process at epoch 10 due to time requirement (~30 mins per epoch) and the fact that the model didn't improve.

### Evaluating model

In [30]:
from numpy import argmax
from joblib import load
from nltk.translate.bleu_score import corpus_bleu
 
# load doc into memory
def load_doc(filename):
	# open the file as read only
	file = open(filename, 'r')
	# read all text
	text = file.read()
	# close the file
	file.close()
	return text

# load a pre-defined list of photo identifiers
def load_set(filename):
	doc = load_doc(filename)
	dataset = list()
	# process line by line
	for line in doc.split('\n'):
		# skip empty lines
		if len(line) < 1:
			continue
		# get the image identifier
		identifier = line.split('.')[0]
		dataset.append(identifier)
	return set(dataset)

# load clean descriptions into memory
def load_clean_descriptions(filename, dataset):
	# load document
	doc = load_doc(filename)
	descriptions = dict()
	for line in doc.split('\n'):
		# split line by white space
		tokens = line.split()
		# split id from description
		image_id, image_desc = tokens[0], tokens[1:]
		# skip images not in the set
		if image_id in dataset:
			# create list
			if image_id not in descriptions:
				descriptions[image_id] = list()
			# wrap description in tokens
			desc = 'startseq ' + ' '.join(image_desc) + ' endseq'
			# store
			descriptions[image_id].append(desc)
	return descriptions

# load photo features
def load_photo_features(filename, dataset):
	# load all features
	all_features = load(open(filename, 'rb'))
	# filter features
	features = {k: all_features[k] for k in dataset}
	return features

# covert a dictionary of clean descriptions to a list of descriptions
def to_lines(descriptions):
	all_desc = list()
	for key in descriptions.keys():
		[all_desc.append(d) for d in descriptions[key]]
	return all_desc

# fit a tokenizer given caption descriptions
def create_tokenizer(descriptions):
	lines = to_lines(descriptions)
	tokenizer = Tokenizer()
	tokenizer.fit_on_texts(lines)
	return tokenizer

# calculate the length of the description with the most words
def max_length(descriptions):
	lines = to_lines(descriptions)
	return max(len(d.split()) for d in lines)

# map an integer to a word
def word_for_id(integer, tokenizer):
	for word, index in tokenizer.word_index.items():
		if index == integer:
			return word
	return None
 
# generate a description for an image
def generate_desc(model, tokenizer, photo, max_length):
	# seed the generation process
	in_text = 'startseq'
	# iterate over the whole length of the sequence
	for i in range(max_length):
		# integer encode input sequence
		sequence = tokenizer.texts_to_sequences([in_text])[0]
		# pad input
		sequence = pad_sequences([sequence], maxlen=max_length)
		# predict next word
		yhat = model.predict([photo,sequence], verbose=0)
		# convert probability to integer
		yhat = argmax(yhat)
		# map integer to word
		word = word_for_id(yhat, tokenizer)
		# stop if we cannot map the word
		if word is None:
			break
		# append as input for generating the next word
		in_text += ' ' + word
		# stop if we predict the end of the sequence
		if word == 'endseq':
			break
	return in_text
 
# evaluate the skill of the model
def evaluate_model(model, descriptions, photos, tokenizer, max_length):
    actual, predicted = list(), list()
    # step over the whole set
    for key, desc_list in descriptions.items():
        # generate description
        yhat = generate_desc(model, tokenizer, photos[key], max_length)
        # store actual and predicted
        references = [d.split() for d in desc_list]
        actual.append(references)
        predicted.append(yhat.split())
        # calculate BLEU score
        print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
        print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
        print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))
        print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))

In [31]:
# prepare tokenizer on train set
 
# load training dataset (6K)
filename = '../Flickr8k_text/Flickr_8k.trainImages.txt'
train = load_set(filename)
print('Dataset: %d' % len(train))
# descriptions
train_descriptions = load_clean_descriptions('descriptions.txt', train)
print('Descriptions: train=%d' % len(train_descriptions))

Dataset: 6000
Descriptions: train=6000


In [32]:
# prepare tokenizer
tokenizer = create_tokenizer(train_descriptions)
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)

Vocabulary Size: 7579


In [33]:
# determine the maximum sequence length
max_length = max_length(train_descriptions)
print('Description Length: %d' % max_length)

Description Length: 34


In [34]:
# prepare test set
 
# load test set
filename = '../Flickr8k_text/Flickr_8k.testImages.txt'
test = load_set(filename)
print('Dataset: %d' % len(test))

Dataset: 1000


In [35]:
# descriptions
test_descriptions = load_clean_descriptions('descriptions.txt', test)
print('Descriptions: test=%d' % len(test_descriptions))

Descriptions: test=1000


In [36]:
# photo features
test_features = load_photo_features('features.joblib', test)
print('Photos: test=%d' % len(test_features))

Photos: test=1000


In [37]:
# load the model
filename = './model-ep007-loss3.546-val_loss3.933.h5'
model = load_model(filename)

OSError: No file or directory found at ./model-ep007-loss3.546-val_loss3.933.h5

In [40]:
# evaluate model
evaluate_model(model, test_descriptions, test_features, tokenizer, max_length)

The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


BLEU-1: 0.538462
BLEU-2: 0.211830
BLEU-3: 0.000000
BLEU-4: 0.000000
BLEU-1: 0.538462
BLEU-2: 0.211830
BLEU-3: 0.000000
BLEU-4: 0.000000
BLEU-1: 0.461538
BLEU-2: 0.196116
BLEU-3: 0.000000
BLEU-4: 0.000000
BLEU-1: 0.489362
BLEU-2: 0.213359
BLEU-3: 0.000000
BLEU-4: 0.000000
BLEU-1: 0.483333
BLEU-2: 0.229624
BLEU-3: 0.127914
BLEU-4: 0.000000
BLEU-1: 0.500000
BLEU-2: 0.237595
BLEU-3: 0.126196
BLEU-4: 0.000000
BLEU-1: 0.539474
BLEU-2: 0.293263
BLEU-3: 0.210501
BLEU-4: 0.119182
BLEU-1: 0.505618
BLEU-2: 0.262038
BLEU-3: 0.187344
BLEU-4: 0.103728
BLEU-1: 0.515464
BLEU-2: 0.275949
BLEU-3: 0.201790
BLEU-4: 0.108326
BLEU-1: 0.500000
BLEU-2: 0.254951
BLEU-3: 0.185051
BLEU-4: 0.097475
BLEU-1: 0.487805
BLEU-2: 0.237950
BLEU-3: 0.171509
BLEU-4: 0.088839
BLEU-1: 0.507353
BLEU-2: 0.255861
BLEU-3: 0.173671
BLEU-4: 0.087437
BLEU-1: 0.503356
BLEU-2: 0.250838
BLEU-3: 0.166861
BLEU-4: 0.082579
BLEU-1: 0.506173
BLEU-2: 0.254915
BLEU-3: 0.173441
BLEU-4: 0.083449
BLEU-1: 0.508571
BLEU-2: 0.258360
BLEU-3: 0.1788

### Generating new captions

In [38]:
# load doc into memory
def load_doc(filename):
	# open the file as read only
	file = open(filename, 'r')
	# read all text
	text = file.read()
	# close the file
	file.close()
	return text

In [39]:
# load a pre-defined list of photo identifiers
def load_set(filename):
	doc = load_doc(filename)
	dataset = list()
	# process line by line
	for line in doc.split('\n'):
		# skip empty lines
		if len(line) < 1:
			continue
		# get the image identifier
		identifier = line.split('.')[0]
		dataset.append(identifier)
	return set(dataset)

In [40]:
# load clean descriptions into memory
def load_clean_descriptions(filename, dataset):
	# load document
	doc = load_doc(filename)
	descriptions = dict()
	for line in doc.split('\n'):
		# split line by white space
		tokens = line.split()
		# split id from description
		image_id, image_desc = tokens[0], tokens[1:]
		# skip images not in the set
		if image_id in dataset:
			# create list
			if image_id not in descriptions:
				descriptions[image_id] = list()
			# wrap description in tokens
			desc = 'startseq ' + ' '.join(image_desc) + ' endseq'
			# store
			descriptions[image_id].append(desc)
	return descriptions

In [41]:
# covert a dictionary of clean descriptions to a list of descriptions
def to_lines(descriptions):
	all_desc = list()
	for key in descriptions.keys():
		[all_desc.append(d) for d in descriptions[key]]
	return all_desc

In [42]:
# fit a tokenizer given caption descriptions
def create_tokenizer(descriptions):
	lines = to_lines(descriptions)
	tokenizer = Tokenizer()
	tokenizer.fit_on_texts(lines)
	return tokenizer

In [44]:
# load training dataset (6K)
filename = '../Flickr8k_text/Flickr_8k.trainImages.txt'
train = load_set(filename)
print('Dataset: %d' % len(train))
# descriptions
train_descriptions = load_clean_descriptions('descriptions.txt', train)
print('Descriptions: train=%d' % len(train_descriptions))
# prepare tokenizer
tokenizer = create_tokenizer(train_descriptions)
# save the tokenizer
dump(tokenizer, open('tokenizer.joblib', 'wb'))

Dataset: 6000
Descriptions: train=6000


In [45]:
# load the tokenizer
tokenizer = load(open('tokenizer.joblib', 'rb'))
# pre-define the max sequence length (from training)
max_length = 34

In [46]:
# load the model
model = load_model('./models/model-ep007-loss3.546-val_loss3.933.h5')

In [47]:
# extract features from each photo in the directory
def extract_features(filename):
	# load the model
	model = VGG16()
	# re-structure the model
	model = Model(inputs=model.inputs, outputs=model.layers[-2].output)
	# load the photo
	image = load_img(filename, target_size=(224, 224))
	# convert the image pixels to a numpy array
	image = img_to_array(image)
	# reshape data for the model
	image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
	# prepare the image for the VGG model
	image = preprocess_input(image)
	# get features
	feature = model.predict(image, verbose=0)
	return feature

# load and prepare the photograph
photo = extract_features('../example.jpg')

In [48]:
# generate description
description = generate_desc(model, tokenizer, photo, max_length)
print(description)

startseq black dog is running through the water endseq


In [49]:
# Removing startseq & endseq token from description
description = description.replace("startseq ", "").replace(" endseq", "")
print(description)

black dog is running through the water


In [50]:
from translate import Translator

# Create a translator object
translator = Translator(from_lang="en", to_lang="fr")

# Translate a piece of text
translation = translator.translate(description)

# Print the translation
print(translation)

un chien noir court dans l'eau


# Testing on personal photo

In [51]:
# load and prepare the photograph
photo = extract_features('../20230124_214558.jpg')

# generate description
description = generate_desc(model, tokenizer, photo, max_length)

# Removing startseq & endseq token from description
description = description.replace("startseq ", "").replace(" endseq", "")
print(description)

# Translate a piece of text
translation = translator.translate(description)

# Print the translation
print(translation)

man in red shirt is standing in front of the ocean
un homme en chemise rouge se tient devant l'océan
