In [None]:
pip install googletrans==4.0.0-rc1

In [None]:
import os
import pickle
import re

import numpy as np
from PIL import Image
from googletrans import Translator
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout, add

### Data source: https://www.kaggle.com/datasets/adityajn105/flickr8k

In [None]:
# set up the googletrans client; it is used in the inference cells to translate
# the predicted English caption into Arabic
translator = Translator()

In [None]:
from google.colab import drive

# dataset locations inside the mounted Google Drive
images_folder = '/content/drive/MyDrive/cap_images'
captions_file = '/content/drive/MyDrive/captions.txt'

# attach Google Drive to the Colab VM so the paths above resolve
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
# VGG16 with its last layer cut off serves as the image feature extractor:
# the output of the second-to-last layer becomes the model output
vgg_full = VGG16()
model = Model(inputs=vgg_full.inputs, outputs=vgg_full.layers[-2].output)

### image data

In [None]:
# extract features from image
# Running VGG16 over the whole folder is slow, so the result is cached on Drive:
# the cache is reused when it exists, otherwise it is computed once and saved.
features_file = '/content/drive/MyDrive/features.pkl'

if os.path.exists(features_file):
    # NOTE: pickle can execute arbitrary code while loading; only load a cache
    # that this notebook wrote itself
    with open(features_file, 'rb') as f:
        features = pickle.load(f)
else:
    features = {}
    for img_id in os.listdir(images_folder):
        img_path = os.path.join(images_folder, img_id)
        # skip sub-directories and anything that is not an image file
        if not os.path.isfile(img_path) or not img_id.lower().endswith(('.jpg', '.jpeg', '.png')):
            continue
        # load the image from file at the size VGG16 is fed with here
        image = load_img(img_path, target_size=(224, 224))
        # convert image pixels to numpy array
        image = img_to_array(image)
        # add a leading batch axis of size 1
        image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
        # preprocess image for vgg
        image = preprocess_input(image)
        # extract features (`model` must still be the VGG16 extractor at this point)
        feature = model.predict(image, verbose=0)
        # image ID = file name up to the first dot, the same rule the caption
        # parsing uses, so the keys of both dictionaries line up
        image_id = img_id.split('.')[0]
        features[image_id] = feature
    # persist the features so later runs can skip the extraction
    with open(features_file, 'wb') as f:
        pickle.dump(features, f)

In [None]:
# read the previously extracted image features back from Drive
features_path = '/content/drive/MyDrive/features.pkl'
with open(features_path, 'rb') as pkl_file:
    features = pickle.load(pkl_file)

### caption data

In [None]:
# read captions from captions_file and create a dictionary from image_id to captions
img_caption_dict = {}
with open(captions_file, 'r') as f:
    next(f, None)  # skip the header line (an empty file simply yields no captions)
    for line in f:
        line = line.strip()
        if not line:
            continue
        # split on the first comma only: the caption text may contain commas itself
        image_id, caption = line.split(',', 1)
        # remove .jpg
        image_id = image_id.split('.')[0]
        # `caption` is already a single string; joining it with " " would insert a
        # space between every character, so it is stored unchanged
        img_caption_dict.setdefault(image_id, []).append(caption)

In [None]:
# number of distinct image ids that received at least one caption
len(img_caption_dict)

8091

In [None]:
# spot-check the captions stored for one image
# NOTE(review): the saved output below already carries the startseq/endseq tags that
# only caption_preprocessing adds, so this cell was apparently last run after that
# step — confirm the execution order
img_caption_dict['3767841911_6678052eb6']

['startseq baby girl in an orange dress gets wet as she stands next to water sprinkler endseq',
 'startseq blonde toddler wearing an orange dress is wet and standing beside sprinkler in yard endseq',
 'startseq child in dress is looking at sprinkler endseq',
 'startseq little girl in an orange dress is running through the sprinkler in the yard endseq',
 'startseq "on wet grass little blond girl in orange dress plays in sprinkler ." endseq']

In [None]:
# spot-check the captions stored for a second image
img_caption_dict['619169586_0a13ee7c21']

['startseq woman in brown jacket is standing on rock with forested background endseq',
 'startseq woman posing near cliff endseq',
 'startseq woman standing in front of trees and smiling endseq',
 'startseq woman stands on mountain overlooking rolling field of trees underneath blue sky endseq',
 'startseq the person poses for picture on cliff overlooking valley endseq']

In [None]:
def caption_preprocessing(img_caption_dict):
    """Normalise every caption in ``img_caption_dict`` in place.

    Each caption is lower-cased, stripped of every character that is not a
    letter or whitespace, reduced to its words longer than one character and
    wrapped in the ``startseq`` / ``endseq`` markers.

    Args:
        img_caption_dict: dict mapping an image id to a list of caption
            strings. The lists are modified in place.

    Returns:
        None.
    """
    for captions in img_caption_dict.values():
        for i, caption in enumerate(captions):
            # lowercasing
            caption = caption.lower()
            # remove chars: str.replace() would treat the character class as literal
            # text and change nothing, so it has to go through re.sub(); whitespace
            # is kept so that the words stay separated
            caption = re.sub(r'[^a-z\s]', '', caption)
            # drop one-character leftovers, which also collapses repeated spaces
            words = [word for word in caption.split() if len(word) > 1]
            # add start and end tags to the caption
            captions[i] = 'startseq ' + " ".join(words) + ' endseq'

In [None]:
# preprocess the text
# NOTE: the caption lists are rewritten in place, so running this cell a second
# time wraps every caption in another startseq/endseq pair
caption_preprocessing(img_caption_dict)

In [None]:
# peek at the first (image_id, captions) entry of the dictionary
next(iter(img_caption_dict.items()))

('1000268201_693b08cb0e',
 ['startseq child in pink dress is climbing up set of stairs in an entry way endseq',
  'startseq girl going into wooden building endseq',
  'startseq little girl climbing into wooden playhouse endseq',
  'startseq little girl climbing the stairs to her playhouse endseq',
  'startseq little girl in pink dress going into wooden cabin endseq'])

In [None]:
# flatten the per-image caption lists into one list holding every caption
Captions = [text for texts in img_caption_dict.values() for text in texts]

In [None]:
len(Captions) # total number of captions across all images (not words)

40455

In [None]:
# fit a word-level tokenizer on every caption
tokenizer = Tokenizer()
tokenizer.fit_on_texts(Captions)
# +1 leaves room for index 0, which presumably is never assigned to a word because it
# is the padding value (the Embedding layer below uses mask_zero=True) — TODO confirm
vocab_size = len(tokenizer.word_index) + 1
vocab_size # unique words + 1

8485

In [None]:
# longest caption, counted in whitespace-separated tokens (start/end tags included)
max_sequence_length = len(max((text.split() for text in Captions), key=len))
max_sequence_length

35

In [None]:
# the first 80% of the image ids (in dictionary order) are used for training,
# the remainder is held out
split_ratio = 0.8
image_ids = [*img_caption_dict]
split = int(split_ratio * len(image_ids))
training_data_ids, testing_data_ids = image_ids[:split], image_ids[split:]

In [None]:
# batch-wise training data, built lazily so the whole set never sits in memory at once
def data_generator(img_ids, img_caption_dict, image_features, tokenizer, max_sequence_length, vocab_size, batch_size):
    """Endlessly yield ``([image_features, padded_prefixes], one_hot_next_words)``.

    Every caption of every image is expanded into one sample per next-word
    position: the prefix ``sequence[:i]`` (padded to ``max_sequence_length``)
    is the input and ``sequence[i]`` (one-hot over ``vocab_size``) the target.
    A batch is emitted each time ``batch_size`` *images* have been processed,
    so the number of samples per batch varies with the caption lengths.
    Samples of a trailing partial batch are carried over into the next pass
    over ``img_ids``.
    """
    feature_rows, prefix_rows, target_rows = [], [], []
    images_seen = 0
    while True:
        for img_key in img_ids:
            images_seen += 1
            for text in img_caption_dict[img_key]:
                # caption -> list of word indices
                token_ids = tokenizer.texts_to_sequences([text])[0]
                for cut in range(1, len(token_ids)):
                    # words before position `cut` predict the word at `cut`
                    prefix = pad_sequences([token_ids[:cut]], maxlen=max_sequence_length)[0]
                    target = to_categorical([token_ids[cut]], num_classes=vocab_size)[0]
                    feature_rows.append(image_features[img_key][0])
                    prefix_rows.append(prefix)
                    target_rows.append(target)
            if images_seen != batch_size:
                continue
            # enough images collected: hand the batch over, then start a new one
            yield [np.array(feature_rows), np.array(prefix_rows)], np.array(target_rows)
            feature_rows, prefix_rows, target_rows = [], [], []
            images_seen = 0

In [None]:
# Captioning model: an image-feature branch and a text branch, merged by addition
# image feature branch: 4096-wide input (presumably the VGG16 feature vector
# extracted above — TODO confirm), dropout, then projection to 256 units
encoder_inputs = Input(shape=(4096,))
encoder_fe1 = Dropout(0.5)(encoder_inputs)
encoder_fe2 = Dense(256, activation='relu')(encoder_fe1)

# text branch: padded word-index sequence -> embedding (index 0 masked) -> dropout -> LSTM
decoder_inputs = Input(shape=(max_sequence_length,))
decoder_se1 = Embedding(vocab_size, 256, mask_zero=True)(decoder_inputs)
decoder_se2 = Dropout(0.5)(decoder_se1)
decoder_se3 = LSTM(256)(decoder_se2)

# merge both 256-wide branches element-wise and predict a distribution over the vocabulary
decoder_merged = add([encoder_fe2, decoder_se3])
decoder_dense1 = Dense(256, activation='relu')(decoder_merged)
decoder_outputs = Dense(vocab_size, activation='softmax')(decoder_dense1)


# NOTE: this rebinds `model`, which until here referred to the VGG16 feature extractor
model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_outputs)
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [None]:
# train the model
epochs = 10
# batch_size counts images, not samples: data_generator yields once per batch_size images
batch_size = 32
steps = len(training_data_ids) // batch_size

for i in range(epochs):
    # create a fresh data generator so every epoch starts at the first training image
    generator = data_generator(training_data_ids, img_caption_dict, features, tokenizer, max_sequence_length, vocab_size, batch_size)
    # fit for one epoch
    model.fit(generator, epochs=1, steps_per_epoch=steps, verbose=1)



In [None]:
# save the trained captioning model to Drive
# NOTE: despite the file name, this is simply the model as it is after the last
# epoch; no checkpoint selection happens in the training loop
model.save('/content/drive/MyDrive/best_model.h5')

In [None]:
def idx_to_word(integer, tokenizer):
    """Return the word that ``tokenizer.word_index`` maps to ``integer``.

    The first matching word in dictionary order is returned; ``None`` is
    returned when no word carries that index.
    """
    matches = (word for word, index in tokenizer.word_index.items() if index == integer)
    return next(matches, None)

In [None]:
# greedy caption generation for a single image
def predict_caption(model, image, tokenizer, max_length):
    """Generate a caption word by word, always taking the most probable next word.

    Starting from ``startseq``, the text generated so far is tokenised, padded
    to ``max_length`` and passed to ``model`` together with ``image``. The
    arg-max of the prediction is mapped back to a word and appended.
    Generation stops after ``max_length`` steps, when the predicted index has
    no word, or once ``endseq`` has been appended.

    Returns:
        The generated text, including ``startseq`` and, if it was reached,
        ``endseq``.
    """
    words = ['startseq']
    for _step in range(max_length):
        # text so far -> word indices -> fixed-length model input
        token_ids = tokenizer.texts_to_sequences([" ".join(words)])[0]
        padded = pad_sequences([token_ids], max_length)
        # most probable next word according to the model
        probabilities = model.predict([image, padded], verbose=0)
        next_word = idx_to_word(np.argmax(probabilities), tokenizer)
        # an index without a word ends the caption
        if next_word is None:
            break
        words.append(next_word)
        # the end tag closes the caption
        if next_word == 'endseq':
            break

    return " ".join(words)

In [None]:
# VGG16 feature extractor for inference: full network in, second-to-last layer out
vgg_base = VGG16()
vgg_model = Model(inputs=vgg_base.inputs, outputs=vgg_base.layers[-2].output)

In [None]:
image_path = 'yandere.jpg'
# load image
image = load_img(image_path, target_size=(224, 224))
# convert image pixels to numpy array
image = img_to_array(image)
# reshape data for model
image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
# preprocess image for vgg
image = preprocess_input(image)
# extract features
feature = vgg_model.predict(image, verbose=0)
# predict 
predicted_caption = predict_caption(model, feature, tokenizer, 35).replace('startseq', '').replace('endseq', '').strip()
translated_caption = translator.translate(predicted_caption, src="en", dest="ar")
translated_text = translated_caption.text
translated_text

'رجل في القميص الوردي يقف في الشارع'

In [None]:
image_path = 'nurse.jpg'
# load image
image = load_img(image_path, target_size=(224, 224))
# convert image pixels to numpy array
image = img_to_array(image)
# reshape data for model
image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
# preprocess image for vgg
image = preprocess_input(image)
# extract features
feature = vgg_model.predict(image, verbose=0)
# predict 
predicted_caption = predict_caption(model, feature, tokenizer, 35).replace('startseq', '').replace('endseq', '').strip()
translated_caption = translator.translate(predicted_caption, src="en", dest="ar")
translated_text = translated_caption.text
translated_text

'امرأة في القميص الوردي تقف في الشارع'

In [None]:
image_path = 'girl.jpg'
# load image
image = load_img(image_path, target_size=(224, 224))
# convert image pixels to numpy array
image = img_to_array(image)
# reshape data for model
image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
# preprocess image for vgg
image = preprocess_input(image)
# extract features
feature = vgg_model.predict(image, verbose=0)
# predict 
predicted_caption = predict_caption(model, feature, tokenizer, 35).replace('startseq', '').replace('endseq', '').strip()
translated_caption = translator.translate(predicted_caption, src="en", dest="ar")
translated_text = translated_caption.text
translated_text

'فتاة صغيرة في اللباس الوردي تلعب مع خرطوم'

In [None]:
image_path = 'leon.jpg'
# load image
image = load_img(image_path, target_size=(224, 224))
# convert image pixels to numpy array
image = img_to_array(image)
# reshape data for model
image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
# preprocess image for vgg
image = preprocess_input(image)
# extract features
feature = vgg_model.predict(image, verbose=0)
# predict 
predicted_caption = predict_caption(model, feature, tokenizer, 35).replace('startseq', '').replace('endseq', '').strip()
translated_caption = translator.translate(predicted_caption, src="en", dest="ar")
translated_text = translated_caption.text
translated_text

'رجل في قميص أزرق وسترة زرقاء يسير في الشارع'

In [None]:
image_path = 'human.jpg'
# load image
image = load_img(image_path, target_size=(224, 224))
# convert image pixels to numpy array
image = img_to_array(image)
# reshape data for model
image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
# preprocess image for vgg
image = preprocess_input(image)
# extract features
feature = vgg_model.predict(image, verbose=0)
# predict 
predicted_caption = predict_caption(model, feature, tokenizer, 35).replace('startseq', '').replace('endseq', '').strip()
translated_caption = translator.translate(predicted_caption, src="en", dest="ar")
translated_text = translated_caption.text
translated_text

'رجل يرتدي قميصًا أسود وقميصًا أسود يتحدث'

In [None]:
image_path = 'dragon.jpg'
# load image
image = load_img(image_path, target_size=(224, 224))
# convert image pixels to numpy array
image = img_to_array(image)
# reshape data for model
image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
# preprocess image for vgg
image = preprocess_input(image)
# extract features
feature = vgg_model.predict(image, verbose=0)
# predict 
predicted_caption = predict_caption(model, feature, tokenizer, 35).replace('startseq', '').replace('endseq', '').strip()
translated_caption = translator.translate(predicted_caption, src="en", dest="ar")
translated_text = translated_caption.text
translated_text

'رجل يرتدي قميصًا أحمر معلقًا على الجدار'

In [None]:
image_path = 'harrypoter.jpg'
# load image
image = load_img(image_path, target_size=(224, 224))
# convert image pixels to numpy array
image = img_to_array(image)
# reshape data for model
image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
# preprocess image for vgg
image = preprocess_input(image)
# extract features
feature = vgg_model.predict(image, verbose=0)
# predict 
predicted_caption = predict_caption(model, feature, tokenizer, 35).replace('startseq', '').replace('endseq', '').strip()
translated_caption = translator.translate(predicted_caption, src="en", dest="ar")
translated_text = translated_caption.text
translated_text

'رجل في قميص أسود وقميص أسود يلعب مع رأسه في الهواء'

In [None]:
image_path = 'skating.jpg'
# load image
image = load_img(image_path, target_size=(224, 224))
# convert image pixels to numpy array
image = img_to_array(image)
# reshape data for model
image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
# preprocess image for vgg
image = preprocess_input(image)
# extract features
feature = vgg_model.predict(image, verbose=0)
# predict 
predicted_caption = predict_caption(model, feature, tokenizer, 35).replace('startseq', '').replace('endseq', '').strip()
translated_caption = translator.translate(predicted_caption, src="en", dest="ar")
translated_text = translated_caption.text
translated_text

'تم طرح الرجل في الجزء العلوي من القارب في الجزء العلوي من القارب'

In [None]:
image_path = 'dog.jpg'
# load image
image = load_img(image_path, target_size=(224, 224))
# convert image pixels to numpy array
image = img_to_array(image)
# reshape data for model
image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
# preprocess image for vgg
image = preprocess_input(image)
# extract features
feature = vgg_model.predict(image, verbose=0)
# predict 
predicted_caption = predict_caption(model, feature, tokenizer, 35).replace('startseq', '').replace('endseq', '').strip()
translated_caption = translator.translate(predicted_caption, src="en", dest="ar")
translated_text = translated_caption.text
translated_text

'يمتد الكلب على طول الشاطئ'

In [None]:
image_path = 'old.jpg'
# load image
image = load_img(image_path, target_size=(224, 224))
# convert image pixels to numpy array
image = img_to_array(image)
# reshape data for model
image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
# preprocess image for vgg
image = preprocess_input(image)
# extract features
feature = vgg_model.predict(image, verbose=0)
# predict 
predicted_caption = predict_caption(model, feature, tokenizer, 35).replace('startseq', '').replace('endseq', '').strip()
translated_caption = translator.translate(predicted_caption, src="en", dest="ar")
translated_text = translated_caption.text
translated_text

'رجل وامرأة يسيران على روكي هيلسايد'

In [None]:
image_path = 'man.jpg'
# load image
image = load_img(image_path, target_size=(224, 224))
# convert image pixels to numpy array
image = img_to_array(image)
# reshape data for model
image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
# preprocess image for vgg
image = preprocess_input(image)
# extract features
feature = vgg_model.predict(image, verbose=0)
# predict 
predicted_caption = predict_caption(model, feature, tokenizer, 35).replace('startseq', '').replace('endseq', '').strip()
translated_caption = translator.translate(predicted_caption, src="en", dest="ar")
translated_text = translated_caption.text
translated_text

'امرأة مع نظارات ونظارات تبتسم'