In [2]:
import string
from pickle import load

In [3]:
def load_file(filename):
    """Read an entire text file and return its contents as one string.

    Args:
        filename: Path of the file to read. It is opened in text mode with the
            platform's default encoding.

    Returns:
        The complete file contents as a ``str``.
    """
    # The context manager closes the handle even if read() raises.
    with open(filename, 'r') as file:
        return file.read()

def collection_data(text):
    """Group raw caption lines by image id.

    Every usable line is split on whitespace. The first token is the image
    file name; everything from its first '.' onwards is dropped to form the
    image id. The remaining tokens are re-joined with single spaces to form
    the caption.

    Args:
        text: Full contents of the captions file, one caption per line.

    Returns:
        dict mapping image id -> list of caption strings, in the order the
        lines appear in ``text``.
    """
    mapping = dict()
    for line in text.split('\n'):
        token = line.split()
        # Skip lines shorter than two characters, and whitespace-only lines:
        # the latter produce no tokens, so token[0] would raise IndexError.
        if len(line) < 2 or not token:
            continue
        image_id = token[0].split('.')[0]
        image_desc = ' '.join(token[1:])
        mapping.setdefault(image_id, list()).append(image_desc)
    return mapping

In [4]:
def clean_data(desc):
    """Normalise every caption in ``desc`` in place.

    Each caption is split on whitespace; every word is lower-cased and
    stripped of ``string.punctuation`` characters. Words that end up with
    two or fewer characters, or that are not purely alphabetic, are dropped.
    The surviving words are re-joined with single spaces.

    Args:
        desc: dict mapping image id -> list of caption strings. The lists are
            modified in place.

    Returns:
        The same ``desc`` object that was passed in.
    """
    strip_punct = str.maketrans('', '', string.punctuation)
    for captions in desc.values():
        for idx, caption in enumerate(captions):
            kept = []
            for raw_word in caption.split():
                word = raw_word.lower().translate(strip_punct)
                if len(word) > 2 and word.isalpha():
                    kept.append(word)
            captions[idx] = ' '.join(kept)
    return desc

In [5]:
def save_text(desc,filename):
    """Write all captions to ``filename``, one per line.

    Each line has the form "<image_id> <caption>" followed by a newline, so
    the file ends with a trailing newline whenever at least one caption is
    written.

    Args:
        desc: dict mapping image id -> list of caption strings.
        filename: Path of the file to create or overwrite.
    """
    # The context manager closes (and flushes) the file even if a write fails.
    with open(filename, 'w') as file:
        for key, desc_list in desc.items():
            for caption in desc_list:
                file.write(key + ' ' + caption + '\n')

In [6]:
def to_voc(desc):
    """Collect the set of distinct whitespace-separated words in all captions.

    Args:
        desc: dict mapping image id -> list of caption strings.

    Returns:
        A ``set`` of every word that occurs in any caption.
    """
    return {
        word
        for captions in desc.values()
        for caption in captions
        for word in caption.split()
    }

In [7]:
# Load the raw captions file and group its captions by image id.
filename = 'flickr30k_images/captions.txt'
doc = load_file(filename)
description = collection_data(doc)
# clean_data edits the caption lists in place and returns the same dict, so
# `description` and `clean_desc` refer to the same (cleaned) mapping from here on.
clean_desc = clean_data(description)
# Set of distinct words across all (already cleaned) captions.
voc = to_voc(description)

In [8]:
# Number of distinct image ids found in the captions file.
len(description)

31783

In [9]:
# Write the cleaned captions to disk, one "<image_id> <caption>" line each.
# `file` is read again further down when the captions are loaded back.
file = 'clean2.txt'
save_text(clean_desc,file)

In [10]:
def load_set(filename):
    """Read a list file and return the set of identifiers it names.

    Empty lines are ignored. For every other line, the identifier is the text
    before the first '.' on that line.

    Args:
        filename: Path of the list file, one entry per line.

    Returns:
        A ``set`` of identifier strings.
    """
    non_empty_rows = (row for row in load_file(filename).split('\n') if len(row) >= 1)
    return {row.split('.')[0] for row in non_empty_rows}

In [11]:
def load_desc(filename,dataset):
    """Load saved captions for the requested image ids and add sequence markers.

    Each line of the file is expected to look like "<image_id> <caption words>".
    Captions whose id is in ``dataset`` are wrapped as
    'startseq <caption> endseq'.

    Args:
        filename: Path of the captions file, in the format ``save_text`` writes.
        dataset: Container of image ids to keep (membership is tested with ``in``).

    Returns:
        dict mapping image id -> list of wrapped caption strings, in file order.
    """
    doc = load_file(filename)
    desc = dict()
    for line in doc.split('\n'):
        token = line.split()
        # Blank lines produce no tokens, so token[0] would raise IndexError.
        # The file written by save_text ends with a newline, so the final
        # split element is always such a blank line.
        if not token:
            continue
        image_id, image_desc = token[0], token[1:]
        if image_id in dataset:
            desc_end = 'startseq ' + ' '.join(image_desc) + ' endseq'
            desc.setdefault(image_id, list()).append(desc_end)
    return desc

In [12]:
def load_features(filename,dataset):
    """Load pickled features and keep only the entries for ``dataset``.

    Args:
        filename: Path of a pickle file holding a mapping keyed by image id.
        dataset: Iterable of image ids to select.

    Returns:
        dict mapping each id in ``dataset`` to its entry from the pickle.

    Raises:
        KeyError: If an id in ``dataset`` is missing from the pickled mapping.
    """
    # NOTE(review): unpickling can execute arbitrary code; only load files
    # from a trusted source.
    # The context manager closes the handle even if unpickling fails.
    with open(filename, 'rb') as handle:
        all_features = load(handle)
    features = {k: all_features[k] for k in dataset}
    return features

In [13]:
# Build the set of image ids from the list file (text before the first '.'
# on each non-empty line).
trainfile = 'flickr30k_images/train.txt'
train = load_set(trainfile)
print("Dataset : {}".format(len(train)))

Dataset : 31783


In [14]:
# Reload the cleaned captions for those ids; load_desc wraps each caption in
# 'startseq' / 'endseq' markers.
train_desc = load_desc(file,train)
print("Descriptions : {}".format(len(train_desc)))

Descriptions : 31783


In [15]:
# Select the pickled feature entry for every id in `train`.
photo_features = load_features('Image_Caption_Features_Extract2.pkl',train)
print("Features = {}".format(len(photo_features)))

Features = 31783


In [16]:
from numpy import array
import keras
import tensorflow
from pickle import load 
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import plot_model , to_categorical
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dense , Input , LSTM , Embedding , Dropout
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import add 
from tensorflow.keras.preprocessing.image import load_img , img_to_array 
from tensorflow.keras.applications.mobilenet import preprocess_input
import pickle

In [17]:
def to_lines(desc):
    """Flatten a caption mapping into a single list of captions.

    Args:
        desc: dict mapping image id -> list of caption strings.

    Returns:
        A list containing every caption, in dict order and then list order.
    """
    flattened = []
    for captions in desc.values():
        flattened.extend(captions)
    return flattened

In [18]:
# Flatten all training captions into one list and show how many there are.
all_desc = to_lines(train_desc)
print(len(all_desc))

158915


In [19]:
def create_tokenizer(desc):
    """Fit a fresh ``Tokenizer`` on every caption in ``desc``.

    Args:
        desc: dict mapping image id -> list of caption strings.

    Returns:
        The ``Tokenizer`` instance after ``fit_on_texts`` has been called with
        the flattened caption list.
    """
    captions = to_lines(desc)
    fitted = Tokenizer()
    fitted.fit_on_texts(captions)
    return fitted

In [20]:
# Fit the tokenizer on all training captions.
tokenizer = create_tokenizer(train_desc)

In [21]:
# Persist the fitted tokenizer. The context manager closes (and flushes) the
# file; the bare open() call used before left the handle to the garbage collector.
with open('tokenizer2.p', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

In [22]:
# Longest caption, counted in whitespace-separated words (the startseq/endseq
# markers are part of each caption here). Used as the padded input length.
max_length = max(len(caption.split()) for caption in all_desc)
max_length

66

In [23]:
# One more than the number of indexed words (presumably so that index 0 stays
# free for padding; the Embedding layer below uses mask_zero=True).
vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)

19604


In [24]:
def create_seq(token, max_length, desc_list, photo, vocab_size=None):
    """Expand one image's captions into (image, partial caption, next word) samples.

    Each caption is encoded to an integer sequence. For every position
    ``i >= 1``, the first ``i`` tokens (padded to ``max_length``) form the
    input and token ``i`` (one-hot encoded) is the target.

    Args:
        token: Fitted tokenizer providing ``texts_to_sequences`` and
            ``word_index``. This argument is now actually used; the function
            previously ignored it and read the global ``tokenizer``.
        max_length: Length every input sequence is padded to.
        desc_list: List of caption strings for a single image.
        photo: Feature vector of that image; repeated once per sample.
        vocab_size: Number of classes for the one-hot target. Defaults to
            ``len(token.word_index) + 1`` instead of relying on a global.

    Returns:
        Tuple ``(x1, x2, y)`` of arrays: image features, padded input
        sequences and one-hot next-word targets, aligned row by row.
    """
    if vocab_size is None:
        vocab_size = len(token.word_index) + 1
    x1, x2, y = list(), list(), list()
    for desc in desc_list:
        seq = token.texts_to_sequences([desc])[0]
        # A sequence of length L yields L-1 (prefix -> next word) samples.
        for i in range(1, len(seq)):
            in_seq, out_seq = seq[:i], seq[i]
            in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
            out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
            x1.append(photo)
            x2.append(in_seq)
            y.append(out_seq)
    return array(x1), array(x2), array(y)

In [25]:
def define_model(vocab_size, max_length, feature_dim=1280, units=224, dropout_rate=0.001):
    """Build and compile the two-input caption model.

    One branch takes an image feature vector (Dropout -> Dense). The other
    takes a padded word-index sequence (Embedding with ``mask_zero=True`` ->
    Dropout -> LSTM). The branches are summed element-wise, passed through a
    Dense layer and finished with a softmax over the vocabulary.

    Args:
        vocab_size: Size of the Embedding input and of the softmax output.
        max_length: Length of the padded input sequences.
        feature_dim: Length of the image feature vector (default 1280, the
            value that used to be hard-coded).
        units: Width of the Dense, Embedding and LSTM layers (default 224).
        dropout_rate: Rate for both Dropout layers (default 0.001).

    Returns:
        The compiled model (Adam optimizer, categorical cross-entropy loss,
        accuracy metric).
    """
    # Image-feature branch.
    inputs1 = Input(shape=(feature_dim,))
    fe1 = Dropout(dropout_rate)(inputs1)
    fe2 = Dense(units, activation='relu')(fe1)
    # Partial-caption branch.
    inputs2 = Input(shape=(max_length,))
    se1 = Embedding(vocab_size, units, mask_zero=True)(inputs2)
    se2 = Dropout(dropout_rate)(se1)
    se3 = LSTM(units)(se2)
    # Merge both branches and predict the next word.
    decoder1 = add([fe2, se3])
    decoder2 = Dense(units, activation='relu')(decoder1)
    outputs = Dense(vocab_size, activation='softmax')(decoder2)
    # Use the Model class imported from tensorflow.keras, the same namespace
    # as the layers, rather than the separately imported `keras` package.
    model = Model(inputs=[inputs1, inputs2], outputs=outputs)
    model.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

In [26]:
# Build and compile the captioning model for this vocabulary and caption length.
model2 = define_model(vocab_size,max_length)

In [27]:
# Print the layer-by-layer summary of the model.
model2.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, 66)]         0           []                               
                                                                                                  
 input_1 (InputLayer)           [(None, 1280)]       0           []                               
                                                                                                  
 embedding (Embedding)          (None, 66, 224)      4391296     ['input_2[0][0]']                
                                                                                                  
 dropout (Dropout)              (None, 1280)         0           ['input_1[0][0]']                
                                                                                              

In [34]:
def data_gen(desc, photos, token, max_length , number_img):
    """Endlessly yield training batches, each built from ``number_img`` images.

    The samples of every image are accumulated, and one batch is emitted once
    ``number_img`` images have been collected. Previously only the samples of
    the last image in each group were yielded and the rest were discarded.
    The image counter carries over when the dict is exhausted, so a group may
    span the end of one pass and the start of the next.

    Args:
        desc: dict mapping image id -> list of caption strings.
        photos: Mapping from image id to that image's features; element ``[0]``
            of each entry is used as the feature vector.
        token: Tokenizer forwarded to ``create_seq``.
        max_length: Padded sequence length forwarded to ``create_seq``.
        number_img: Number of images whose samples make up one batch.

    Yields:
        ``([image_features, input_sequences], next_words)`` as a tuple, the
        ``(inputs, targets)`` form that Keras ``Model.fit`` documents for
        generators.
    """
    if not desc:
        # Nothing to iterate over: stop instead of spinning forever.
        return
    batch_img, batch_seq, batch_word = list(), list(), list()
    n = 0
    while 1:
        for key, desc_list in desc.items():
            n += 1
            photo = photos[key][0]
            in_img, in_seq, out_word = create_seq(token, max_length, desc_list, photo)
            # Collect this image's samples row by row into the current batch.
            batch_img.extend(in_img)
            batch_seq.extend(in_seq)
            batch_word.extend(out_word)
            if n == number_img:
                yield ([array(batch_img), array(batch_seq)], array(batch_word))
                batch_img, batch_seq, batch_word = list(), list(), list()
                n = 0

In [37]:
# Training schedule: `epochs` passes, with one generator step per `number_img` images.
epochs = 50
number_img = 50
# NOTE: despite its name, `batch_size` is the number of generator steps per
# pass (it is handed to `steps_per_epoch` below), not a per-batch sample count.
batch_size = len(train_desc) // number_img
for i in range(epochs):
    # A fresh generator is built for every pass, so each fit() call starts
    # again from the first image.
    generator = data_gen(train_desc, photo_features, tokenizer, max_length,number_img)
    model2.fit(generator, epochs=1, steps_per_epoch=batch_size, verbose=1)



In [38]:
# Save the model, as it stands after the training loop, to 'Best_Model.h5'.
model2.save('Best_Model.h5')