In [1]:
import tensorflow
import keras

print('tensorflow: %s' % tensorflow.__version__)
print('keras: %s' % keras.__version__)

tensorflow: 2.9.2
keras: 2.9.0


In [25]:
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.utils import plot_model
from keras.models import Model
from keras.layers import Input
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding
from keras.layers import Dropout
from tensorflow.keras.layers import concatenate
from keras.callbacks import ModelCheckpoint

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:

from os import listdir
from pickle import dump
from keras.applications.vgg16 import VGG16
from tensorflow.keras.preprocessing.image import load_img
from tensorflow.keras.preprocessing.image import img_to_array
from keras.applications.vgg16 import preprocess_input
from keras.models import Model

def extract_features(directory):
    model = VGG16()
    model = Model(inputs=model.inputs, outputs=model.layers[-2].output)
    print(model.summary())
    features = dict()
    for name in listdir(directory):
        filename = directory + '/' + name
        image = load_img(filename, target_size = (224,224))
        image = img_to_array(image)
        image = image.reshape(1, image.shape[0], image.shape[1], image.shape[2])
        image = preprocess_input(image)
        feature = model.predict(image, verbose = 0)
        image_id = name.split('.')[0]
        features[image_id] = feature
        #print('>%s' % name)
    return features
    
directory = '/content/drive/My Drive/flickr8k/images'
features = extract_features(directory)
print('Extracted features: %d' % len(features))

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels.h5
Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 224, 224, 3)]     0         
                                                                 
 block1_conv1 (Conv2D)       (None, 224, 224, 64)      1792      
                                                                 
 block1_conv2 (Conv2D)       (None, 224, 224, 64)      36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, 112, 112, 64)      0         
                                                                 
 block2_conv1 (Conv2D)       (None, 112, 112, 128)     73856     
                                                                 
 block2_conv2 (Conv2D)       (None, 112, 112, 128)     14758

In [17]:
#dump(features, open('/content/drive/My Drive/flickr8k/features.pkl','wb'))
with open('/content/drive/My Drive/flickr8k/features.pkl', 'wb') as outfile:
    dump(features, outfile)
print('features stored in features.pkl file')

features stored in features.pkl file


In [5]:
#import pickle

#with open('/content/drive/My Drive/flickr8k/features.pkl', 'rb') as infile:
#    features = pickle.load(infile)
#print('features stored in features.pkl file')
#dir = '/content/drive/My Drive/flickr8k/'

In [6]:
import pandas as pd

df = pd.read_csv('/content/drive/My Drive/flickr8k/captions.txt')
df.head()

Unnamed: 0,image,caption
0,1000268201_693b08cb0e.jpg,A child in a pink dress is climbing up a set o...
1,1000268201_693b08cb0e.jpg,A girl going into a wooden building .
2,1000268201_693b08cb0e.jpg,A little girl climbing into a wooden playhouse .
3,1000268201_693b08cb0e.jpg,A little girl climbing the stairs to her playh...
4,1000268201_693b08cb0e.jpg,A little girl in a pink dress going into a woo...


In [7]:
import string

def load_doc(filename):
    file = open(filename, 'r')
    text = file.read()
    file.close()
    return text

# can use NLP here!!
def load_descriptions(doc):
    mapping = dict()
    for line in doc.split('\n'):
        tokens = line.split()
        if len(line) < 2:
            continue
        image_id, image_desc = tokens[0], tokens[1:]
        image_id = image_id.split('.')[0]
        image_desc = ' '.join(image_desc)
        if image_id not in mapping:
            mapping[image_id] = list()
        mapping[image_id].append(image_desc)
    return mapping

def clean_descriptions(descriptions):
    table = str.maketrans('','',string.punctuation)
    for key, desc_list in descriptions.items():
        for i in range(len(desc_list)):
            desc = desc_list[i]
            desc = desc.split()
            desc = [word.lower() for word in desc]
            desc = [w.translate(table) for w in desc]
            desc = [word for word in desc if len(word) > 1]
            desc = [word for word in desc if word.isalpha()]
            desc_list[i] = ' '.join(desc)

def to_vocabulary(descriptions):
    all_desc = set()
    for key in descriptions.keys():
        [all_desc.update(d.split()) for d in descriptions[key]]
    return all_desc

def save_descriptions(descriptions, filename):
    lines = list()
    for key, desc_list in descriptions.items():
        for desc in desc_list:
            lines.append(key + ' ' + desc)
    data = '\n'.join(lines)
    file = open(filename, 'w')
    file.write(data)
    file.close()

filename = '/content/drive/My Drive/flickr8k/Flickr8k_text/Flickr8k.token.txt'
doc = load_doc(filename)
descriptions = load_descriptions(doc)
print('Loaded: %d' % len(descriptions))
clean_descriptions(descriptions)
voc = to_vocabulary(descriptions)
print('Vocabulary size: %d' % len(voc))

save_descriptions(descriptions, '/content/drive/My Drive/flickr8k/descriptions.txt')

Loaded: 8092
Vocabulary size: 8680


In [9]:
print(len(descriptions))

8091


In [15]:
directory = '/content/drive/My Drive/flickr8k/'
print(directory)

/content/drive/My Drive/flickr8k/


In [18]:
from pickle import load
 
# load doc into memory
def load_doc(filename):
    file = open(filename, 'r')
    text = file.read()
    file.close()
    return text
 
# load a pre-defined list of photo identifiers
def load_set(filename):
    doc = load_doc(filename)
    dataset = list()
    # process line by line
    for line in doc.split('\n'):
 # skip empty lines
        if len(line) < 1:
            continue
 # get the image identifier
        identifier = line.split('.')[0]
        dataset.append(identifier)
    return set(dataset)
 
# load clean descriptions into memory
def load_clean_descriptions(filename, dataset):
 # load document
    doc = load_doc(filename)
    descriptions = dict()
    for line in doc.split('\n'):
 # split line by white space
        tokens = line.split()
 # split id from description
        image_id, image_desc = tokens[0], tokens[1:]
 # skip images not in the set
        if image_id in dataset:
 # create list
            if image_id not in descriptions:
                descriptions[image_id] = list()
 # wrap description in tokens
            desc = 'startseq ' + ' '.join(image_desc) + ' endseq'
 # store
            descriptions[image_id].append(desc)
    return descriptions
 
# load photo features
def load_photo_features(filename, dataset):
 # load all features
    all_features = load(open(filename, 'rb'))
 # filter features
    features = {k: all_features[k] for k in dataset}
    return features
 
# load training dataset (6K)
filename = directory + 'Flickr8k_text/Flickr_8k.trainImages.txt'
print(filename)
train = load_set(filename)
print('Dataset: %d' % len(train))
# descriptions
train_descriptions = load_clean_descriptions(directory + 'descriptions.txt', train)
print('Descriptions: train=%d' % len(train_descriptions))
# photo features
train_features = load_photo_features(directory + 'features.pkl', train)
print('Photos: train=%d' % len(train_features))

/content/drive/My Drive/flickr8k/Flickr8k_text/Flickr_8k.trainImages.txt
Dataset: 6000
Descriptions: train=6000
Photos: train=6000


In [50]:
from keras.utils.np_utils import to_categorical
from keras.utils.data_utils import pad_sequences


def to_lines(descriptions):
    all_desc = list()
    for key in descriptions.keys():
        [all_desc.append(d) for d in descriptions[key]]
    return all_desc
  
def create_tokenizer(desc):
    lines = to_lines(desc)
    tokenizer = tensorflow.keras.preprocessing.text.Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

tokenizer = create_tokenizer(train_descriptions)
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)

def max_length_function(desc):
    lines = to_lines(desc)
    return max(len(d.split()) for d in lines)

def create_sequences(tokenizer, max_length, desc_list, photo):
    X1, X2, y = [], [], []
    for desc in desc_list:
        seq = tokenizer.texts_to_sequences([desc])[0]
        for i in range(1, len(seq)):
            in_seq, out_seq = seq[:i], seq[i]
            in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
            out_seq = to_categorical([out_seq], num_classes = vocab_size)[0]
            X1.append(photo)
            X2.append(in_seq)
            y.append(out_seq)
    return array(X1), array(X2), array(y)

Vocabulary Size: 7507


In [34]:
def define_model(vocab_size, max_length):
    input1 = Input(shape=(4096,))
    fe1 = Dropout(0.5)(input1)
    fe2 = Dense(256, activation='relu')(fe1)
    input2 = Input(shape=(max_length,))
    se1 = Embedding(vocab_size, 256, mask_zero=True)(input2)
    se2 = Dropout(0.5)(se1)
    se3 = LSTM(256)(se2)
    decoder1 = concatenate([fe2, se3])
    decoder2 = Dense(256, activation = 'relu')(decoder1)
    outputs = Dense(vocab_size, activation = 'softmax')(decoder2)
    model = Model(inputs=[input1, input2], outputs=outputs)
    model.compile(loss = 'categorical_crossentropy', optimizer = 'adam')
    print(model.summary())
    return model

In [46]:
def data_generator(descriptions, photos, tokenizer, max_length):
    while 1:
        for key, desc_list in descriptions.items():
            photo = photos[key][0]
            in_img, in_seq, out_word = create_sequences(tokenizer, max_length, desc_list, photo)
            yield [[in_img, in_seq], out_word]

In [48]:
filename = directory + 'Flickr8k_text/Flickr_8k.trainImages.txt'
train = load_set(filename)
print('Dataset: %d' % len(train))
# descriptions
train_descriptions = load_clean_descriptions(directory + 'descriptions.txt', train)
print('Descriptions: train=%d' % len(train_descriptions))
# photo features
train_features = load_photo_features(directory + 'features.pkl', train)
print('Photos: train=%d' % len(train_features))
# prepare tokenizer
tokenizer = create_tokenizer(train_descriptions)
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)
# determine the maximum sequence length
max_length = max_length_function(train_descriptions)
print('Description Length: %d' % max_length)
# prepare sequences
#X1train, X2train, ytrain = create_sequences(tokenizer, max_length, train_descriptions, train_features, vocab_size)

Dataset: 6000
Descriptions: train=6000
Photos: train=6000
Vocabulary Size: 7507
Description Length: 33


In [52]:
model = define_model(vocab_size, max_length)
epochs = 1
steps = len(train_descriptions)
for i in range(1):
    generator = data_generator(train_descriptions, train_features, tokenizer, max_length)
    model.fit_generator(generator, epochs=1, steps_per_epoch=steps, verbose=1)
    model.save(directory + 'model_' + str(i) + '.h5')

Model: "model_12"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_25 (InputLayer)          [(None, 33)]         0           []                               
                                                                                                  
 input_24 (InputLayer)          [(None, 4096)]       0           []                               
                                                                                                  
 embedding_11 (Embedding)       (None, 33, 256)      1921792     ['input_25[0][0]']               
                                                                                                  
 dropout_22 (Dropout)           (None, 4096)         0           ['input_24[0][0]']               
                                                                                           

  model.fit_generator(generator, epochs=1, steps_per_epoch=steps, verbose=1)




In [59]:
from numpy import argmax
from pickle import load
from keras.preprocessing.text import Tokenizer
from keras.models import load_model
from nltk.translate.bleu_score import corpus_bleu


def word_for_id(integer, tokenizer):
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None

def generate_desc(model, tokenizer, photo, max_length):
    in_text = 'startseq'
    for i in range(max_length):
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], maxlen = max_length)
        yhat = model.predict([photo,sequence], verbose = 0)
        yhat = argmax(yhat)
        word = word_for_id(yhat, tokenizer)
        if word is None:
            break
        in_text += ' ' + word
        if word == 'endseq':
            break
    return in_text
  
def evaluate_model(model, desc, photos, tokenizer, max_length):
    actual, pred = [], []
    for key, desc_list in desc.items():
        yhat = generate_desc(model, tokenizer, photos[key], max_length)
        references = [d.split() for d in desc_list]
        actual.append(references)
        pred.append(yhat.split())
    print('BLEU-1: %f' % corpus_bleu(actual, pred, weights=(1.0,0,0,0)))
    print('BLEU-2: %f' % corpus_bleu(actual, pred, weights=(0.5,0.5,0,0)))
    print('BLEU-3: %f' % corpus_bleu(actual, pred, weights=(0.3,0.3,0.3,0)))
    print('BLEU-4: %f' % corpus_bleu(actual, pred, weights=(0.25,0.25,0.25,0.25)))

filename = directory + 'Flickr8k_text/Flickr_8k.trainImages.txt'
train = load_set(filename)
print('Dataset: %d' % len(train))
# descriptions
train_descriptions = load_clean_descriptions(directory + 'descriptions.txt', train)
print('Descriptions: train=%d' % len(train_descriptions))
# photo features
train_features = load_photo_features(directory + 'features.pkl', train)
print('Photos: train=%d' % len(train_features))
# prepare tokenizer
tokenizer = create_tokenizer(train_descriptions)
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)
# determine the maximum sequence length
max_length = max_length_function(train_descriptions)
print('Description Length: %d' % max_length)

filename = directory + 'Flickr8k_text/Flickr_8k.testImages.txt'
test = load_set(filename)
print('Dataset: %d' % len(test))
# descriptions
test_descriptions = load_clean_descriptions(directory + 'descriptions.txt', test)
print('Descriptions: test=%d' % len(test_descriptions))
# photo features
test_features = load_photo_features(directory + 'features.pkl', test)
print('Photos: test=%d' % len(test_features))

filename = directory + 'model_0.h5'
model = load_model(filename)
evaluate_model(model, test_descriptions, test_features, tokenizer, max_length)

Dataset: 6000
Descriptions: train=6000
Photos: train=6000
Vocabulary Size: 7507
Description Length: 33
Dataset: 1000
Descriptions: test=1000
Photos: test=1000


KeyboardInterrupt: ignored

In [69]:
#Generate Captions for a Fresh Image

from pickle import load
from numpy import argmax
#from keras.preprocessing.sequence import pad_sequences
from keras.applications.vgg16 import VGG16
#from keras.preprocessing.image import load_img
#from keras.preprocessing.image import img_to_array
from keras.applications.vgg16 import preprocess_input
from keras.models import Model
from keras.models import load_model
import matplotlib.pyplot as plt

def extract_features_file(filename):
	# load the model
	model = VGG16()
	# re-structure the model
	model.layers.pop()
	model = Model(inputs=model.inputs, outputs=model.layers[-1].output)
	# load the photo
	image = load_img(filename, target_size=(224, 224))
	# convert the image pixels to a numpy array
	image = img_to_array(image)
	# reshape data for the model
	image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
	# prepare the image for the VGG model
	image = preprocess_input(image)
	# get features
	feature = model.predict(image, verbose=0)
	return feature

max_length2 = 33
npic = 5
npix = 224
count = 1
model = load_model(directory + 'model_0.h5')
fig = plt.figure(figsize=(10,20))
i = 0
for k in descriptions.keys():
    pic = extract_features_file(directory + '/images/' + k + '.jpg')
    pred_caption = generate_desc(model, tokenizer, pic, max_length2)
    image_load = load_img(directory + '/images/' + k + '.jpg', target_size=(224, 224,3))
    ax = fig.add_subplot(npic,2,count,xticks=[],yticks=[])
    ax.imshow(image_load)
    count += 1

    ## captions
    ax = fig.add_subplot(npic,2,count)
    plt.axis('off')
    ax.plot()
    ax.set_xlim(0,1)
    ax.set_ylim(0,1)
    ax.text(0,0.5,pred_caption,fontsize=20)
    ax.text(0,0.5,descriptions[k][0],fontsize=20)
    count += 1
    i += 1
    if i == 5:
        break

plt.show()



ValueError: ignored

<Figure size 720x1440 with 0 Axes>