In [None]:
!pip install pycocotools

from pycocotools.coco import COCO
import numpy as np
import skimage.io as io
import matplotlib.pyplot as plt
import pylab

import random
import string

import cv2
import os
from pickle import dump, load
import json

import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords

from tensorflow.keras import backend as K
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Dense, GRU, Embedding, add
from tensorflow.keras.applications import VGG16
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tqdm.notebook import tqdm
pylab.rcParams['figure.figsize'] = (8.0, 10.0)

In [None]:
from tensorflow.keras.layers import Dropout, LSTM
from tensorflow.keras.utils import to_categorical

In [None]:
# Build the COCO index from the train2017 *instances* annotation file; it is used
# below for category lookups and for mapping categories to image ids.
coco = COCO("/kaggle/input/cocods/annotations_trainval2017/annotations/instances_train2017.json")

#### Finding Categories and Sub-Categories

In [None]:
# Load every category record and collapse their 'supercategory' fields to the unique values.
cats = coco.loadCats(coco.getCatIds())
maincategories = list({record['supercategory'] for record in cats})

print("Number of main categories: ", len(maincategories))
print("List of main categories: ", maincategories)

In [None]:
# The 'name' field of each category record is the fine-grained (sub) category label.
subcategories = list(map(lambda record: record['name'], cats))

print("Number of sub categories: ", len(subcategories))
print("List of sub categories: ", subcategories)

In [None]:
catIds = coco.getCatIds(catNms=subcategories)

# Pair each sub-category name with the id found at the same position in catIds.
subcategories_Ids = {name: catIds[pos] for pos, name in enumerate(subcategories)}

print("Sub categories with IDs :",subcategories_Ids)

In [None]:
# For every category id, collect the ids of all images returned for that category.
subcategories_imageIds = {}
for pos, cat_id in enumerate(catIds):
    subcategories_imageIds[subcategories[pos]] = list(coco.getImgIds(catIds=cat_id))

# Number of images per sub-category.
length_dict = {}
for name, ids in subcategories_imageIds.items():
    length_dict[name] = len(ids)
print("Total images in each sub categories: ", length_dict)

Only images from the `bicycle` sub-category are used, due to computational limitations.

In [None]:
# Restrict the working set to the image ids recorded for the 'bicycle' sub-category.
train_cats = subcategories_imageIds['bicycle']
imgIdss = coco.getImgIds(imgIds=train_cats)
print("Total Images: ", len(imgIdss))

In [None]:
# Preview a random 3x3 grid of the selected images.
fig = plt.gcf()
fig.set_size_inches(9, 9)

# Shuffle a *copy* of the id list. `next_pix = imgIdss` only creates an alias, so
# shuffling it in place would also reorder imgIdss and make the order in which the
# dataset is built later depend on this preview cell.
next_pix = random.sample(imgIdss, len(imgIdss))

for i, img_id in enumerate(next_pix[0:9]):

    sp = plt.subplot(3, 3, i + 1)
    sp.axis('Off')

    img = coco.loadImgs(img_id)[0]
    # The image is read from the URL stored in the annotation record.
    I = io.imread(img['coco_url'])
    plt.imshow(I)

plt.show()

In [None]:
# A second COCO index, built from the train2017 *captions* annotation file.
annFile = "../input/cocods/annotations_trainval2017/annotations/captions_train2017.json"
coco_caps = COCO(annFile)

In [None]:
# Display one of the shuffled images together with its caption annotations.
img = coco.loadImgs(next_pix[2])[0]
I = io.imread(img['coco_url'])
plt.imshow(I)

annIds = coco_caps.getAnnIds(imgIds=img['id'])
anns = coco_caps.loadAnns(annIds)
coco_caps.showAnns(anns)
plt.show()

#### Preparing Dataset

In [None]:
# Translation table that deletes every ASCII punctuation character.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def _clean_caption(text):
    """Normalise one raw caption and wrap it in '<start>' / '<end>' markers.

    Hyphens are replaced by spaces *before* punctuation is stripped. Doing it
    the other way round deletes the hyphen first (it is part of
    string.punctuation), which glues hyphenated words together.
    """
    text = text.replace("-", " ")
    text = text.translate(_PUNCTUATION_TABLE)
    words = [word.lower() for word in text.split()]
    return '<start> ' + " ".join(words) + ' <end>'


# Maps image URL -> list of cleaned captions for that image.
dataset = dict()

for imgid in imgIdss:
    img = coco.loadImgs(imgid)[0]
    annIds = coco_caps.getAnnIds(imgIds=img['id'])
    anns = coco_caps.loadAnns(annIds)
    imgcaptions = [_clean_caption(cap['caption']) for cap in anns]
    dataset[img['coco_url']] = imgcaptions


print("Length of Dataset: ",len(dataset))
print(dataset['http://images.cocodataset.org/train2017/000000047084.jpg'])

#### Tokenizing Captions

In [None]:
from itertools import chain

# One flat list holding every caption of every image; the tokenizer is fitted on it.
flatten_list = list(chain(*dataset.values()))

tokenizer = Tokenizer(oov_token='<oov>')
tokenizer.fit_on_texts(flatten_list)
# One more than the number of indexed words.
total_words = 1 + len(tokenizer.word_index)

print("Vocabulary length: ", total_words)
print("Bicycle ID: ", tokenizer.word_index['bicycle'])
print("Airplane ID: ", tokenizer.word_index['airplane'])

In [None]:
# VGG16 including its classifier head; the output of the 'fc2' layer is used as
# the feature representation of an image.
model = VGG16(include_top=True)
transfer_layer = model.get_layer('fc2')
image_model_transfer = Model(inputs=model.input, outputs=transfer_layer.output)

# Maps image URL -> feature array predicted for that image.
image_features = {}

for img in dataset:
    image = io.imread(img)
    # Anything that is not a 3-D array is treated as grayscale and expanded to RGB.
    if image.ndim != 3:
        image = cv2.cvtColor(image, cv2.COLOR_GRAY2RGB)

    # Resize to 224x224, add a leading batch axis, then scale pixel values by 1/255.
    # NOTE: extract_features() applies the same scaling at inference time; keep both in sync.
    image = np.expand_dims(cv2.resize(image, (224, 224)), axis=0) / 255.0

    image_features[img] = image_model_transfer.predict(image, verbose=0)

print("Image features length: ", len(image_features))

In [None]:
# Sanity check: shape of the feature array stored for one specific training image.
image_features['http://images.cocodataset.org/train2017/000000047084.jpg'].shape

In [None]:
def dict_to_list(descriptions):
    """Flatten a mapping of key -> list of descriptions into a single list.

    Args:
        descriptions: dict whose values are iterables of descriptions.

    Returns:
        A new list with all descriptions, in dict iteration order.
    """
    # A plain nested comprehension: the previous version used a comprehension
    # purely for its append() side effect and threw away a list of None.
    return [d for key in descriptions for d in descriptions[key]]

def max_length(descriptions):
    """Return the largest number of whitespace-separated tokens in any description.

    Args:
        descriptions: dict whose values are lists of description strings.

    Returns:
        The maximum word count over all descriptions, or 0 when there are no
        descriptions at all (instead of letting max() raise ValueError).
    """
    word_counts = (
        len(d.split())
        for desc_list in descriptions.values()
        for d in desc_list
    )
    return max(word_counts, default=0)
    
# NOTE: this rebinds the name `max_length` from the function defined above to its
# integer result; after this line the function is no longer reachable by that name.
max_length = max_length(dataset)
max_length

In [None]:
def data_generator(descriptions, features, tokenizer, max_length):
    """Endlessly yield one training batch per image.

    Args:
        descriptions: dict mapping an image key to its list of captions.
        features: dict mapping the same keys to the stored feature arrays.
        tokenizer: fitted tokenizer used to encode the captions.
        max_length: length every input sequence is padded to.

    Yields:
        ``((image_features, padded_prefixes), next_word_targets)`` for all
        captions of one image, as produced by ``create_sequences``.
    """
    while True:
        for key, description_list in descriptions.items():
            # Stored features keep the batch axis added before predict(); take the single row.
            feature = features[key][0]
            input_image, input_sequence, output_word = create_sequences(
                tokenizer, max_length, description_list, feature)
            # The two model inputs are grouped in a tuple rather than a list:
            # newer Keras generator adapters reject list structures when building
            # the tf.data signature, while tuples are accepted across versions.
            yield (input_image, input_sequence), output_word
            

def create_sequences(tokenizer, max_length, desc_list, feature, vocab_size=None):
    """Expand the captions of one image into (image, prefix) -> next-word samples.

    Every caption is encoded with ``tokenizer`` and split into all of its
    prefixes; each prefix is padded to ``max_length`` and paired with the
    one-hot encoding of the word that follows it.

    Args:
        tokenizer: fitted tokenizer providing ``texts_to_sequences`` and ``word_index``.
        max_length: length the prefixes are padded to.
        desc_list: list of caption strings belonging to one image.
        feature: feature vector of that image, repeated for every sample.
        vocab_size: number of classes for the one-hot targets. Defaults to
            ``len(tokenizer.word_index) + 1`` so the function no longer relies
            on a module-level ``total_words`` variable.

    Returns:
        Tuple ``(X1, X2, y)`` of numpy arrays: image features, padded prefixes
        and one-hot next-word targets.
    """
    if vocab_size is None:
        vocab_size = len(tokenizer.word_index) + 1

    X1, X2, y = list(), list(), list()

    for desc in desc_list:
        seq = tokenizer.texts_to_sequences([desc])[0]

        # Prefix seq[:i] predicts token seq[i]; start at 1 so the prefix is never empty.
        for i in range(1, len(seq)):
            in_seq, out_seq = seq[:i], seq[i]
            in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
            out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]

            X1.append(feature)
            X2.append(in_seq)
            y.append(out_seq)

    return np.array(X1), np.array(X2), np.array(y)

#### Model Architecture

In [None]:
def define_model(total_words, max_length):
    """Build and compile the caption model.

    The model has two inputs: a 4096-dimensional image feature vector and a
    word-index sequence of length ``max_length``. Both are projected to 256
    units, summed, and decoded into a softmax over ``total_words`` classes.

    Args:
        total_words: vocabulary size (embedding input dim and output classes).
        max_length: length of the padded input sequences.

    Returns:
        The compiled Keras ``Model``.
    """
    # Image feature branch.
    inputs1 = Input(shape=(4096,))
    fe1 = Dropout(0.5)(inputs1)
    fe2 = Dense(256, activation='relu')(fe1)

    # Text branch; mask_zero=True makes the padding index 0 be ignored downstream.
    inputs2 = Input(shape=(max_length,))
    se1 = Embedding(total_words, 256, mask_zero=True)(inputs2)
    se2 = Dropout(0.5)(se1)
    se3 = LSTM(256)(se2)

    # Merge both branches and predict the next word.
    decoder1 = add([fe2, se3])
    decoder2 = Dense(256, activation='relu')(decoder1)
    outputs = Dense(total_words, activation='softmax')(decoder2)

    model = Model(inputs=[inputs1, inputs2], outputs=outputs)
    model.compile(loss='categorical_crossentropy', optimizer='adam')

    # summary() prints by itself and returns None; wrapping it in print()
    # emitted a stray "None" line after the table.
    model.summary()

    return model

In [None]:
# Summary of what goes into training.
print('Dataset: ', len(dataset))
print('Descriptions: train=', len(dataset))
print('Photos: train=', len(image_features))
print('Vocabulary Size:', total_words)
print('Description Length: ', max_length)

# The generator yields one batch per image, so one pass over the data is len(dataset) steps.
epochs = 1
steps = len(dataset)
model = define_model(total_words, max_length)

In [None]:
# Train one epoch at a time with a fresh generator and save a checkpoint after each epoch.
for epoch_index in range(epochs):
    generator = data_generator(dataset, image_features, tokenizer, max_length)
    model.fit(generator, epochs=1, steps_per_epoch=steps, verbose=1)
    checkpoint_name = "caption{}.h5".format(epoch_index)
    model.save(checkpoint_name)

In [None]:
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt


# Validation images used for the inference demo below.
_val_dir = "../input/cocods/val2017/val2017/"
img_paths = [
    _val_dir + file_name
    for file_name in (
        "000000001761.jpg",
        "000000022396.jpg",
        "000000098520.jpg",
        "000000101762.jpg",
        "000000224051.jpg",
    )
]

def extract_features(filename, model):
    """Load an image file and return the features ``model`` predicts for it.

    The image is converted to 3-channel RGB, resized to 224x224, given a
    leading batch axis and scaled by 1/255 before being passed to
    ``model.predict``.

    Args:
        filename: path of the image file to open.
        model: object exposing ``predict``; called with a (1, 224, 224, 3) array.

    Returns:
        Whatever ``model.predict`` returns for the prepared image.

    Raises:
        OSError: if the file cannot be opened or decoded. The previous
            version printed a message and then failed on the unbound
            ``image`` variable.
    """
    try:
        # convert("RGB") covers grayscale, palette and RGBA inputs alike, so the
        # array below always has exactly three channels; `with` closes the file.
        with Image.open(filename) as source:
            image = source.convert("RGB").resize((224, 224))
    except OSError as err:
        raise OSError("ERROR: Couldn't open image! ({})".format(filename)) from err

    image = np.expand_dims(np.array(image), axis=0)
    image = image/255.0
    feature = model.predict(image)
    return feature

def word_for_id(integer, tokenizer):
    """Return the word that ``tokenizer`` maps to index ``integer``.

    Args:
        integer: word index to look up.
        tokenizer: object with a ``word_index`` dict (word -> index) and,
            optionally, an ``index_word`` dict (index -> word).

    Returns:
        The matching word, or None when no word has that index.
    """
    # Prefer the tokenizer's reverse map when it is populated: a dict lookup
    # instead of scanning the whole vocabulary for every generated word.
    index_word = getattr(tokenizer, 'index_word', None)
    if index_word:
        return index_word.get(integer)

    # Fallback for tokenizer-like objects that only expose word_index.
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None


def generate_desc(model, tokenizer, photo, max_length):
    """Generate a caption for ``photo`` by repeatedly picking the most likely next word.

    Starting from the text 'start', the current text is encoded and padded to
    ``max_length``, the model predicts the next word, and the word with the
    highest score is appended. Generation stops after ``max_length`` words,
    when the predicted index has no word, or once 'end' has been appended.

    Args:
        model: caption model; ``predict`` is called with ``[photo, sequence]``.
        tokenizer: fitted tokenizer used for encoding and index lookup.
        photo: image features passed as the first model input.
        max_length: padding length and maximum number of generated words.

    Returns:
        The generated text, including the leading 'start' marker.
    """
    caption = 'start'
    for _ in range(max_length):
        encoded = tokenizer.texts_to_sequences([caption])[0]
        padded = pad_sequences([encoded], maxlen=max_length)
        scores = model.predict([photo, padded], verbose=0)
        next_word = word_for_id(np.argmax(scores), tokenizer)

        # No word for the predicted index: stop without appending anything.
        if next_word is None:
            break

        caption = caption + ' ' + next_word
        if next_word == 'end':
            break
    return caption

In [None]:
# Reload the checkpoint written by the training loop for epoch index 0.
pred_model = load_model('/kaggle/working/caption0.h5')

In [None]:
# Caption one validation image with the trained model.
photo = extract_features(img_paths[3], image_model_transfer)
img = Image.open(img_paths[3])
# Use the same sequence length the model was built and trained with, rather than
# a hard-coded 46 that silently goes stale when the caption data changes.
description = generate_desc(pred_model, tokenizer, photo, max_length)
print("\n\n")
print(description)
plt.imshow(img)