In [None]:
#  importing modules 
import os
import pickle 
import numpy as np 
import nltk 
import tensorflow as tf
from tqdm.notebook import tqdm
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.preprocessing.image import load_img , img_to_array
from tensorflow.keras.preprocessing.text import Tokenizer 
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical, plot_model
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout, add
from PIL import Image
from numpy import asarray


## How to download the dataset
1) First, create an account on Kaggle.
2) Log in to your account.
3) Click the three dots shown in the window, then click "Create an API". This downloads a `kaggle.json` file.
4) Create a folder named `Kaggle` in your Google Drive and upload that `kaggle.json` file into it.
5) Type the name of the dataset into the search box of your Kaggle account.
6) Click the three dots on that dataset and copy its API command.
7) Run the copied command in a notebook cell, prefixed with `!`: "!(paste copied api)"

In [None]:
# Mount Google Drive at /content/drive; the later cells read and write
# under /content/drive/MyDrive/Kaggle.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Mounts Google Drive a second time, at /gdrive.
# NOTE(review): no visible cell references /gdrive — this mount looks redundant; confirm before removing.
from google.colab import drive 
drive.mount('/gdrive')

In [None]:
# Point KAGGLE_CONFIG_DIR at the Drive folder that holds kaggle.json
# (presumably where the kaggle CLI looks for credentials — confirm against the Kaggle API docs).
import os 
os.environ['KAGGLE_CONFIG_DIR'] = '/content/drive/MyDrive/Kaggle'


In [None]:
# Change the working directory to the Kaggle folder on Drive; commands run after this use it as their current directory.
%cd '/content/drive/MyDrive/Kaggle'

In [None]:
# Download the adityajn105/flickr8k dataset with the kaggle CLI (needs the kaggle.json credentials set up above).
!kaggle datasets download -d adityajn105/flickr8k

In [None]:
 # load VGG16 model
# Build the complete VGG16 network first, then expose its second-to-last
# layer as the output so the final prediction layer is left out.
vgg_full = VGG16()
model_1 = Model(inputs=vgg_full.inputs, outputs=vgg_full.layers[-2].output)
model_1.summary()

In [None]:
from zipfile import ZipFile

# Path of the downloaded dataset archive on Drive
file_name = "/content/drive/MyDrive/Kaggle/Flickr8k_image_caption_generator/flickr8k.zip"

# Open the archive in READ mode. The handle is called `archive` rather than
# `zip` so that the built-in zip() is not shadowed for the rest of the notebook.
with ZipFile(file_name, 'r') as archive:
    # print a listing of the archive contents
    archive.printdir()

    # extract every member into the current working directory
    archive.extractall()

In [None]:
# Extract one feature array per image in the dataset folder
features = {}
directory = '/content/drive/MyDrive/Kaggle/Images'
for image_file in tqdm(os.listdir(directory)):
  # load the image resized to 224x224 and convert it to a numpy array
  pixels = img_to_array(load_img(f"{directory}/{image_file}", target_size=(224, 224)))
  # prepend a batch axis of size 1, then apply the VGG preprocessing
  batch = preprocess_input(pixels.reshape((1,) + pixels.shape))
  # key: file name up to its first dot; value: model_1 output for this image
  features[image_file.split('.')[0]] = model_1.predict(batch, verbose=0)



In [None]:
# Spot-check: display the stored feature for one image ID
features['973827791_467d83986e']

In [None]:
# Store the features in a pickle file. The with-block closes the file handle,
# which guarantees the data is flushed to Drive (the bare open() was never closed).
with open(os.path.join('/content/drive/MyDrive/Kaggle','features.pkl'),'wb') as f:
  pickle.dump(features, f)

In [None]:
# Load the features back from the pickle file written by the previous cell.
# NOTE: pickle.load can execute arbitrary code — only load a features.pkl that this notebook produced.
with open(os.path.join('/content/drive/MyDrive/Kaggle','features.pkl'),'rb') as f:
  features = pickle.load(f)

In [None]:
# Show how many images have features instead of dumping every array into the cell output
len(features)

In [None]:
# Read the captions file into one string, discarding its first line
with open('/content/drive/MyDrive/Kaggle/captions.txt', 'r') as captions_file:
  next(captions_file)
  captions_doc = captions_file.read()


In [None]:
# Preview the first 500 characters instead of echoing the whole captions file
captions_doc[:500]

In [None]:
# Build the mapping: image ID -> list of its captions
mapping = {}
for row in tqdm(captions_doc.split('\n')):
  # ignore lines shorter than two characters (e.g. blank lines)
  if len(row) < 2:
    continue
  # the first comma-separated field is the file name, everything after it is the caption
  file_field, *caption_fields = row.split(',')
  # key: file name up to its first dot; caption fields are re-joined with single spaces
  mapping.setdefault(file_field.split('.')[0], []).append(" ".join(caption_fields))

In [None]:
# Preview the first three entries instead of dumping the whole mapping into the cell output
dict(list(mapping.items())[:3])

In [None]:
# Cleaning 
def clean(mapping):
  """Normalize every caption in ``mapping`` in place.

  Each caption is lower-cased, stripped of every character that is not an
  ASCII letter or whitespace, has runs of whitespace collapsed, loses its
  one-character words, and is wrapped in 'startseq' / 'endseq' markers.

  Args:
    mapping: dict whose values are lists of caption strings; the lists are
      modified in place.

  Returns:
    None.
  """
  # local import keeps this cell self-contained (re is not imported at the top)
  import re
  for captions in mapping.values():
    for i, caption in enumerate(captions):
      caption = caption.lower()
      # str.replace matches its pattern literally, so the regex patterns used
      # before never matched anything; re.sub applies them as regular expressions
      caption = re.sub(r'[^a-z\s]', '', caption)  # removing special characters, digits etc
      caption = re.sub(r'\s+', ' ', caption)  # collapsing additional spaces
      words = [word for word in caption.split() if len(word) > 1]
      # adding the start and end markers to the caption
      captions[i] = 'startseq ' + " ".join(words) + ' endseq'




In [None]:
# Captions of one sample image before the text is preprocessed
mapping['1000268201_693b08cb0e']

In [None]:
# Preprocess the text; clean() rewrites the caption lists inside mapping in place
clean(mapping)

In [None]:
# Captions of the same sample image after the text is preprocessed
mapping['1000268201_693b08cb0e']

In [None]:
# Flatten the per-image caption lists into a single list
all_captions = [caption for captions in mapping.values() for caption in captions]
# show a short preview instead of printing every caption
all_captions[:5]

In [None]:
# Total number of captions across all images
len(all_captions)

In [None]:
# Tokenizing the text
tokenizer = Tokenizer()
# Fit exactly once: fit_on_texts returns None (so printing it shows nothing useful)
# and a second call would pass over the whole corpus again.
tokenizer.fit_on_texts(all_captions)
# +1 because the word indices start at 1 (index 0 is reserved — see the Keras Tokenizer docs)
vocab_size = len(tokenizer.word_index) + 1

In [None]:
# Number of distinct words the tokenizer has indexed
print(len(tokenizer.word_index))

In [None]:
# Vocabulary size: the word-index size plus one
vocab_size

In [None]:
# Longest caption, measured in whitespace-separated words.
# The later cells use this as the padded input length of the sequence model.
max_length =  max(len(caption.split()) for caption in all_captions)
max_length

## Train test split

In [None]:
# First 90% of the image IDs (in mapping order, no shuffling) go to train, the rest to test
image_ids = list(mapping)
split = int(0.90 * len(image_ids))
print(split)
train, test = image_ids[:split], image_ids[split:]

In [None]:
# Data generator that produces the training data batch by batch (avoids session crashing)
def data_generator(data_keys,mapping,features,tokenizer,max_length,vocab_size,batch_size):
  """Endlessly yield training batches built from ``batch_size`` images each.

  For every caption of every image, each prefix of the encoded caption becomes
  one sample: the padded prefix is the text input and the one-hot encoding of
  the following token is the target.

  Args:
    data_keys: iterable of image IDs to cycle over.
    mapping: dict of image ID -> list of caption strings.
    features: dict of image ID -> feature array; row 0 is used per sample.
    tokenizer: object providing ``texts_to_sequences``.
    max_length: length the input prefixes are padded to.
    vocab_size: number of classes of the one-hot targets.
    batch_size: number of images (not samples) gathered per yielded batch.

  Yields:
    ``[image_features, padded_prefixes], targets`` as numpy arrays.
  """
  image_rows, prefix_rows, target_rows = [], [], []
  images_seen = 0
  while True:
    for image_key in data_keys:
      images_seen += 1
      for text in mapping[image_key]:
        # integer-encode the caption
        encoded = tokenizer.texts_to_sequences([text])[0]
        # one sample per split point: tokens before it -> token at it
        for cut in range(1, len(encoded)):
          prefix = pad_sequences([encoded[:cut]], maxlen=max_length)[0]
          target = to_categorical([encoded[cut]], num_classes=vocab_size)[0]
          image_rows.append(features[image_key][0])
          prefix_rows.append(prefix)
          target_rows.append(target)
      # keep accumulating until exactly batch_size images were processed
      if images_seen != batch_size:
        continue
      yield [np.array(image_rows), np.array(prefix_rows)], np.array(target_rows)
      # start the next batch from scratch
      image_rows, prefix_rows, target_rows = [], [], []
      images_seen = 0
        


  

## Creating model

In [None]:
# Encoder: two input branches, one for image features and one for the caption prefix.
# Image feature branch: dropout, then projection to 256 units.
# NOTE(review): 4096 is presumably the width of the model_1 feature vectors — confirm.
inputs1 = Input(shape =(4096,))
fe1 = Dropout(0.4)(inputs1)
fe2 = Dense(256,activation = 'relu')(fe1)
# Sequence branch: embedding -> dropout -> LSTM, also ending in 256 units.
# Its input length is max_length, the padded prefix length.
inputs2 = Input(shape=(max_length,))
se1 = Embedding(vocab_size,256,mask_zero = True)(inputs2)
se2 = Dropout(0.4)(se1)
se3 = LSTM(256)(se2)

# Decoder: element-wise sum of both 256-unit branches, a dense layer,
# then a softmax over the vocabulary.
decoder1 = add([fe2,se3])
decoder2 = Dense(256, activation = 'relu')(decoder1)
outputs = Dense(vocab_size,activation = 'softmax')(decoder2)

# Two-input model; categorical cross-entropy matches the one-hot targets from data_generator.
model_2 = Model(inputs =[inputs1,inputs2], outputs = outputs)
model_2.compile(loss = 'categorical_crossentropy', optimizer = 'adam')

# plot the model 
plot_model(model_2,show_shapes=True)

In [None]:
# Training the model: the outer loop runs `epochs` single-epoch fits.
epochs = 20
batch_size = 32 
# number of whole batches of `batch_size` images in the training split
steps = len(train) // batch_size
for i in range(epochs):
  # a fresh generator is created for each epoch
  generator = data_generator(train,mapping,features,tokenizer,max_length,vocab_size,batch_size)
  # fit for one epoch
  model_2.fit(generator, epochs = 1 , steps_per_epoch = steps,verbose=1)

In [None]:
# Save the trained model to Drive as an HDF5 (.h5) file.
# Change the path below to the directory where you want to keep the model.
model_2.save('/content/drive/MyDrive/Kaggle/Flickr8k_image_caption_generator/Img_cap_gen.h5')
# mapping[key]
# print(X1)

## Generate Captions for the image 


In [None]:
import h5py    
import numpy as np    
# Reload the saved model from Drive into model_3.
# NOTE(review): the evaluation and visualization cells below still pass model_2, not model_3 — confirm which is intended.
model_3 = tf.keras.models.load_model('/content/drive/MyDrive/Kaggle/Flickr8k_image_caption_generator/Img_cap_gen.h5')

In [None]:
def index_to_word(integer, tokenizer):
  """Return the first word in ``tokenizer.word_index`` mapped to ``integer``.

  Args:
    integer: the index to look up.
    tokenizer: object exposing a ``word_index`` dict of word -> index.

  Returns:
    The matching word, or None when no word has that index.
  """
  matches = (word for word, index in tokenizer.word_index.items() if index == integer)
  return next(matches, None)


In [None]:
# for i in range(max_length):

#   in_text = 'startseq'
#     #  encode input Sequence
#   sequence  = tokenizer.texts_to_sequences([in_text])[0]
#     #  pad the sequence 
#   print(sequence)
#   sequence = pad_sequences([sequence],max_length)
#     # preidict next word
#   yhat = model_2.predict([image,sequence],verbose =0)
#     # get index with high probability 
#   yhat =  np.argmax(yhat)
#     #convert index to word
#   word = index_to_word(yhat,tokenizer)
#   print(word)

In [None]:
# Generate a caption for one image, one word at a time
def predict_caption(model,image,tokenizer, max_length):
  """Greedily generate a caption for ``image``.

  Starting from 'startseq', the text so far is encoded and padded, the model
  is asked for the next-word distribution, and the highest-scoring word is
  appended. Generation stops after ``max_length`` words, when the predicted
  index has no word, or once 'endseq' has been appended.

  Args:
    model: object whose ``predict([image, sequence], verbose=0)`` returns scores per word index.
    image: image feature input passed to the model unchanged.
    tokenizer: object providing ``texts_to_sequences`` and ``word_index``.
    max_length: padded sequence length and maximum number of generated words.

  Returns:
    The generated caption string, beginning with 'startseq'.
  """
  caption_words = ['startseq']
  for _ in range(max_length):
    # encode and pad the caption generated so far
    encoded = tokenizer.texts_to_sequences([" ".join(caption_words)])[0]
    padded = pad_sequences([encoded], max_length)
    # pick the index with the highest predicted score and map it back to a word
    scores = model.predict([image, padded], verbose=0)
    next_word = index_to_word(np.argmax(scores), tokenizer)
    # stop if the index has no word
    if next_word is None:
      break
    caption_words.append(next_word)
    # stop once the end tag has been produced
    if next_word == 'endseq':
      break
  return " ".join(caption_words)
# y_pred = predict_caption(model_2, features[key], tokenizer, max_length)
# print(y_pred)

In [None]:
from nltk.translate.bleu_score import corpus_bleu
# Validate with the test data: collect references and predictions per image
actual, predicted = [], []
for image_key in tqdm(test):
  # generated caption for this image, split into words
  hypothesis = predict_caption(model_2, features[image_key], tokenizer, max_length).split()
  # every reference caption of this image, split into words
  references = [reference.split() for reference in mapping[image_key]]
  actual.append(references)
  predicted.append(hypothesis)
# Corpus-level BLEU: unigram-only weights, then equal unigram/bigram weights
bleu_1 = corpus_bleu(actual,predicted, weights=(1,0,0,0,0))
print('BLEU-1:%f' % bleu_1)
bleu_2 = corpus_bleu(actual,predicted, weights=(0.5,0.5,0,0,0))
print('BLEU-2:%f' % bleu_2)


## Visualize the result 

In [None]:
import imageio as iio
from PIL import Image
import matplotlib.pyplot as plt
def generate_caption(image_name):
    """Print the actual and predicted captions of an image and display it.

    Args:
        image_name: file name of an image inside the Images folder,
            e.g. "1001773457_577c3a7d70.jpg".

    Returns:
        None. The captions are printed and the image is drawn with matplotlib.
    """
    # the image ID is the file name up to its first dot
    image_id = image_name.split('.')[0]
    image = Image.open(os.path.join('/content/drive/MyDrive/Kaggle/Images', image_name))

    # reference captions stored for this image
    actual_captions = mapping[image_id]
    print('---------------------Actual---------------------')
    for actual_caption in actual_captions:
        print(actual_caption)

    # caption generated by the model from the stored image feature
    generated = predict_caption(model_2, features[image_id], tokenizer, max_length)
    print('--------------------Predicted--------------------')
    print(generated)
    plt.imshow(image)


In [None]:
# Print the stored feature of the sample image used in the next cell
print(features['1001773457_577c3a7d70'])

In [None]:
# Show the actual captions, the predicted caption and the picture for one sample image
generate_caption("1001773457_577c3a7d70.jpg")