<a href="https://colab.research.google.com/github/DBagrecha/Deep-Learning/blob/main/Image_captions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive; the paths used by later cells live under it.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import string
import os
import glob
from PIL import Image
from time import time

from keras import Input, layers
from keras import optimizers
from tensorflow.keras.optimizers import Adam
from keras.preprocessing import sequence
from keras.preprocessing import image
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import LSTM, Embedding, Dense, Activation, Flatten, Reshape, Dropout
from keras.layers.wrappers import Bidirectional
from keras.layers.merge import add
from keras.applications.inception_v3 import InceptionV3
from keras.applications.inception_v3 import preprocess_input
from keras.models import Model
from tensorflow.keras.utils import to_categorical

In [None]:
# Load the caption table (one row per caption); the file is pipe-delimited.
df=pd.read_csv("/content/drive/MyDrive/flickr30k_images/results.csv",sep = '|')
df.head()

In [None]:
# Two header names carry a leading space; map them to clean column names.
df = df.rename(columns={' comment_number': 'comment_number', ' comment': 'comment'})

In [None]:
df.head()

In [None]:
# Inspect the comment_number values stored for one particular image.
df.loc[df['image_name'] == '1514957266.jpg', 'comment_number']

In [None]:
# Drop the first character (the leading space) of every comment_number value.
df['comment_number'] = [value[1:] for value in df['comment_number']]

In [None]:
# Drop the first character (the leading space) of every caption.
# A missing caption (NaN) becomes an empty string; stringifying it first would
# turn it into 'nan' and then into the bogus one-word caption 'an'.
df['comment'] = ['' if pd.isna(text) else str(text)[1:] for text in df['comment']]

In [None]:
impath='/content/drive/MyDrive/flickr30k_images/images/'

# Show one sample image together with the captions that belong to that same
# image (one name drives both the picture and the caption lookup).
sample_image = '85600252.jpg'
x = plt.imread(impath + sample_image)
plt.imshow(x)
plt.show()
print(df.loc[df['image_name'] == sample_image, 'comment'])

In [None]:
# Every distinct space-separated token that occurs in any caption.
vocab = {token for caption in df['comment'] for token in caption.split(' ')}
print(len(vocab))

In [None]:
# Full path of the image file each caption row refers to.
df['image_path'] = [impath + name for name in df['image_name']]
df.head()

In [None]:
# comment_number is not used from here on; continue without it.
df = df.drop('comment_number', axis=1)

In [None]:
df.head()

In [None]:
# Names of the first 1500 distinct images (in order of first appearance in df);
# later cells encode exactly these images and train on their captions.
images=df['image_name'].unique()[:1500]
images

In [None]:
from sklearn.model_selection import train_test_split
# Fixed seed: y_train decides which words enter the vocabulary, so an unseeded
# split would change the word indices and vocab_size on every run.
X_train, X_test, y_train, y_test = train_test_split(df['image_path'], df['comment'], test_size=0.25, random_state=42)

In [None]:
# Count how often each space-separated token occurs in the training captions
# and keep only the tokens seen at least `repeat` times.
repeat = 10
w = {}
for caption in y_train:
  for token in caption.split(" "):
    w[token] = w.get(token, 0) + 1
new_vocab = [word for word, count in w.items() if count >= repeat]

In [None]:
len(new_vocab)

In [None]:
# Build the word <-> index lookup tables. Indices start at 1 because index 0
# is reserved for sequence padding.
indtoword = {}
wordtoind = {}

# Every caption is wrapped in 'start' ... 'end' markers before training, and
# decoding looks up wordtoind["start"]. The frequency-filtered vocabulary only
# contains those two words if they happen to be common enough in the raw
# captions, so add them explicitly when they are missing.
vocab_words = list(new_vocab) + [marker for marker in ('start', 'end') if marker not in new_vocab]
for ind, word in enumerate(vocab_words, start=1):
    wordtoind[word] = ind
    indtoword[ind] = word

# +1 accounts for the padding index 0.
vocab_size = len(indtoword) + 1

In [None]:
vocab_size

In [None]:
# Longest caption measured in space-separated tokens, plus 2 for the 'start'
# and 'end' markers that a later cell wraps around every caption. Without the
# +2 the longest training prefixes exceed max_length and pad_sequences cuts
# tokens off their front (its default truncating mode is 'pre').
# NOTE: this assumes the cell runs before the markers are added to df['comment'].
max_length = max(len(caption.split(" ")) for caption in df['comment']) + 2

In [None]:
glove_path='/content/drive/MyDrive/flickr30k_images/'

# Parse the GloVe text file into {word: float32 vector}. Each line holds the
# word followed by its coefficients, separated by whitespace.
embeddings_index = {}
# The context manager closes the file even if parsing fails part-way.
with open(os.path.join(glove_path, 'glove.6B.200d.txt'), encoding="utf-8") as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [None]:
# Row i of the matrix is the GloVe vector of the word with index i. Rows stay
# all-zero for the padding index 0 and for words GloVe does not know.
embedding_dim = 200
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in wordtoind.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        # The glove.6B vectors are keyed by lowercase tokens while the captions
        # keep their original casing; retry with the lowercase form so that
        # capitalised words do not end up with an all-zero vector.
        embedding_vector = embeddings_index.get(word.lower())
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [None]:
# InceptionV3 initialised with its ImageNet weights.
# NOTE: the name `model` is rebound to the captioning network in a later cell.
model = InceptionV3(weights='imagenet')

In [None]:
# Feature extractor: same input as InceptionV3, but the output is taken from
# the second-to-last layer, i.e. the final layer is left out.
model_new = Model(model.input, model.layers[-2].output)

In [None]:
def preprocess(image_path):
    """Load an image file as a one-item batch prepared for InceptionV3.

    The image is loaded at 299x299, converted to an array, given a leading
    batch axis and passed through InceptionV3's `preprocess_input`.

    Args:
        image_path: path of the image file to load.

    Returns:
        The preprocessed array with a leading batch axis of size 1.
    """
    pil_img = image.load_img(image_path, target_size=(299, 299))
    batch = np.expand_dims(image.img_to_array(pil_img), axis=0)
    return preprocess_input(batch)

In [None]:
def encode(image):
    """Turn an image file into a flat feature vector using `model_new`.

    Args:
        image: path of the image file (forwarded to `preprocess`).

    Returns:
        1-D array holding the second axis of the prediction, i.e. the
        features of the single image in the batch.
    """
    batch = preprocess(image)
    features = model_new.predict(batch)
    # The prediction has a batch axis of size 1; flatten it away.
    return features.reshape(features.shape[1])

In [None]:
# Encode every training image once; the result is keyed by image file name.
encoding_train = {name: encode(impath + name) for name in images}
train_features = encoding_train

In [None]:
# Names of the last 500 distinct images (in order of first appearance in df);
# the next cell encodes them for the captioning demo.
test_im=df['image_name'].unique()[-500:]

In [None]:
# Encode every test image once; the result is keyed by image file name.
encoding_test = {name: encode(impath + name) for name in test_im}
test_features = encoding_test

In [None]:
# Image branch: 2048-value photo feature vector -> dropout -> 256-unit dense layer.
inputs1 = Input(shape=(2048,))
fe1 = Dropout(0.5)(inputs1)
fe2 = Dense(256, activation='relu')(fe1)

# Text branch: padded word-index sequence of length max_length -> embedding
# (index 0 is masked as padding) -> dropout -> LSTM with 256 units.
inputs2 = Input(shape=(max_length,))
se1 = Embedding(vocab_size, embedding_dim, mask_zero=True)(inputs2)
se2 = Dropout(0.5)(se1)
se3 = LSTM(256)(se2)

# Decoder: the two 256-wide branch outputs are added together, passed through a
# dense layer, and a softmax over vocab_size entries scores the next word.
decoder1 = add([fe2, se3])
decoder2 = Dense(256, activation='relu')(decoder1)
outputs = Dense(vocab_size, activation='softmax')(decoder2)

# NOTE: this rebinds `model`, which previously named the InceptionV3 network.
model = Model(inputs=[inputs1, inputs2], outputs=outputs)
model.summary()

In [None]:
# Load the pretrained vectors into the Embedding layer and freeze it.
# The layer is located by type instead of by a fixed position in model.layers,
# so a change in layer ordering cannot silently target another layer.
embedding_layer = next(layer for layer in model.layers if isinstance(layer, Embedding))
embedding_layer.set_weights([embedding_matrix])
embedding_layer.trainable = False

In [None]:
# The targets produced by data_generator are one-hot vectors (to_categorical),
# which is what categorical cross-entropy expects.
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [None]:
# Wrap every caption in 'start' ... 'end' markers.
# Run this cell once only: every re-run wraps the captions another time.
df['comment'] = ['start ' + caption + ' end' for caption in df['comment']]

In [None]:
# Map each training image name to the list of its captions (in row order).
# The captions are grouped in a single pass over the table instead of
# filtering the whole frame once per image.
captions_by_image = (
    df[df['image_name'].isin(images)]
    .groupby('image_name')['comment']
    .apply(list)
)
# Build the dict by iterating `images` so the key order matches `images`.
train_desc = {name: captions_by_image.loc[name] for name in images}

In [None]:
def data_generator(descriptions, photos, wordtoind, max_length, num_photos_per_batch):
    """Yield training batches forever, one batch per `num_photos_per_batch` photos.

    Each caption is converted to word indices (words missing from `wordtoind`
    are skipped) and expanded into one sample per position: the padded prefix
    seq[:i] is the text input and the one-hot encoding of seq[i] is the target.

    Args:
        descriptions: dict mapping a photo key to a list of caption strings.
        photos: dict mapping the same keys to the photo feature array.
        wordtoind: dict mapping a word to its integer index.
        max_length: length every prefix is padded to.
        num_photos_per_batch: number of photos whose samples form one batch.

    Yields:
        ([photo_features, padded_prefixes], one_hot_targets) as numpy arrays.
        The one-hot width is the notebook-level `vocab_size`.
    """
    photo_rows, prefix_rows, target_rows = [], [], []
    photos_seen = 0
    while True:
        for key, desc_list in descriptions.items():
            photos_seen += 1
            photo = photos[key]
            for desc in desc_list:
                seq = [wordtoind[word] for word in desc.split(' ') if word in wordtoind]
                # One sample per split point: everything before it predicts the word at it.
                for split in range(1, len(seq)):
                    prefix = pad_sequences([seq[:split]], maxlen=max_length)[0]
                    target = to_categorical([seq[split]], num_classes=vocab_size)[0]
                    photo_rows.append(photo)
                    prefix_rows.append(prefix)
                    target_rows.append(target)

            if photos_seen == num_photos_per_batch:
                yield ([np.array(photo_rows), np.array(prefix_rows)], np.array(target_rows))
                # Start collecting the next batch from scratch.
                photo_rows, prefix_rows, target_rows = [], [], []
                photos_seen = 0

In [None]:
epochs = 60
batch_size = 50  # photos per generator batch (passed as num_photos_per_batch), not caption samples
steps = len(train_desc)//batch_size  # number of batches that covers the training photos once

# The generator loops forever, so steps_per_epoch tells fit() where an epoch ends.
generator = data_generator(train_desc, train_features, wordtoind, max_length, batch_size)
model.fit(generator, epochs=epochs, steps_per_epoch=steps, verbose=1)

In [None]:
# Train the same model for 5 more epochs, reusing the existing generator object.
model.fit(generator, epochs=5, steps_per_epoch=steps, verbose=1)

In [None]:
def greedySearch(photo):
    """Generate a caption by always taking the most probable next word.

    Args:
        photo: photo feature array in the form the captioning model expects
            for its image input (this notebook passes shape (1, 2048)).

    Returns:
        The generated words joined by spaces, without the 'start' and 'end'
        markers. Generation stops at 'end' or after max_length steps.
    """
    in_text = 'start'
    words = []
    for _ in range(max_length):
        token_ids = [wordtoind[w] for w in in_text.split() if w in wordtoind]
        padded = pad_sequences([token_ids], maxlen=max_length)
        yhat = model.predict([photo, padded], verbose=0)
        # Index 0 is the padding slot and has no entry in indtoword, so pick
        # the best index among 1..vocab_size-1 only.
        best = int(np.argmax(yhat[0][1:])) + 1
        word = indtoword[best]
        if word == 'end':
            break
        # Collect the words directly: when 'end' is never produced, every
        # generated word is still part of the caption and must be kept.
        words.append(word)
        in_text += ' ' + word

    return ' '.join(words)

In [None]:
def beam_search_predictions(image, beam_index = 3):
    """Generate a caption for one encoded photo with beam search.

    Args:
        image: photo feature array in the form the captioning model expects
            for its image input (this notebook passes shape (1, 2048)).
        beam_index: beam width, i.e. how many partial captions survive each step.

    Returns:
        The words of the best-scoring caption joined by spaces, without the
        leading 'start' marker and cut at the first 'end' marker.

    Raises:
        KeyError: if 'start' is not present in wordtoind.
    """
    end_token = wordtoind.get("end")
    # Each hypothesis is [token ids, cumulative log-probability].
    beams = [[[wordtoind["start"]], 0.0]]

    # A caption starts with one token and gains one per step, so at most
    # max_length - 1 expansion steps fit.
    for _ in range(max_length - 1):
        candidates = []
        for tokens, score in beams:
            if tokens[-1] == end_token:
                # Finished caption: carry it over unchanged instead of
                # predicting further words after 'end'.
                candidates.append([tokens, score])
                continue
            # Default padding mode, the same one used for the training batches.
            par_caps = sequence.pad_sequences([tokens], maxlen=max_length)
            preds = model.predict([image, par_caps], verbose=0)[0]
            # Best next words, excluding index 0 (padding, absent from indtoword).
            top_words = np.argsort(preds[1:])[-beam_index:] + 1
            for w in top_words:
                # Sequence likelihood is a product of word probabilities, so
                # accumulate log-probabilities; the epsilon guards log(0).
                step_score = float(np.log(preds[w] + 1e-12))
                candidates.append([tokens + [int(w)], score + step_score])

        # Keep the beam_index best hypotheses, best one last.
        beams = sorted(candidates, key=lambda cand: cand[1])[-beam_index:]
        if all(tokens[-1] == end_token for tokens, _ in beams):
            break

    best_tokens = beams[-1][0]
    final_caption = []
    # Skip the leading 'start' marker and stop at the first 'end'.
    for idx in best_tokens[1:]:
        word = indtoword[idx]
        if word == 'end':
            break
        final_caption.append(word)

    return ' '.join(final_caption)

In [None]:
pic = '900144365.jpg'
# Keep the feature batch under its own name: binding it to `image` would shadow
# the keras `image` module that preprocess() uses and break any later encode().
photo_features = encoding_test[pic].reshape((1,2048))
x=plt.imread(impath+pic)
plt.imshow(x)
plt.show()

print("Greedy Search:",greedySearch(photo_features))
# Same beam-search call for each beam width.
for beam_width in (3, 5, 7, 10):
    print(f"Beam Search, K = {beam_width}:", beam_search_predictions(photo_features, beam_index=beam_width))