# Final Submission

#### loading the libraries

In [None]:

# linear algebra
from sklearn.model_selection import train_test_split
import numpy as np  
# data processing, CSV file I / O (e.g. pd.read_csv)
import pandas as pd  
import os
import tensorflow as tf
from tensorflow.keras.applications.vgg19 import VGG19
from keras.preprocessing.text import Tokenizer
from keras.models import Model
from keras.layers import Flatten, Dense, LSTM, Dropout, Embedding, Activation
from keras.layers import concatenate, BatchNormalization, Input
from keras.utils import to_categorical, plot_model
from keras.applications.inception_v3 import InceptionV3, preprocess_input
import matplotlib.pyplot as plt  # for plotting data
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
np.random.seed(0)
tokenizer = RegexpTokenizer(r'\w+')

In [None]:
def load_description(text):
    mapping = dict()
    for line in text.split("\n"):
        token = line.split(",")
        if len(line) < 2:   # remove short descriptions
            continue
        img_id = token[0].split('.')[0] # name of the image
        img_des = token[1]              # description of the image
        if img_id not in mapping:
            mapping[img_id] = list()
        mapping[img_id].append(img_des)
    return mapping

token_path = f'{os.getcwd()}/captions.txt'
print(token_path)
text = open(token_path, 'r', encoding = 'utf-8').read()
descriptions = load_description(text)
print(descriptions['1000268201_693b08cb0e'])

In [None]:
import re
stopwords = nltk.corpus.stopwords.words('english')
stopwords_lower = [s.lower() for s in stopwords]
def text_preprocessing(str_input): 
     #tokenization, remove punctuation, lemmatization
     words=tokenizer.tokenize(str_input)
     # remove symbols, websites, email addresses 
     words = [re.sub(r'[^A-Za-z@]', ' ', word) for word in words] 
     words = [re.sub(r'\S+com', ' ', word) for word in words]
     words = [re.sub(r'\S+@\S+', ' ', word) for word in words] 
     words = [word for word in words if word!=' ']
     words = [word for word in words if len(word)!=0] 
     #remove stopwords     
     words=[word.lower() for word in words if word.lower() not in stopwords_lower and word.strip() != ""]
     #combine a list into one string   
     string = ' '.join(words)
     return string

In [None]:

def clean_description(desc):
    for key, des_list in desc.items():
        for i in range(len(des_list)):
            caption = des_list[i]
            caption=text_preprocessing(caption)
            des_list[i] = caption
  
clean_description(descriptions)
print(descriptions['1000268201_693b08cb0e'])

In [None]:
def to_vocab(desc):
    words = set()
    for key in desc.keys():
        for line in desc[key]:
            words.update(line.split())
    return words
vocab = to_vocab(descriptions)

In [None]:
import glob
images = f'{os.getcwd()}/images/'
# Create a list of all image names in the directory
img = glob.glob(images + '*.jpg')
# train_path = '/kaggle / input / flickr8k / flickr_data / Flickr_Data / Flickr_TextData / Flickr_8k.trainImages.txt'
# train_images = open(train_path, 'r', encoding = 'utf-8').read().split("\n")
train_img = []  # list of all images in training set
for im in img:
    if(im.split('/')[-1].split('.')[0] in list(descriptions.keys())):
        train_img.append(im.split('/')[-1].split('.')[0])
# load descriptions of training set in a dictionary. Name of the image will act as ey
def load_clean_descriptions(des, dataset):
    dataset_des = dict()
    for key, des_list in des.items():
        if key  in dataset:
            if key not in dataset_des:
                dataset_des[key] = list()
            for line in des_list:
                desc = 'startseq ' + line + ' endseq'
                dataset_des[key].append(desc)
    return dataset_des
  
train_descriptions = load_clean_descriptions(descriptions, train_img)
print(train_descriptions['1000268201_693b08cb0e'])

In [8]:
from tensorflow.keras.utils import load_img , img_to_array
def preprocess_img(img_path):
    # inception v3 excepts img in 299 * 299 * 3
    img = load_img(img_path, target_size = (224, 224))
    x = img_to_array(img)
    # Add one more dimension
    x = np.expand_dims(x, axis = 0)
    x = preprocess_input(x)
    return x
encoding_train=dict()
def encode(image):
    image = preprocess_img(image)
    vec = model.predict(image)
    vec = np.reshape(vec, (vec.shape[1]))
    return vec
  
base_model = VGG19(weights = 'imagenet')
model = Model(base_model.input, base_model.layers[-2].output)
for i in range(len(train_img)):
    print(i)
    encoding_train[train_img[i]] = encode(f'{os.getcwd()}/images/'+train_img[i]+'.jpg')
# run the encode function on all train images and store the feature vectors in a list


87
88
89
90
91
92
93
94
95
96
97
98
99
100
101


KeyboardInterrupt: 

In [9]:
from tensorflow.keras.utils import pad_sequences
# list of all training captions
all_train_captions = []
for key, val in train_descriptions.items():
    for caption in val:
        all_train_captions.append(caption)
  
# consider only words which occur atleast 10 times
vocabulary = vocab
threshold = 10 # you can change this value according to your need
word_counts = {}
for cap in all_train_captions:
    for word in cap.split(' '):
        word_counts[word] = word_counts.get(word, 0) + 1
  
vocab = [word for word in word_counts if word_counts[word] >= threshold]
  
# word mapping to integers
ixtoword = {}
wordtoix = {}
  
ix = 1
for word in vocab:
    wordtoix[word] = ix
    ixtoword[ix] = word
    ix += 1
      
# find the maximum length of a description in a dataset
max_length = max(len(des.split()) for des in all_train_captions)


In [None]:

encoding_train

In [15]:
vocab_size = len(ixtoword)+1
X1, X2, y = list(), list(), list()
for key, des_list in train_descriptions.items():
	if key not in encoding_train:
		continue
	pic = encoding_train[key]
	for cap in des_list:
		seq = [wordtoix[word] for word in cap.split(' ') if word in wordtoix]
		for i in range(1, len(seq)):
			in_seq, out_seq = seq[:i], seq[i]
			in_seq = pad_sequences([in_seq], maxlen = max_length)[0]
			out_seq = to_categorical([out_seq], num_classes = vocab_size)[0]
			# store
			X1.append(pic)
			X2.append(in_seq)
			y.append(out_seq)

X2 = np.array(X2)
X1 = np.array(X1)
print(X1.shape)
y = np.array(y)
df = pd.DataFrame({'a':X1, 'b':X2})
X_train, X_test,y_train, y_test = train_test_split(df,y ,
                                   random_state=104, 
                                   test_size=0.25, 
                                   shuffle=True)
# load glove vectors for embedding layer
embeddings_index = {}
golve_path =f'{os.getcwd()}/glove.6B.200d.txt'
glove = open(golve_path, 'r', encoding = 'utf-8').read()
for line in glove.split("\n"):
	values = line.split(" ")
	word = values[0]
	indices = np.asarray(values[1: ], dtype = 'float32')
	embeddings_index[word] = indices

emb_dim = 200
emb_matrix = np.zeros((vocab_size, emb_dim))
for word, i in wordtoix.items():
	emb_vec = embeddings_index.get(word)
	if emb_vec is not None and emb_vec.shape[0]!=0:
		print(emb_vec)
		emb_matrix[i] = emb_vec
emb_matrix.shape

(3345, 4096)


ValueError: Per-column arrays must each be 1-dimensional

In [11]:
from tensorflow.keras.layers import add
# define the model
ip1 = Input(shape = (4096, ))
fe1 = Dropout(0.2)(ip1)
fe2 = Dense(256, activation = 'relu')(fe1)
ip2 = Input(shape = (max_length, ))
se1 = Embedding(vocab_size, emb_dim, mask_zero = True)(ip2)
se2 = Dropout(0.2)(se1)
se3 = LSTM(256)(se2)
decoder1 = add([fe2, se3])
decoder2 = Dense(256, activation = 'relu')(decoder1)
outputs = Dense(vocab_size, activation = 'softmax')(decoder2)
model = Model(inputs = [ip1, ip2], outputs = outputs)

In [12]:

model.layers[2].set_weights([emb_matrix])
model.layers[2].trainable = False
model.compile(loss = 'categorical_crossentropy', optimizer = 'adam')
model.fit([X_train, X2], y, epochs = 5, batch_size = 256)
# you can increase the number of epochs for better results

ValueError: Data cardinality is ambiguous:
  x sizes: 2508, 3345
  y sizes: 3345
Make sure all arrays contain the same number of samples.

In [None]:
for i in encoding_train:
    pic=i
    image = encoding_train[pic].reshape((1,4096))
    x=plt.imread(f'{os.getcwd()}/images/{pic}.jpg')
    plt.imshow(x)
    plt.show()


    def greedy_search(pic):
        start = 'startseq'
        for i in range(max_length):
            seq = [wordtoix[word] for word in start.split() if word in wordtoix]
            seq = pad_sequences([seq], maxlen = max_length)
            yhat = model.predict([pic, seq])
            yhat = np.argmax(yhat)
            word = ixtoword[yhat]
            start += ' ' + word
            if word == 'endseq':
                break
        final = start.split()
        final = final[1:-1]
        final = ' '.join(final)
        return final

    print("Greedy Search:",greedy_search(np.array(image)))

In [None]:
from nltk.translate.bleu_score import sentence_bleu
reference = [i.split() for i in descriptions[pic]]
candidate = greedy_search(np.array(image)).split()

print('Individual 1-gram: %f' % sentence_bleu(reference, candidate, weights=(1, 0, 0, 0)))
print('Individual 2-gram: %f' % sentence_bleu(reference, candidate, weights=(0, 1, 0, 0)))
print('Individual 3-gram: %f' % sentence_bleu(reference, candidate, weights=(0, 0, 1, 0)))
print('Individual 4-gram: %f' % sentence_bleu(reference, candidate, weights=(0, 0, 0, 1)))