In [None]:
import pandas as pd
import urllib.request
import numpy as np
import matplotlib.pyplot as plt
import keras
import re
import nltk
from nltk.corpus import stopwords
import string
import json
from time import time
import pickle
from keras.applications.vgg16 import VGG16
from keras.applications.resnet50 import ResNet50, preprocess_input, decode_predictions
from keras.preprocessing import image
from keras.models import Model, load_model
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Input, Dense, Dropout, Embedding, LSTM
from keras.layers import add
import zipfile
import os

#Downloading dataset

In [None]:
def download_flickr8k():
    # URLs for the dataset
    image_url = "https://github.com/jbrownlee/Datasets/releases/download/Flickr8k/Flickr8k_Dataset.zip"
    text_url = "https://github.com/jbrownlee/Datasets/releases/download/Flickr8k/Flickr8k_text.zip"

    # Create a directory for the dataset
    os.makedirs("flickr8k", exist_ok=True)

    # Download and extract images
    print("Downloading Flickr8k images...")
    urllib.request.urlretrieve(image_url, "flickr8k/images.zip")
    with zipfile.ZipFile("flickr8k/images.zip", 'r') as zip_ref:
        zip_ref.extractall("flickr8k")
    os.remove("flickr8k/images.zip")

    # Download and extract text
    print("Downloading Flickr8k text data...")
    urllib.request.urlretrieve(text_url, "flickr8k/text.zip")
    with zipfile.ZipFile("flickr8k/text.zip", 'r') as zip_ref:
        zip_ref.extractall("flickr8k")
    os.remove("flickr8k/text.zip")

    print("Flickr8k dataset downloaded and extracted.")
download_flickr8k()

Downloading Flickr8k images...
Downloading Flickr8k text data...
Flickr8k dataset downloaded and extracted.


In [None]:
import os

# Check if the 'flickr8k' directory exists
if os.path.exists('flickr8k'):
    # List the contents of the 'flickr8k' directory
    print("Contents of the 'flickr8k' directory:")

else:
    print("The 'flickr8k' directory does not exist.")


Contents of the 'flickr8k' directory:


#Loading Descriptions

In [None]:
def load_descriptions(file_path):
    with open(file_path, 'r') as file:
        text = file.read()

    descriptions = {}
    for line in text.split('\n'):
        tokens = line.split()
        if len(line) < 2:
            continue
        image_id, image_desc = tokens[0], tokens[1:]
        image_id = image_id.split('.')[0]
        image_desc = ' '.join(image_desc)
        if image_id not in descriptions:
            descriptions[image_id] = []
        descriptions[image_id].append(image_desc)
    return descriptions

# Load descriptions from the token file
file_path = 'flickr8k/Flickr8k.token.txt'
descriptions = load_descriptions(file_path)

# Display a few descriptions to verify
for image_id, desc_list in list(descriptions.items())[:5]:
    print(f"{image_id}: {desc_list}")


1000268201_693b08cb0e: ['A child in a pink dress is climbing up a set of stairs in an entry way .', 'A girl going into a wooden building .', 'A little girl climbing into a wooden playhouse .', 'A little girl climbing the stairs to her playhouse .', 'A little girl in a pink dress going into a wooden cabin .']
1001773457_577c3a7d70: ['A black dog and a spotted dog are fighting', 'A black dog and a tri-colored dog playing with each other on the road .', 'A black dog and a white dog with brown spots are staring at each other in the street .', 'Two dogs of different breeds looking at each other on the road .', 'Two dogs on pavement moving toward each other .']
1002674143_1b742ab4b8: ['A little girl covered in paint sits in front of a painted rainbow with her hands in a bowl .', 'A little girl is sitting in front of a large painted rainbow .', 'A small girl in the grass plays with fingerpaints in front of a white canvas with a rainbow on it .', 'There is a girl with pigtails sitting in front

#Cleaning the text

1. lower each word
2. remove puntuations
3. remove words less than length 1


In [None]:
import re

def clean_text(sample):
    sample = sample.lower()
    sample = re.sub("[^a-z]+", " ", sample)
    sample = sample.split()
    sample = [s for s in sample if len(s) > 1]
    sample = " ".join(sample)
    return sample

def clean_descriptions(descriptions):
    cleaned_descriptions = {}

    for image_id, captions in descriptions.items():
        cleaned_captions = [clean_text(caption) for caption in captions]
        cleaned_descriptions[image_id] = cleaned_captions

    return cleaned_descriptions

# Assuming 'descriptions' is already loaded
cleaned_descriptions = clean_descriptions(descriptions)

# Display a few cleaned descriptions to verify
for image_id, desc_list in list(cleaned_descriptions.items())[:5]:
    print(f"{image_id}: {desc_list}")


1000268201_693b08cb0e: ['child in pink dress is climbing up set of stairs in an entry way', 'girl going into wooden building', 'little girl climbing into wooden playhouse', 'little girl climbing the stairs to her playhouse', 'little girl in pink dress going into wooden cabin']
1001773457_577c3a7d70: ['black dog and spotted dog are fighting', 'black dog and tri colored dog playing with each other on the road', 'black dog and white dog with brown spots are staring at each other in the street', 'two dogs of different breeds looking at each other on the road', 'two dogs on pavement moving toward each other']
1002674143_1b742ab4b8: ['little girl covered in paint sits in front of painted rainbow with her hands in bowl', 'little girl is sitting in front of large painted rainbow', 'small girl in the grass plays with fingerpaints in front of white canvas with rainbow on it', 'there is girl with pigtails sitting in front of rainbow painting', 'young girl with pigtails painting outside in the gra

In [None]:
dataset_dir = '/content/flickr8k'
output_file = os.path.join(dataset_dir, 'cleaned_descriptions.txt')

with open(output_file, "w") as f:
    f.write(str(cleaned_descriptions))

print(f"Cleaned descriptions saved to {output_file}")

# Step 4: Read the cleaned descriptions from the .txt file
with open(output_file, 'r') as f:
    descriptions = f.read()

# Convert the read descriptions back to a dictionary
json_acceptable_string = descriptions.replace("'", "\"")
descriptions = json.loads(json_acceptable_string)

# Display a few cleaned descriptions to verify
for image_id, desc_list in list(descriptions.items())[:5]:
    print(f"{image_id}: {desc_list}")

Cleaned descriptions saved to /content/flickr8k/cleaned_descriptions.txt
1000268201_693b08cb0e: ['child in pink dress is climbing up set of stairs in an entry way', 'girl going into wooden building', 'little girl climbing into wooden playhouse', 'little girl climbing the stairs to her playhouse', 'little girl in pink dress going into wooden cabin']
1001773457_577c3a7d70: ['black dog and spotted dog are fighting', 'black dog and tri colored dog playing with each other on the road', 'black dog and white dog with brown spots are staring at each other in the street', 'two dogs of different breeds looking at each other on the road', 'two dogs on pavement moving toward each other']
1002674143_1b742ab4b8: ['little girl covered in paint sits in front of painted rainbow with her hands in bowl', 'little girl is sitting in front of large painted rainbow', 'small girl in the grass plays with fingerpaints in front of white canvas with rainbow on it', 'there is girl with pigtails sitting in front of

In [None]:

vocabulary = set()

for key in descriptions.keys():
    [vocabulary.update(i.split()) for i in descriptions[key]]

print('Vocabulary Size: %d' % len(vocabulary))

Vocabulary Size: 8424


In [None]:
all_vocab = []

for key in cleaned_descriptions.keys():
    [all_vocab.append(i) for des in cleaned_descriptions[key] for i in des.split()]

print('Vocabulary Size: %d' % len(all_vocab))
print(all_vocab[:15])

Vocabulary Size: 373837
['child', 'in', 'pink', 'dress', 'is', 'climbing', 'up', 'set', 'of', 'stairs', 'in', 'an', 'entry', 'way', 'girl']


In [None]:
len(all_vocab)

373837

In [None]:
#  count the frequency of each word, sort them and discard the words having frequency lesser than threshold value

import collections


counter= collections.Counter(all_vocab)

dic_ = dict(counter)

threshelod_value = 5

sorted_dic = sorted(dic_.items(), reverse=True, key = lambda x: x[1])
sorted_dic = [x for x in sorted_dic if x[1]>threshelod_value]
all_vocab = [x[0] for x in sorted_dic]

# **Loading training and testing images**

In [None]:
len(all_vocab)

2638

In [None]:
# TrainImagesFile
f = open("/content/flickr8k/Flickr_8k.trainImages.txt")
train = f.read()
f.close()

In [None]:
train  = [e.split(".")[0] for e in train.split("\n")[:-1]]

In [None]:
# TestImagesFile
f = open("/content/flickr8k/Flickr_8k.testImages.txt")
test = f.read()
f.close()

In [None]:
test  = [e.split(".")[0] for e in test.split("\n")[:-1]]

In [None]:
#ceate train_descriptions dictionary, which will be similar to earlier one, but having only train samples
# add startseq + endseq

train_descriptions = {}

for t in train:
    train_descriptions[t] = []
    for cap in descriptions[t]:
        cap_to_append = "startseq " + cap + " endseq"
        train_descriptions[t].append(cap_to_append)

In [None]:
train_descriptions['1000268201_693b08cb0e']

['startseq child in pink dress is climbing up set of stairs in an entry way endseq',
 'startseq girl going into wooden building endseq',
 'startseq little girl climbing into wooden playhouse endseq',
 'startseq little girl climbing the stairs to her playhouse endseq',
 'startseq little girl in pink dress going into wooden cabin endseq']

#**Data Preprocessing of Images**

In [None]:
model = ResNet50(weights="imagenet", input_shape=(224,224,3))

In [None]:
model.summary()

Model: "resnet50"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 224, 224, 3)]        0         []                            
                                                                                                  
 conv1_pad (ZeroPadding2D)   (None, 230, 230, 3)          0         ['input_1[0][0]']             
                                                                                                  
 conv1_conv (Conv2D)         (None, 112, 112, 64)         9472      ['conv1_pad[0][0]']           
                                                                                                  
 conv1_bn (BatchNormalizati  (None, 112, 112, 64)         256       ['conv1_conv[0][0]']          
 on)                                                                                       

In [None]:
# Create a new model, by removing the last layer (output layer of 1000 classes) from the resnet50
model_new = Model(model.input, model.layers[-2].output)

In [None]:
images = '/content/flickr8k/Flicker8k_Dataset/'

In [None]:
import time

In [None]:

# Load the pre-trained ResNet50 model and remove the top layer
base_model = ResNet50(weights='imagenet')
model_new = Model(inputs=base_model.input, outputs=base_model.layers[-2].output)

# Function to preprocess the image
def preprocess_image(img_path):
    img = image.load_img(img_path, target_size=(224, 224))
    img = image.img_to_array(img)
    img = np.expand_dims(img, axis=0)
    img = preprocess_input(img)
    return img

# Function to encode image
def encode_image(img_path):
    img = preprocess_image(img_path)
    feature_vector = model_new.predict(img)
    feature_vector = feature_vector.reshape(feature_vector.shape[1],)
    return feature_vector

# Path to the file containing train image IDs
train_file_path = "/content/flickr8k/Flickr_8k.trainImages.txt"

# Base directory where images are stored
base_image_dir = "/content/flickr8k/Flicker8k_Dataset"

# Read the train image IDs from the file
with open(train_file_path, 'r') as file:
    train = file.read().strip().split('\n')

start = time.time()

encoding_train = {}

for ix, img_id in enumerate(train):
    img_path = os.path.join(base_image_dir, img_id)

    if not os.path.exists(img_path):
        print(f"Image not found: {img_path}")
        continue

    encoding_train[img_id] = encode_image(img_path)

    if ix % 100 == 0:
        print("Encoding image- " + str(ix))

print("Time taken in seconds =", time.time() - start)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Encoding image- 1100
Encoding image- 1200
Encoding image- 1300
Encoding image- 1400
Encoding image- 1500
Encoding image- 1600
Encoding image- 1700
Encoding image- 1800
Encoding image- 1900
Encoding image- 2000
Encoding image- 2100
Encoding image- 2200
Encoding image- 2300
Encoding image- 2400
Encoding image- 2500
Encoding image- 2600
Encoding image- 2700
Encoding image- 2800
Encoding image- 2900
Encoding image- 3000
Encoding image- 3100
Encoding image- 3200
Encoding image- 3300
Encoding image- 3400
Encoding image- 3500
Encoding image- 3600
Encoding image- 3700
Encoding image- 3800
Encoding image- 3900
Encoding image- 4000
Encoding image- 4100
Encoding image- 4200
Encoding image- 4300
Encoding image- 4400
Encoding image- 4500
Encoding image- 4600
Encoding image- 4700
Encoding image- 4800
Encoding image- 4900
Encoding image- 5000
Encoding image- 5100
Encoding image- 5200
Encoding image- 5300
Encoding image- 5400
Encoding im

In [27]:
import pickle

In [28]:
# Save the bottleneck train features to disk
with open("/content/flickr8k/encoded_train_images.pkl", "wb") as encoded_pickle:
    pickle.dump(encoding_train, encoded_pickle)


In [29]:
# Load the pre-trained ResNet50 model and remove the top layer
base_model = ResNet50(weights='imagenet')
model_new = Model(inputs=base_model.input, outputs=base_model.layers[-2].output)


# Path to the file containing test image IDs
test_file_path = "/content/flickr8k/Flickr_8k.testImages.txt"

# Base directory where images are stored
base_image_dir = "/content/flickr8k/Flicker8k_Dataset"

# Read the test image IDs from the file
with open(test_file_path, 'r') as file:
    test = file.read().strip().split('\n')

start = time.time()

encoding_test = {}

for ix, img_id in enumerate(test):
    img_path = os.path.join(base_image_dir, img_id)

    if not os.path.exists(img_path):
        print(f"Image not found: {img_path}")
        continue

    encoding_test[img_id] = encode_image(img_path)

    if ix % 100 == 0:
        print("Encoding image- " + str(ix))

print("Time taken in seconds =", time.time() - start)

# Save the bottleneck test features to disk
with open("/content/flickr8k/encoded_test_images.pkl", "wb") as encoded_pickle:
    pickle.dump(encoding_test, encoded_pickle)

print("Test features saved successfully!")


Encoding image- 0
Encoding image- 100
Encoding image- 200
Encoding image- 300
Encoding image- 400
Encoding image- 500
Encoding image- 600
Encoding image- 700
Encoding image- 800
Encoding image- 900
Time taken in seconds = 481.3423593044281
Test features saved successfully!


In [30]:
 # Load the train images features from disk

with open("/content/flickr8k/encoded_train_images.pkl", "rb") as encoded_pickle:
    encoding_train = pickle.load(encoded_pickle)

In [31]:
# Load the test images features from disk

with open("/content/flickr8k/encoded_test_images.pkl", "rb") as encoded_pickle:
    encoding_test = pickle.load(encoded_pickle)

#**Data Preprocessing - Captions**

In [32]:
"""
word_to_idx is mapping between each unique word in all_vocab to int value
and idx_to_word is vice-versa
"""


ix = 1
word_to_idx = {}
idx_to_word = {}

for e in all_vocab:
    word_to_idx[e] = ix
    idx_to_word[ix] = e
    ix +=1

In [33]:
len(all_vocab)

2638

In [34]:
#  need to add these 2 words as well

word_to_idx['startseq'] = 2639
word_to_idx['endseq'] = 2640

idx_to_word[2639] = 'startseq'
idx_to_word[2640] = 'endseq'

In [35]:
#  vocab_size is total vocabulary len +1 because we will append 0's as well.

vocab_size = len(idx_to_word)+1
print(vocab_size)

2641


In [36]:
all_captions_len = []

for key in train_descriptions.keys():
    for cap in train_descriptions[key]:
        all_captions_len.append(len(cap.split()))

max_len = max(all_captions_len)
print(max_len)

35


Data Preparation using Generator Function

In [37]:
def data_generator(train_descriptions, encoding_train, word_to_idx, max_len, num_photos_per_batch):

    X1, X2, y = [], [], []

    n=0

    while True:

        for key, desc_list in train_descriptions.items():
            n +=1

            photo = encoding_train[key+".jpg"]

            for desc in desc_list:

                seq = [ word_to_idx[word] for word in desc.split() if word in word_to_idx]

                for i in range(1,len(seq)):

                    in_seq = seq[0:i]
                    out_seq = seq[i]

                    in_seq = pad_sequences([in_seq], maxlen=max_len, value=0, padding='post')[0]

                    out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]

                    X1.append(photo)
                    X2.append(in_seq)
                    y.append(out_seq)
            if n==num_photos_per_batch:
                yield [[np.array(X1), np.array(X2)], np.array(y)]
                X1, X2, y = [], [], []
                n=0


Word Embedding

In [38]:
# Download the GloVe embeddings
!wget http://nlp.stanford.edu/data/glove.6B.zip

# Create a directory to store the embeddings
!mkdir ./GloVE

# Unzip the embeddings into the created directory
!unzip glove.6B.zip -d ./GloVE


--2024-07-30 15:23:46--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2024-07-30 15:23:46--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2024-07-30 15:23:47--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip.1’


2

In [39]:
f = open("./GloVE/glove.6B.50d.txt", encoding='utf8')

In [40]:
embedding_index = {}

for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype="float")

    embedding_index[word] = coefs

f.close()

In [41]:
def get_embedding_output():

    emb_dim = 50
    embedding_output = np.zeros((vocab_size,emb_dim))

    for word, idx in word_to_idx.items():
        embedding_vector = embedding_index.get(word)

        if embedding_vector is not None:
            embedding_output[idx] = embedding_vector

    return embedding_output


embedding_output = get_embedding_output()

In [42]:
embedding_output.shape

(2641, 50)

Model Architecture

In [43]:
# image feature extractor model

input_img_fea = Input(shape=(2048,))
inp_img1 = Dropout(0.5)(input_img_fea)
inp_img2 = Dense(256, activation='relu')(inp_img1)

In [44]:
# partial caption sequence model

input_cap = Input(shape=(max_len,))
inp_cap1 = Embedding(input_dim=vocab_size, output_dim=50, mask_zero=True)(input_cap)
inp_cap2 = Dropout(0.5)(inp_cap1)
inp_cap3 = LSTM(256)(inp_cap2)

In [45]:
decoder1 = add([inp_img2 , inp_cap3])
decoder2 = Dense(256, activation='relu')(decoder1)
outputs = Dense(vocab_size, activation='softmax')(decoder2)

# Merge 2 networks
model = Model(inputs=[input_img_fea, input_cap], outputs=outputs)

In [46]:
model.summary()

Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_5 (InputLayer)        [(None, 35)]                 0         []                            
                                                                                                  
 input_4 (InputLayer)        [(None, 2048)]               0         []                            
                                                                                                  
 embedding (Embedding)       (None, 35, 50)               132050    ['input_5[0][0]']             
                                                                                                  
 dropout (Dropout)           (None, 2048)                 0         ['input_4[0][0]']             
                                                                                            

In [47]:
model.layers[2].set_weights([embedding_output])
model.layers[2].trainable = False

In [48]:
model.compile(loss="categorical_crossentropy", optimizer="adam")

In [49]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

In [50]:
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
model_checkpoint = ModelCheckpoint('best_model.h5', save_best_only=True, monitor='val_loss')

In [51]:
number_pics_per_batch = 64  # Increased batch size for large dataset

In [52]:
steps = len(train_descriptions)//number_pics_per_batch

Train Our Model

In [53]:
import os

In [None]:
epochs = 30
os.makedirs('/content/flickr8k/model_weights/', exist_ok=True)
for i in range(epochs):
    generator = data_generator(train_descriptions, encoding_train, word_to_idx, max_len, number_pics_per_batch)
    history = model.fit_generator(generator,
                                  epochs=1,
                                  steps_per_epoch=steps,
                                  callbacks=[early_stopping, model_checkpoint])
    model.save('/content/flickr8k/model_weights/model_' + str(i) + '.h5')


  history = model.fit_generator(generator,








  saving_api.save_model(


