In [None]:
# Mount Google Drive at /content/drive; the dataset folders referenced later in
# this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


#Import Libraries

In [None]:
import os
import numpy as np
import string
from PIL import Image
from pickle import dump, load
import matplotlib.pyplot as plt
import argparse


from keras.applications.xception import Xception, preprocess_input
from keras.models import Model, load_model
from keras.layers import Input, Dense, LSTM, Embedding, Dropout
from keras.utils import to_categorical, plot_model
from tensorflow.keras.layers import add
from keras.preprocessing.image import load_img, img_to_array
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer


#Getting and performing data cleaning

In [None]:
# Loading a text file into memory
def load_text_file(filename):
    """Read a whole text file into memory.

    Args:
        filename: Path of the file to read.

    Returns:
        The complete file contents as a single string.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The context manager closes the handle even when read() raises.
    with open(filename, 'r') as file:
        return file.read()

# Retrieve all images and their captions from a file
def extract_image_captions(filename, fraction=0.5):
    """Map each image to the list of its captions read from a token file.

    Every usable line has the form ``<image id>\\t<caption>``. The image id is
    expected to carry a ``#<index>`` suffix, which is removed so that all
    captions of one image share a key.

    Args:
        filename: Path of the caption file.
        fraction: Share of the file's lines (counted from the top) to process.
            The default of 0.5 keeps the original behaviour of reading only
            the first half of the file.

    Returns:
        dict mapping image id (suffix removed) to a list of caption strings,
        in file order.
    """
    # Load the content of the file and split it into individual lines
    lines = load_text_file(filename).split('\n')
    line_limit = round(len(lines) * fraction)
    image_captions = {}

    for line in lines[:line_limit]:
        # Blank or tab-less lines (e.g. a trailing newline) cannot be unpacked
        # into (image, caption); skip them instead of raising ValueError.
        if '\t' not in line:
            continue
        img, caption = line.split('\t', 1)

        # Drop the "#<index>" suffix regardless of how many digits it has.
        image_id = img.rsplit('#', 1)[0]
        image_captions.setdefault(image_id, []).append(caption)

    return image_captions

# Data cleaning: Convert to lowercase, remove punctuation, and filter out words with numbers
def clean_text_data(captions):
    """Normalise every caption in place and return the same dictionary.

    Each caption has its hyphens turned into spaces, is split on whitespace,
    lower-cased and stripped of ASCII punctuation. Tokens that are a single
    character long or contain anything other than letters are dropped, and the
    remaining tokens are re-joined with single spaces.

    Args:
        captions: dict mapping an image id to a list of caption strings. The
            lists are modified in place.

    Returns:
        The ``captions`` dictionary that was passed in.
    """
    # Deletion table: every character of string.punctuation maps to nothing.
    strip_punctuation = str.maketrans('', '', string.punctuation)

    for caption_list in captions.values():
        for position, raw_caption in enumerate(caption_list):
            kept_tokens = []
            for token in raw_caption.replace("-", " ").split():
                token = token.lower().translate(strip_punctuation)
                # Keep only purely alphabetic tokens longer than one character.
                if len(token) > 1 and token.isalpha():
                    kept_tokens.append(token)
            caption_list[position] = ' '.join(kept_tokens)

    return captions

# Create a vocabulary containing all unique words from descriptions
def build_text_vocabulary(descriptions):
    """Collect the set of distinct whitespace-separated words in all captions.

    Args:
        descriptions: dict mapping an image id to a list of caption strings.

    Returns:
        set of every word that occurs in at least one caption.
    """
    vocabulary = set()

    # Plain loops instead of a list comprehension run only for its side
    # effects, which would build and discard a list of None values.
    for description_list in descriptions.values():
        for description in description_list:
            vocabulary.update(description.split())

    return vocabulary

 # Store all descriptions in a single file
def store_descriptions_in_file(descriptions, filename):
    """Write all captions to a text file, one ``<image id>\\t<caption>`` per line.

    Lines are joined with ``\\n`` and no trailing newline is written.

    Args:
        descriptions: dict mapping an image id to a list of caption strings.
        filename: Path of the file to create or overwrite.

    Raises:
        OSError: If the file cannot be opened or written.
    """
    lines = [
        image_key + '\t' + description
        for image_key, description_list in descriptions.items()
        for description in description_list
    ]

    # The context manager guarantees the handle is flushed and closed even if
    # the write fails part-way through.
    with open(filename, "w") as file:
        file.write("\n".join(lines))

# Dataset locations on the mounted Google Drive (caption text and image folder)
text_data_folder = "/content/drive/MyDrive/DL/Flickr8k_text"
image_data_folder = "/content/drive/MyDrive/DL/Flicker8k_Dataset"

# Path of the token file that holds "<image id>\t<caption>" lines
text_data_file = text_data_folder + "/" + "Flickr8k.token.txt"

# Build the image -> captions dictionary.
# Note: extract_image_captions only processes the first half of the file's lines.
image_captions = extract_image_captions(text_data_file)

# Clean the descriptions.
# clean_text_data edits the caption lists in place and returns its argument, so
# `cleaned_descriptions` and `image_captions` refer to the same dictionary.
cleaned_descriptions = clean_text_data(image_captions)

# Build the vocabulary of unique words from the cleaned descriptions
word_vocab = build_text_vocabulary(cleaned_descriptions)


# Persist the cleaned captions; the training cell reads this file back
store_descriptions_in_file(cleaned_descriptions, "cleaned_descriptions.txt")


#Extracting the feature vector from all images

In [None]:
# Extract image features using the Xception model
def extract_image_features(directory):
    """Compute an Xception feature vector for every image file in a directory.

    Note: a later cell in this notebook defines another function with the same
    name but a ``(filename, model)`` signature; once that cell has run, this
    definition is shadowed.

    Args:
        directory: Folder whose files are all treated as images.

    Returns:
        dict mapping each file name (not the full path) to the array returned
        by ``model.predict`` for that image.
    """
    # Xception without its classification head, with global average pooling
    model = Xception(include_top=False, pooling='avg')
    extracted_features = {}

    for image_filename in os.listdir(directory):
        full_path = os.path.join(directory, image_filename)
        # Sub-directories cannot be opened as images; skip them.
        if not os.path.isfile(full_path):
            continue

        # Force three channels so grayscale / RGBA / CMYK files produce the
        # (299, 299, 3) input the network expects, and close the file handle
        # as soon as the pixel data has been copied.
        with Image.open(full_path) as image:
            rgb_image = image.convert('RGB').resize((299, 299))

        # Add the batch axis and scale pixel values from [0, 255] to [-1, 1]
        pixels = np.expand_dims(np.asarray(rgb_image), axis=0)
        pixels = pixels / 127.5 - 1.0

        # verbose=0 avoids printing one progress bar per image
        extracted_features[image_filename] = model.predict(pixels, verbose=0)

    return extracted_features

# Extract feature vectors for all images and save them to a pickle file.
# The `with` block makes sure the file is flushed and closed after the dump.
image_features = extract_image_features(image_data_folder)
with open("image_features.p", "wb") as features_file:
    dump(image_features, features_file)


#Loading dataset for Training the model

In [None]:
# Load data for training
def load_training_data(filename):
    """Read a list of photo file names, one per line.

    Blank lines are ignored, so the result is the same whether or not the file
    ends with a trailing newline (blindly dropping the last element would lose
    a real file name when the newline is missing).

    Args:
        filename: Path of the text file listing the photo file names.

    Returns:
        list of file-name strings in file order.
    """
    file_content = load_text_file(filename)
    photo_filenames = [line.strip() for line in file_content.split("\n") if line.strip()]
    return photo_filenames

def load_cleaned_descriptions(filename, photo_filenames):
    """Load the cleaned captions that belong to the requested photos.

    Each non-empty line of the file is split on whitespace; the first token is
    the image file name and the remaining tokens form the caption. Every kept
    caption is wrapped as ``'<start> ... <end>'``.

    Args:
        filename: Path of the cleaned-descriptions file.
        photo_filenames: Iterable of image file names to keep.

    Returns:
        dict mapping image file name to a list of wrapped caption strings.
        Images without any line in the file do not appear in the result.
    """
    # Set membership is O(1); testing against the list once per caption line
    # would scan the whole list every time.
    wanted = set(photo_filenames)
    descriptions = {}

    for line in load_text_file(filename).split("\n"):
        words = line.split()
        if not words:
            continue

        image_filename, image_caption = words[0], words[1:]
        if image_filename not in wanted:
            continue

        description = '<start> ' + " ".join(image_caption) + ' <end>'
        descriptions.setdefault(image_filename, []).append(description)

    return descriptions

def load_selected_features(photo_filenames):
    """Load the pickled image features and keep only the requested photos.

    Reads ``image_features.p`` from the current working directory.

    Args:
        photo_filenames: Iterable of image file names to select.

    Returns:
        dict mapping each requested file name to its stored feature value.

    Raises:
        KeyError: If a requested file name has no entry in the pickle.
        OSError: If the pickle file cannot be opened.
    """
    # SECURITY: pickle can execute arbitrary code while loading; only read a
    # file that this notebook produced itself.
    with open("image_features.p", "rb") as features_file:
        all_image_features = load(features_file)
    selected_features = {k: all_image_features[k] for k in photo_filenames}
    return selected_features

# Path of the file listing the training image file names (one per line)
training_data_filename = text_data_folder + "/" + "Flickr_8k.trainImages.txt"

# Load training data.
# training_descriptions only contains images that also appear in
# cleaned_descriptions.txt, so it can hold fewer images than
# training_image_filenames; training_image_features is selected for every
# listed file name and raises KeyError if one has no stored feature vector.
training_image_filenames = load_training_data(training_data_filename)
training_descriptions = load_cleaned_descriptions("cleaned_descriptions.txt",
                                                  training_image_filenames)
training_image_features = load_selected_features(training_image_filenames)

#Tokenizing the vocabulary

In [None]:
# Convert a dictionary of descriptions into a flat list
def descriptions_to_list(descriptions):
    """Flatten an image -> captions dictionary into one list of captions.

    Args:
        descriptions: dict mapping an image id to a list of caption strings.

    Returns:
        list with every caption, grouped by image in dictionary order.
    """
    # A nested comprehension builds the result directly, instead of using a
    # comprehension only for its append() side effects.
    return [
        description
        for description_list in descriptions.values()
        for description in description_list
    ]

# Create a text tokenizer to vectorize the text corpus
# Each integer will represent a token in the dictionary
def build_text_tokenizer(descriptions):
    """Fit a Keras ``Tokenizer`` on every caption in ``descriptions``.

    Args:
        descriptions: dict mapping an image id to a list of caption strings.

    Returns:
        The fitted ``Tokenizer`` instance.
    """
    text_tokenizer = Tokenizer()
    text_tokenizer.fit_on_texts(descriptions_to_list(descriptions))
    return text_tokenizer

# Build a tokenizer from the training captions and save it as a pickle file.
# The `with` block makes sure the file is flushed and closed after the dump.
tokenizer = build_text_tokenizer(training_descriptions)
with open('text_tokenizer.p', 'wb') as tokenizer_file:
    dump(tokenizer, tokenizer_file)
# +1 because word indices start at 1; index 0 is left for padding
vocabulary_size = len(tokenizer.word_index) + 1

#Calculate the maximum length of descriptions
def calculate_max_description_length(descriptions):
    """Return the word count of the longest caption in ``descriptions``.

    Args:
        descriptions: dict mapping an image id to a list of caption strings.

    Returns:
        int, the largest number of whitespace-separated words in any caption.

    Raises:
        ValueError: If there are no captions at all.
    """
    word_counts = (
        len(caption.split())
        for caption_list in descriptions.values()
        for caption in caption_list
    )
    return max(word_counts)

# NOTE(review): the length is measured on `image_captions` (cleaned captions of
# every loaded image, without the '<start>'/'<end>' markers), not on
# `training_descriptions`, whose captions carry those two extra tokens.
# Confirm this is the intended sequence length for the model input.
max_description_length = calculate_max_description_length(image_captions)
max_description_length

33

#Building the CNN-RNN (LSTM) model

In [None]:
# Define the image captioning model with additional layers
def create_captioning_model(vocab_size, max_sequence_length):
    """Build and compile the two-input captioning network.

    The image branch takes a 2048-value feature vector and passes it through
    dropout and two dense layers (512 then 256 units). The text branch takes a
    sequence of ``max_sequence_length`` word indices and passes it through an
    embedding (256 dimensions, index 0 masked), dropout and two stacked LSTMs
    (256 units each). Both 256-value outputs are summed element-wise, fed to a
    dense layer and finally to a softmax over ``vocab_size`` words.

    Args:
        vocab_size: Number of word indices (embedding input size and size of
            the softmax output).
        max_sequence_length: Length of the padded input word sequences.

    Returns:
        The compiled Keras ``Model`` (categorical cross-entropy loss, Adam).

    Side effects:
        Writes a diagram of the network to ``captioning_model.png``.
    """
    # Define the input for image features
    image_input = Input(shape=(2048,))
    image_dropout = Dropout(0.5)(image_input)
    image_fc1 = Dense(512, activation='relu')(image_dropout)  # Additional dense layer
    image_fc2 = Dense(256, activation='relu')(image_fc1)

    # Define the input for text sequences
    text_input = Input(shape=(max_sequence_length,))
    # mask_zero=True makes downstream layers ignore the 0 padding index
    text_embed = Embedding(input_dim=vocab_size, output_dim=256, mask_zero=True)(text_input)
    text_dropout = Dropout(0.5)(text_embed)
    text_lstm1 = LSTM(256, return_sequences=True)(text_dropout)  # Additional LSTM layer
    text_lstm2 = LSTM(256)(text_lstm1)

    # Merge the image and text models (element-wise sum of the two 256-value outputs)
    merged = add([image_fc2, text_lstm2])
    merged_fc = Dense(256, activation='relu')(merged)
    output = Dense(vocab_size, activation='softmax')(merged_fc)

    # Create the model
    model = Model(inputs=[image_input, text_input], outputs=output)
    model.compile(loss='categorical_crossentropy', optimizer='adam')

    # Save a diagram of the model (with tensor shapes) to an image file
    # NOTE(review): plot_model presumably needs pydot and Graphviz installed — confirm
    plot_model(model, to_file='captioning_model.png', show_shapes=True)

    return model


#Create Data generator

In [None]:
# Create input-output sequence pairs from image descriptions
def create_sequence_pairs(tokenizer, max_length, description_list, feature, vocab_size=None):
    """Turn the captions of one image into next-word training samples.

    Every caption is encoded to word indices; for each position ``i >= 1`` one
    sample is produced whose text input is the first ``i`` indices (padded to
    ``max_length``) and whose target is the one-hot encoding of index ``i``.

    Args:
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and
            ``word_index``.
        max_length: Length the input sequences are padded to.
        description_list: Caption strings of a single image.
        feature: Feature vector of that image, repeated for every sample.
        vocab_size: Number of classes for the one-hot targets. Defaults to
            ``len(tokenizer.word_index) + 1`` so the function no longer
            depends on a notebook-level global.

    Returns:
        Tuple ``(X1, X2, y)`` of NumPy arrays: image features, padded input
        sequences and one-hot encoded next words.
    """
    if vocab_size is None:
        # Same formula the notebook uses for its vocabulary size
        vocab_size = len(tokenizer.word_index) + 1

    X1, X2, y = list(), list(), list()
    # Process each description for the image
    for description in description_list:
        # Encode the text sequence
        sequence = tokenizer.texts_to_sequences([description])[0]
        # Split one sequence into multiple input-output pairs
        for i in range(1, len(sequence)):
            # Split into input and output pair
            input_sequence, output_word = sequence[:i], sequence[i]
            # Pad the input sequence
            input_sequence = pad_sequences([input_sequence], maxlen=max_length)[0]
            # Encode the output word
            output_word = to_categorical([output_word], num_classes=vocab_size)[0]
            # Store the pairs
            X1.append(feature)
            X2.append(input_sequence)
            y.append(output_word)
    return np.array(X1), np.array(X2), np.array(y)

# Data generator consumed by Keras training (one batch per image)
def sequence_data_generator(descriptions, features, tokenizer, max_length):
    """Endlessly yield one training batch per image.

    Args:
        descriptions: dict mapping image id to a list of caption strings.
        features: dict mapping image id to its stored feature array; the
            first row (index 0) is used as the image feature vector.
        tokenizer: Fitted tokenizer used to encode the captions.
        max_length: Length the input sequences are padded to.

    Yields:
        ``([image_features, input_sequences], next_words)``. The outer
        container is a tuple because Keras expects generator batches as
        ``(inputs, targets)`` tuples; an outer list can be mistaken for a
        list of three model inputs.
    """
    while True:
        for image_key, description_list in descriptions.items():
            # Retrieve image features
            feature = features[image_key][0]
            input_image, input_sequence, output_word = create_sequence_pairs(
                tokenizer, max_length, description_list, feature)
            yield ([input_image, input_sequence], output_word)

#Training the model

In [None]:
# Create and compile the image captioning model
model = create_captioning_model(vocabulary_size, max_description_length)
num_epochs = 10
# One generator batch per training image, so one epoch visits every image once
num_steps = len(training_descriptions)

# Create the 'models' directory for the checkpoints; exist_ok keeps this cell
# re-runnable (os.mkdir raises FileExistsError on the second run)
os.makedirs("models", exist_ok=True)

# Train one epoch at a time so a checkpoint can be saved after each epoch
for epoch in range(num_epochs):
    data_gen = sequence_data_generator(training_descriptions, training_image_features, tokenizer, max_description_length)
    # Model.fit accepts Python generators directly; fit_generator is deprecated
    # and no longer available in recent Keras releases
    model.fit(data_gen, epochs=1, steps_per_epoch=num_steps, verbose=1)
    model.save("models/model_" + str(epoch) + ".h5")


#Testing the model

In [None]:
# Extract features from an image using a pre-trained model
def extract_image_features(filename, model):
    """Compute the feature vector of a single image with a given model.

    Note: this definition replaces the earlier directory-based function of the
    same name once this cell has run.

    Args:
        filename: Path of the image file.
        model: Feature extractor exposing ``predict`` (the notebook passes an
            Xception network without its top layer).

    Returns:
        Whatever ``model.predict`` returns for the preprocessed image batch.

    Raises:
        OSError: If the file is missing or is not a readable image. The error
            is re-raised after the hint is printed, instead of continuing with
            an undefined image variable.
    """
    try:
        # convert('RGB') handles grayscale and 4-channel images alike, so the
        # network always receives three channels; the file handle is closed
        # as soon as the pixel data has been copied.
        with Image.open(filename) as source_image:
            image = source_image.convert('RGB').resize((299, 299))
    except OSError:
        print("ERROR: Unable to open the image! Ensure that the image path and file extension are correct.")
        raise

    # Add the batch axis and scale pixel values from [0, 255] to [-1, 1]
    image = np.expand_dims(np.asarray(image), axis=0)
    image = image / 127.5
    image = image - 1.0

    # Extract features from the image using the pre-trained model
    features = model.predict(image)
    return features

# Map an integer back to a word using a tokenizer
def word_for_id(integer, tokenizer):
    """Look up the word whose tokenizer index equals ``integer``.

    Args:
        integer: Word index to search for.
        tokenizer: Object exposing a ``word_index`` mapping of word -> index.

    Returns:
        The first word (in ``word_index`` order) mapped to ``integer``, or
        ``None`` when no word has that index.
    """
    matches = (
        candidate
        for candidate, position in tokenizer.word_index.items()
        if position == integer
    )
    return next(matches, None)

# Generate a description for an image using a trained model
def generate_description(model, tokenizer, image_features, max_length):
    """Greedily decode a caption for one image.

    Starting from the word 'start', the current text is encoded and padded,
    the model predicts the next word, and the most probable word is appended.
    Decoding stops after ``max_length`` steps, when the predicted index has no
    word, or right after the word 'end' has been appended.

    Args:
        model: Trained captioning model taking ``[image_features, sequence]``.
        tokenizer: Tokenizer used to encode text and map indices to words.
        image_features: Feature array of the image to describe.
        max_length: Padding length and maximum number of decoding steps.

    Returns:
        The generated text, including the leading 'start' and, if it was
        produced, the trailing 'end'.
    """
    caption = 'start'
    for _ in range(max_length):
        encoded = tokenizer.texts_to_sequences([caption])[0]
        padded = pad_sequences([encoded], maxlen=max_length)
        probabilities = model.predict([image_features, padded], verbose=0)
        # Greedy choice: index of the highest predicted probability
        next_word = word_for_id(np.argmax(probabilities), tokenizer)
        if next_word is None:
            break
        caption = caption + ' ' + next_word
        if next_word == 'end':
            break
    return caption

# Paths of the test image, the pickled tokenizer and the trained captioning model
image_path = '/content/10815824_2997e03d76.jpg'
# SECURITY: pickle can execute arbitrary code while loading; only read a
# tokenizer file that this notebook produced itself. The `with` block closes
# the file handle once the tokenizer is loaded.
with open("/content/text_tokenizer.p", "rb") as tokenizer_file:
    tokenizer = load(tokenizer_file)
model = load_model('/content/models/model_9.h5')

# Extract features from the image
xception_model = Xception(include_top=False, pooling="avg")
photo_features = extract_image_features(image_path, xception_model)

# Open the image
image = Image.open(image_path)

# Generate a description for the image using the trained model and display it along with the image.
# NOTE: max_description_length comes from the tokenizing cell earlier in the
# notebook; that cell must have run in this kernel session.
description = generate_description(model, tokenizer, photo_features, max_description_length)
print("\n\n")
print(description)
plt.imshow(image)
