<a href="https://colab.research.google.com/github/AnandVishnuG/ImageCaptioningLSTM/blob/main/ImageCaptioning_Finetuned.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install tensorflow_addons
!pip install bert_score

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tensorflow_addons
  Downloading tensorflow_addons-0.19.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m52.5 MB/s[0m eta [36m0:00:00[0m
Collecting typeguard>=2.7
  Downloading typeguard-3.0.2-py3-none-any.whl (30 kB)
Installing collected packages: typeguard, tensorflow_addons
Successfully installed tensorflow_addons-0.19.0 typeguard-3.0.2
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 KB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
Collecting transformers>=3.0.0
  Downloading transformers-4.27.4-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
from matplotlib import pyplot as plt
from google.colab import files, drive
from tqdm import tqdm 
from PIL import Image
from pickle import dump, load
from keras.models import Sequential, Model, load_model
from keras.layers import Input, Rescaling, Dense, Flatten, LSTM, Embedding, Dropout, add
from keras.applications import Xception
from keras.callbacks import TensorBoard
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences, to_categorical, plot_model
from tensorflow.io import read_file
from tensorflow.image import decode_jpeg, resize, convert_image_dtype
from keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from tensorflow_addons.optimizers import MultiOptimizer

import tensorflow as tf
import numpy as np
import pandas as pd
import os

%load_ext tensorboard



The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [None]:
# Mount Google Drive at /content/drive; the caption file, split lists and
# images read by later cells all live under /content/drive/MyDrive.
drive.mount('/content/drive')

NameError: ignored

In [None]:
# Read Captions file and create descriptions dictionary
def read_captions(filename):
    """Load the captions CSV at ``filename`` (cp1252-encoded) into a DataFrame."""
    return pd.read_csv(filename, encoding="cp1252")

# Group captions by image: {image name: [caption, caption, ...]}
def data_to_dictionary(data):
    """Map each value of the ``image`` column to the list of its ``caption`` values.

    Captions are appended in row order; images appear in the order they are
    first encountered.
    """
    grouped = {}
    for _, record in data.iterrows():
        # setdefault covers both the first and every later caption of an image
        grouped.setdefault(record["image"], []).append(record["caption"])
    return grouped
# Collect the set of distinct whitespace-separated words across all captions
def text_vocabulary(descriptions):
    """Return the set of words (split on whitespace) used in any caption."""
    words = set()
    for caption_list in descriptions.values():
        for caption in caption_list:
            words.update(caption.split())
    return words
# Flatten the {image: [captions]} dictionary into a single list of captions
def dict_to_list(descriptions):
    """Return every caption in ``descriptions`` as one flat list, in dictionary order."""
    return [caption for caption_list in descriptions.values() for caption in caption_list]
# Return list of image names to fetch
def load_photos(filename):
    """Read a split file containing one image file name per line.

    The file is closed as soon as it has been read. Blank lines are skipped,
    so the result is the same whether or not the file ends with a newline
    (the previous ``[:-1]`` slice silently dropped the last image when the
    trailing newline was missing).

    Args:
        filename: Path of the text file listing image names.

    Returns:
        List of image names in file order.
    """
    with open(filename, 'r') as split_file:
        return [line.strip() for line in split_file if line.strip()]
# Return descriptions for the image names, wrapped in <start> ... <end> markers
def load_descriptions(photos, all_descriptions=None):
    """Select the captions of ``photos`` and wrap each in start/end markers.

    Args:
        photos: Iterable of image names to keep.
        all_descriptions: Optional ``{image name: [caption, ...]}`` mapping to
            select from. When omitted, the notebook-level ``descriptions``
            dictionary is used, which preserves the original call signature.

    Returns:
        ``{image name: ["<start> caption <end>", ...]}`` for every requested
        image that has captions. Unknown images are skipped and repeated names
        are processed only once.
    """
    if all_descriptions is None:
        # Fall back to the global built by data_to_dictionary() in the notebook
        all_descriptions = descriptions
    wrapped = {}
    for photo in photos:
        if photo in all_descriptions and photo not in wrapped:
            wrapped[photo] = ["<start> " + desc + " <end>" for desc in all_descriptions[photo]]
    return wrapped

# Build the word-to-index tokenizer from the captions
def create_tokenizer(descriptions):
    """Fit a Keras ``Tokenizer`` on every caption in ``descriptions`` and return it."""
    caption_tokenizer = Tokenizer()
    caption_tokenizer.fit_on_texts(dict_to_list(descriptions))
    return caption_tokenizer
# Length, in whitespace-separated words, of the longest caption
def max_length(descriptions):
    """Return the largest word count among all captions in ``descriptions``."""
    word_counts = [len(caption.split()) for caption in dict_to_list(descriptions)]
    return max(word_counts)


In [None]:
# Define the Captioning model
def define_model(vocab_size, max_length):
    """Build and compile the image-captioning model.

    The model has two inputs: a 299 x 299 x 3 image passed through an Xception
    backbone, and a padded sequence of word ids passed through an
    Embedding + LSTM branch. Both branches produce 256 features, which are
    summed and fed to a softmax over the vocabulary that predicts the next word.

    Args:
        vocab_size: Number of output classes (size of the softmax layer and of
            the embedding table).
        max_length: Length of the padded input word-id sequence.

    Returns:
        The compiled Keras ``Model`` taking ``[image, sequence]`` as input.
    """
    # Xception backbone without its top classifier; global average pooling
    # gives one feature vector per image
    base_model = Xception(input_shape=(299, 299, 3), include_top=False, pooling='avg')
    # Freeze every backbone layer except the last 10, which are fine-tuned
    for layer in base_model.layers[:-10]:
        layer.trainable = False
    # Input images of size 299 x 299 and 3 channels
    input1 = Input((299,299,3))
    # x / 127.5 - 1; assumes pixel values arrive in [0, 255] -- TODO confirm
    x1 = Rescaling(scale=1/127.5, offset= -1)(input1)
    x1 = base_model(x1)
    x1 = Flatten()(x1)
    # Regularizing with dropout
    x1 = Dropout(0.5)(x1)
    # Project the image features down to 256 dimensions
    x1 = Dense(256, kernel_regularizer=l2(0.001), bias_regularizer=l2(0.001), activation='relu')(x1)
    # LSTM sequence model: input is a word-id sequence of length max_length
    input2 = Input(shape=(max_length,))
    # Embed each word id into 256 dimensions; id 0 is treated as padding and masked
    x2 = Embedding(vocab_size, 256, mask_zero=True)(input2)
    # Regularizing with dropout
    x2 = Dropout(0.5)(x2)
    # LSTM layer with 256 units
    x2 = LSTM(256, kernel_regularizer=l2(0.001), bias_regularizer=l2(0.001))(x2)
    # Regularizing with dropout
    x2 = Dropout(0.5)(x2)
    # Merge both branches by element-wise addition
    x = add([x1, x2])
    # 128 neurons
    x = Dense(128, activation='relu',kernel_regularizer=l2(0.001), bias_regularizer=l2(0.001))(x)
    # Output layer: probability of each vocabulary entry being the next word
    output = Dense(vocab_size, activation='softmax')(x)
    # Combine into one model [image, seq] -> [word]
    model = Model(inputs=[input1, input2], outputs=output)
    # Lower learning rate for the backbone than for the newly added layers
    optimizer_base_model_layers = Adam(learning_rate=0.0001)
    optimizer_fully_connected_layers = Adam()
    # model.layers[3] is the Xception backbone (see the printed summary);
    # every layer after it uses the default-rate optimizer
    optimizers_and_layers = [(optimizer_base_model_layers, model.layers[3]), 
                            (optimizer_fully_connected_layers, model.layers[4:])]

    model.compile(loss='categorical_crossentropy', optimizer=MultiOptimizer(optimizers_and_layers))
    # summary() prints by itself and returns None, so it is not wrapped in print()
    model.summary()
    # Plotting the model
    plot_model(model, to_file='model.png', show_shapes=True)
    return model

In [None]:
# Read the cleaned captions file from Google Drive into a DataFrame
captions = read_captions("/content/drive/MyDrive/Colab Notebooks/data/captions_clean.txt")
# Group the captions by image; `descriptions` is read as a global by
# load_descriptions() and by the evaluation cells further down
descriptions = data_to_dictionary(captions)


In [None]:
# Load a previously saved fine-tuned model from the Colab runtime's local disk.
# NOTE(review): the training cell below rebinds `model` via define_model(), and a
# later cell loads this same file again -- confirm this early load is still needed.
model = load_model("/content/model_10_finetuned.h5")
# tokenizer = load(open("/content/tokenizer (1).p","rb"))


In [None]:
# Training data
train_photos   = load_photos("/content/drive/MyDrive/Colab Notebooks/data/Flickr_8k.trainImages.txt")
train_desc     = load_descriptions(train_photos)
# Validation data (built from the dev split; previously this reused train_photos,
# so validation ran on the training captions)
val_photos   = load_photos("/content/drive/MyDrive/Colab Notebooks/data/Flickr_8k.devImages.txt")
val_desc     = load_descriptions(val_photos)
# Test data (built from the test split, not the training split)
test_photos   = load_photos("/content/drive/MyDrive/Colab Notebooks/data/Flickr_8k.testImages.txt")
test_desc     = load_descriptions(test_photos)

# Fit the tokenizer on the training captions only
tokenizer = create_tokenizer(train_desc)
# +1 because word index 0 is reserved for padding
vocab_size = len(tokenizer.word_counts) + 1
# NOTE(review): later cells pass `max_length` as an integer sequence length, but the
# assignment below is commented out, so on a fresh kernel the name still refers to
# the max_length() function. Enabling it as written would shadow that function.
# max_length = max_length(descriptions)
print(f"Number of training images: {len(train_photos)} ")
print(f"Number of training descriptions: {len(train_desc)} ")
print(f"Vocabulary: {vocab_size} ")
# print(f"Description length: {max_length} ")

Number of training images: 6000 
Number of training descriptions: 6000 
Vocabulary: 6965 


In [None]:
# Persist the fitted tokenizer; the context manager flushes and closes the file
with open('tokenizer.p', 'wb') as tokenizer_file:
    dump(tokenizer, tokenizer_file)
    

In [None]:
# Endless generator of training batches, one image (with all of its caption
# prefixes) per step; consumed by model.fit()
def data_generator(descriptions, tokenizer, max_length):
    """Yield ``[[images, sequences], next_words]`` batches forever.

    Each step loads one image of ``descriptions`` from the Drive image folder,
    decodes it as a 3-channel JPEG, resizes it to 299 x 299 and expands its
    captions into (prefix, next word) pairs with ``create_sequences``.
    """
    image_dir = "/content/drive/MyDrive/Colab Notebooks/data/Images/"
    while True:
        for image_name, captions in descriptions.items():
            raw_bytes = tf.io.read_file(image_dir + image_name)
            pixels = tf.image.decode_jpeg(raw_bytes, channels=3)
            pixels = tf.image.resize(pixels, (299,299))
            pixels = tf.image.convert_image_dtype(pixels, tf.float32)
            batch = create_sequences(tokenizer, max_length, captions, pixels)
            input_image, input_sequence, output_word = batch
            yield [[input_image, input_sequence], output_word]

def create_sequences(tokenizer, max_length, desc_list, image):
    """Expand one image's captions into next-word training samples.

    For every caption and every split position ``i >= 1``, the first ``i``
    token ids (padded to ``max_length``) form the input sequence and token
    ``i`` (encoded with ``to_categorical`` over the global ``vocab_size``
    classes) is the target. The same ``image`` is repeated for each sample.

    Returns:
        Three NumPy arrays: images, padded input sequences, encoded targets.
    """
    images, prefixes, targets = [], [], []
    for caption in desc_list:
        token_ids = tokenizer.texts_to_sequences([caption])[0]
        for split_at in range(1, len(token_ids)):
            # Words seen so far, padded to the fixed model input length
            prefix = pad_sequences([token_ids[:split_at]], maxlen=max_length)[0]
            # Word to predict next
            target = to_categorical([token_ids[split_at]], num_classes=vocab_size)[0]
            images.append(image)
            prefixes.append(prefix)
            targets.append(target)
    return np.array(images), np.array(prefixes), np.array(targets)

In [None]:
# Training model
# NOTE(review): `max_length` must be an integer sequence length here; no visible
# cell assigns it, so confirm it is defined before running on a fresh kernel.
model = define_model(vocab_size, max_length)
epochs = 10
# The generators yield one batch per image, so one epoch = one step per image
steps = len(train_desc)
steps_val = len(val_desc)
histories = []

# Each epoch is a separate fit() call so a checkpoint is written after every epoch
for i in range(epochs):
    # Fresh generators restart from the first image of each split
    generator_train = data_generator(train_desc, tokenizer, max_length)
    generator_val = data_generator(val_desc, tokenizer, max_length)
    history = model.fit(generator_train, validation_data= generator_val, epochs=1, steps_per_epoch= steps, validation_steps= steps_val, verbose=1)
    histories.append(history)
    # Saved as model_1.h5 ... model_10.h5 in the current working directory
    model.save("model_" + str(i+1) + ".h5")

Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_11 (InputLayer)          [(None, 299, 299, 3  0           []                               
                                )]                                                                
                                                                                                  
 rescaling_7 (Rescaling)        (None, 299, 299, 3)  0           ['input_11[0][0]']               
                                                                                                  
 input_12 (InputLayer)          [(None, 38)]         0           []                               
                                                                                                  
 xception (Functional)          (None, 2048)         20861480    ['rescaling_7[0][0]']      

Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089




In [None]:
# Download the checkpoint from the Colab runtime to the local machine.
# `files` is already imported in the notebook's import cell, so this
# re-import is redundant but harmless.
from google.colab import files
files.download('/content/model_10.h5')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Persist the per-epoch training histories; the context manager guarantees the
# pickle is flushed and closed before it is downloaded
with open('histories.p', 'wb') as histories_file:
    dump(histories, histories_file)
files.download('/content/tokenizer.p')
files.download('/content/histories.p')

In [None]:
# Reload the fine-tuned model for evaluation; this replaces whatever `model`
# was bound to by the training cell above.
model = load_model("/content/model_10_finetuned.h5")
# tokenizer_keras_reg = load(open("/content/tokenizer_keras_reg.p","rb"))
# features = load(open("/content/tokenizer (1).p","rb"))

In [None]:
# Print the layer-by-layer architecture of the loaded model
model.summary()

Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_11 (InputLayer)          [(None, 299, 299, 3  0           []                               
                                )]                                                                
                                                                                                  
 rescaling_7 (Rescaling)        (None, 299, 299, 3)  0           ['input_11[0][0]']               
                                                                                                  
 input_12 (InputLayer)          [(None, 38)]         0           []                               
                                                                                                  
 xception (Functional)          (None, 2048)         20861480    ['rescaling_7[0][0]']      

In [None]:
def word_for_id(integer, tokenizer):
    """Return the word whose index in ``tokenizer.word_index`` equals ``integer``.

    Returns ``None`` when no word has that index.
    """
    matches = (word for word, index in tokenizer.word_index.items() if index == integer)
    return next(matches, None)

def generate_desc(model, tokenizer, photo, max_length):
    """Greedily generate a caption for ``photo``.

    Starting from the seed word ``start``, the model is repeatedly asked for
    the most probable next word until it predicts ``end``, predicts an index
    with no word, or ``max_length`` words have been generated.

    The generated words are collected in a list instead of stripping
    ``"start "`` / ``" end"`` from the final string, because that substring
    replacement also mangled ordinary words (e.g. ``" ending"`` became
    ``"ing"``).

    Args:
        model: Captioning model whose ``predict`` accepts ``[photo, sequence]``.
        tokenizer: Fitted tokenizer used for training.
        photo: Model-ready image input (presumably a batch of one -- confirm
            against the caller).
        max_length: Padded sequence length expected by the model.

    Returns:
        The caption, capitalized and terminated with a period. If no word is
        generated the result is just ``"."``.
    """
    in_text = 'start'
    caption_words = []
    for _ in range(max_length):
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], maxlen=max_length)
        pred = model.predict([photo,sequence], verbose=0)
        # Greedy decoding: take the single most probable word
        pred = np.argmax(pred)
        word = word_for_id(pred, tokenizer)
        if word is None or word == 'end':
            break
        caption_words.append(word)
        in_text += ' ' + word
    return ' '.join(caption_words).capitalize() + "."


In [None]:
from pandas import read_csv
import random
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
from bert_score import score as BScore

smooth_func = SmoothingFunction().method1


# Caption five randomly chosen test images and score each prediction against
# the image's reference captions with BLEU and BERTScore
for i in range(0,5):
    rand_idx = random.randint(0, len(test_photos)-1)
    photo_name = test_photos[rand_idx]
    references = descriptions[photo_name]

    img = tf.io.read_file("/content/drive/MyDrive/Colab Notebooks/data/Images/" + photo_name)
    img = tf.image.decode_jpeg(img, channels=3)
    # Keep the full-resolution image for display
    image = img
    img = tf.image.resize(img, (299,299))
    img = tf.image.convert_image_dtype(img, tf.float32)
    # Add the batch dimension expected by model.predict
    img = tf.expand_dims(img, axis=0)

    description = generate_desc(model, tokenizer, np.array(img), max_length)
    print("\n\n")
    # Prediction
    print("Prediction: {}".format(description))
    print()
    # sentence_bleu expects token lists; raw strings are scored character by
    # character. Lower-case and drop the trailing period added by
    # generate_desc so the tokens are comparable with the references.
    hypothesis_tokens = description.lower().rstrip(".").split()
    reference_tokens = [ref.lower().split() for ref in references]
    score = sentence_bleu(reference_tokens, hypothesis_tokens, smoothing_function=smooth_func)
    for desc in references:
        print("Reference: {} ".format(desc))
    print("BLEU: {}".format(score))  
    # Pair the prediction with every reference (however many there are) and
    # report the mean F1
    candidates = [description] * len(references)
    p, r, f1 = BScore(candidates, references, lang='en', verbose=False)
    print("BERTScore: {:.4f}".format(f1.mean().item()))
    plt.figure()
    plt.imshow(image)
    plt.show(block=True)

UnimplementedError: ignored

In [None]:
# Load the feature-based captioning model and its matching tokenizer
model_keras_reg = load_model("/content/model_9keras_reg.h5")
# NOTE: pickle can execute arbitrary code on load -- only open tokenizer files
# produced by this project. The context manager closes the file handle.
with open("/content/tokenizer_keras_reg.p", "rb") as tokenizer_file:
    tokenizer_keras_reg = load(tokenizer_file)

In [None]:
from tensorflow.keras.applications import Xception

def extract_features(filename, model):
    """Run one image file through ``model`` and return the predicted features.

    The image is resized to 299 x 299, converted to 3-channel RGB (this drops
    an alpha channel and also handles grayscale/palette images, which used to
    fail on ``image.shape[2]``), scaled to [-1, 1] and given a batch dimension.

    Args:
        filename: Path of the image to load.
        model: Object with a ``predict`` method that accepts the image batch.

    Returns:
        Whatever ``model.predict`` returns for the single-image batch.

    Raises:
        OSError: If the image cannot be opened. The error is re-raised after
            the hint is printed, instead of continuing with an unbound
            ``image`` variable.
    """
    try:
        image = Image.open(filename)
    except OSError:
        print("ERROR: Couldn't open image! Make sure the image path and extension is correct")
        raise
    image = image.resize((299,299)).convert("RGB")
    image = np.array(image)
    image = np.expand_dims(image, axis=0)
    # Map pixel values from [0, 255] to [-1, 1]
    image = image/127.5
    image = image - 1.0
    feature = model.predict(image)
    return feature

def word_for_id(integer, tokenizer):
    """Look up the word mapped to index ``integer`` in ``tokenizer.word_index``.

    Returns ``None`` when the index is not present.
    """
    candidates = (word for word, index in tokenizer.word_index.items() if index == integer)
    return next(candidates, None)


def generate_desc(model, tokenizer, photo, max_length):
    """Greedily generate a caption for ``photo``.

    Starting from the seed word ``start``, the model is repeatedly asked for
    the most probable next word until it predicts ``end``, predicts an index
    with no word, or ``max_length`` words have been generated.

    The generated words are collected in a list instead of stripping
    ``"start "`` / ``" end"`` from the final string, because that substring
    replacement also mangled ordinary words (e.g. ``" ending"`` became
    ``"ing"``).

    Args:
        model: Captioning model whose ``predict`` accepts ``[photo, sequence]``.
        tokenizer: Fitted tokenizer used for training.
        photo: Model-ready image input (presumably a batch of one -- confirm
            against the caller).
        max_length: Padded sequence length expected by the model.

    Returns:
        The caption, capitalized and terminated with a period. If no word is
        generated the result is just ``"."``.
    """
    in_text = 'start'
    caption_words = []
    for _ in range(max_length):
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], maxlen=max_length)
        pred = model.predict([photo,sequence], verbose=0)
        # Greedy decoding: take the single most probable word
        pred = np.argmax(pred)
        word = word_for_id(pred, tokenizer)
        if word is None or word == 'end':
            break
        caption_words.append(word)
        in_text += ' ' + word
    return ' '.join(caption_words).capitalize() + "."

from pandas import read_csv
import random
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score


smooth_func = SmoothingFunction().method1

# Read the test split; the context manager closes the file handle and blank
# lines (such as the one after a trailing newline) are skipped
with open('/content/drive/MyDrive/Colab Notebooks/data/Flickr_8k.testImages.txt',"r") as file:
    test_images = [name for name in file.read().split("\n") if name]

# Standalone Xception used only to turn an image into a feature vector
xception_model = Xception(include_top=False, pooling="avg")
for i in range(0,5):
    rand_idx = random.randint(0, len(test_images)-1)
    image_name = test_images[rand_idx]
    references = descriptions[image_name]
    fname = "/content/drive/MyDrive/Colab Notebooks/data/Images/" + image_name

    photo = extract_features (fname, xception_model)
    img = Image.open(fname)
    # This cell feeds pre-extracted features, so it uses the feature-based
    # model/tokenizer pair loaded in the previous cell; the fine-tuned `model`
    # expects raw 299 x 299 x 3 images and cannot consume `photo`.
    # NOTE(review): assumes model_keras_reg was trained with the same
    # `max_length` -- confirm.
    description = generate_desc(model_keras_reg, tokenizer_keras_reg, photo, max_length)
    print("\n\n")
    # Prediction
    print("Prediction: {}".format(description))
    print()
    # sentence_bleu expects token lists; raw strings are scored character by
    # character. Lower-case and drop the trailing period added by
    # generate_desc so the tokens are comparable with the references.
    hypothesis_tokens = description.lower().rstrip(".").split()
    reference_tokens = [ref.lower().split() for ref in references]
    score = sentence_bleu(reference_tokens, hypothesis_tokens, smoothing_function=smooth_func)
    for desc in references:
        print("Reference: {} ".format(desc))
    print("BLEU: {}".format(score))  
    plt.figure()
    plt.imshow(img)
    plt.show(block=True)

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/xception/xception_weights_tf_dim_ordering_tf_kernels_notop.h5


UnimplementedError: ignored