<a href="https://colab.research.google.com/github/Delaunay-I/image_cap_generator/blob/main/cap_gen_v3_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [45]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [46]:
## for google colab runs
%cd /content/drive/MyDrive/colab\ files
!pwd

/content/drive/MyDrive/colab files
/content/drive/MyDrive/colab files


In [47]:
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras import layers
import keras
import sys, time, os, warnings
import numpy as np
import pandas as pd
from collections import Counter

print("python {}".format(sys.version))
print("keras version {}".format(keras.__version__)); del keras
print("tensorflow version {}".format(tf.__version__))

def set_seed(sd=123):
    from numpy.random import seed
    from tensorflow import set_random_seed
    import random as rn
    ## numpy random seed
    seed(sd)
    ## core python's random number
    rn.seed(sd)
    ## tensor flow's random number
    set_random_seed(sd)

python 3.10.12 (main, Jun  7 2023, 12:45:35) [GCC 9.4.0]
keras version 2.12.0
tensorflow version 2.12.0


In [48]:
# Desired image dimensions
IMAGE_SIZE = (75, 75)

# Vocabulary size
VOCAB_SIZE = 4000

# Fixed length allowed for any sequence
SEQ_LENGTH = 25

# Dimension for the image embeddings and token embeddings
EMBED_DIM = 512

# Per-layer units in the feed-forward network
FF_DIM = 512

# Other training parameters
BATCH_SIZE = 2
SHUFFLE_DIM = 1
EPOCHS = 30
AUTOTUNE = tf.data.AUTOTUNE

In [49]:
dir_Flickr_jpg = "/content/drive/MyDrive/colab files/flickr8k/Images"

dir_Flickr_text = "/content/drive/MyDrive/colab files/flickr8k/captions.txt"

jpgs = os.listdir(dir_Flickr_jpg)

df_txt = pd.read_csv(dir_Flickr_text, skiprows=1, names=["filename", "caption"])
df_txt['caption'] = df_txt['caption'].str.lower()

df_txt['index'] = df_txt.groupby("filename").cumcount()

uni_filenames = np.unique(df_txt.filename.values)

In [50]:
import string

# Remove punctuations..
def remove_punctuation(text_original):
    text_no_punctuation = text_original.translate(str.maketrans('', '', string.punctuation))
    return(text_no_punctuation)

# Remove a single character word..
def remove_single_character(text):
    text_len_more_than1 = ""
    for word in text.split():
        if len(word) > 1:
            text_len_more_than1 += " " + word
    return(text_len_more_than1)

# Remove words with numeric values..
def remove_numeric(text,printTF=False):
    text_no_numeric = ""
    for word in text.split():
        isalpha = word.isalpha()
        if printTF:
            print("    {:10} : {:}".format(word,isalpha))
        if isalpha:
            text_no_numeric += " " + word
    return(text_no_numeric)

In [51]:
def text_clean(text_original):
    text = remove_punctuation(text_original)
    text = remove_single_character(text)
    text = remove_numeric(text)
    return(text)


for i, caption in enumerate(df_txt.caption.values):
    newcaption = text_clean(caption)
    df_txt["caption"].iloc[i] = newcaption

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_txt["caption"].iloc[i] = newcaption


In [52]:
def df_word(df_txt):
    vocabulary = []
    for txt in df_txt.caption.values:
        vocabulary.extend(txt.split())
    print('Vocabulary Size: %d' % len(set(vocabulary)))
    ct = Counter(vocabulary)
    dfword = pd.DataFrame({"word":ct.keys(),"count":ct.values()})
    dfword = dfword.sort_values(by="count",ascending=False)
    dfword = dfword.reset_index()[["word","count"]]
    return(dfword)
dfword = df_word(df_txt)

Vocabulary Size: 8763


# Data prepration
prepare text and image separately

In [53]:
from copy import copy
def add_start_end_seq_token(captions):
    caps = []
    for txt in captions:
        txt = 'startseq ' + txt + ' endseq'
        caps.append(txt)
    return(caps)

df_txt["caption"] = add_start_end_seq_token(df_txt["caption"])

# split the dataset int train and test splits

In [54]:
from sklearn.model_selection import train_test_split

# create a list of unique image file names in your DataFrame (df_txt) using the unique method of pandas:
unique_files = df_txt['filename'].unique()

# Split the list of unique file names into train and test sets using the train_test_split function from scikit-learn:
train_files, test_files = train_test_split(unique_files, test_size=0.2, random_state=42)

# Filter the original DataFrame to include only the rows corresponding to the image files in the train and test sets:
train_df = df_txt[df_txt['filename'].isin(train_files)]
test_df = df_txt[df_txt['filename'].isin(test_files)]

# Verify that there is no leakage by checking if there are any image file names that appear in both the train and test sets:
assert len(set(train_df['filename']).intersection(set(test_df['filename']))) == 0
assert train_df.shape[0]/5 == train_df.filename.unique().size
assert test_df.shape[0]/5 == test_df.filename.unique().size

In [55]:
grouped = train_df.groupby('filename')['caption'].apply(list)
train_cap_list = [captions for captions in grouped]

grouped = test_df.groupby('filename')['caption'].apply(list)
test_cap_list = [captions for captions in grouped]

# Image prepration
## create features for image using InceptionV3 model

In [56]:
import glob
import pickle

train_path = dir_Flickr_jpg
path_all_images = glob.glob(train_path + '/*jpg')

train_img = []  # list of all images in training set
test_img = []
for im in path_all_images:
    file_name = os.path.basename(os.path.normpath(im))
    # include images that only exist in the target directory
    # can split the dataset this way
    if(file_name in train_df.filename.to_list()):
        train_img.append(im)
    elif (file_name in test_df.filename.to_list()):
        test_img.append(im)
    else:
        print(f"{file_name} not in the directory")

Preprocess image

In [57]:
from tensorflow.keras.utils import load_img, img_to_array
from keras.applications.inception_v3 import preprocess_input

def read_image():
    def decode_image(image_path):
        img = tf.io.read_file(image_path)
        img = tf.image.decode_jpeg(img, channels=3)
        img = tf.image.resize(img, (75, 75))
        img = tf.image.convert_image_dtype(img, tf.float32)

        return img

    return decode_image

# Tokenize the captions

In [58]:
from tensorflow.keras.layers import TextVectorization

tokenizer = TextVectorization(
    max_tokens=VOCAB_SIZE,
    output_mode="int",
    output_sequence_length=SEQ_LENGTH,
)

tokenizer.adapt(train_df.caption.to_list())
VOCAB_SIZE = len(tokenizer.get_vocabulary())

## Make dataset

In [59]:
def make_dataset(image_path, captions, tokenizer):
    read_image_fx = read_image()
    img_dataset = tf.data.Dataset.from_tensor_slices(image_path)

    img_dataset = (img_dataset
                   .map(read_image_fx, num_parallel_calls=AUTOTUNE))

    cap_dataset = tf.data.Dataset.from_tensor_slices(captions).map(tokenizer, num_parallel_calls=AUTOTUNE)

    dataset = tf.data.Dataset.zip((img_dataset, cap_dataset))
    dataset = dataset.batch(BATCH_SIZE).shuffle(SHUFFLE_DIM).prefetch(AUTOTUNE)
    return dataset

train_dataset = make_dataset(train_img, train_cap_list, tokenizer=tokenizer)
valid_dataset = make_dataset(test_img, test_cap_list, tokenizer=tokenizer)

In [60]:
next(iter(train_dataset))[0].shape

TensorShape([2, 75, 75, 3])

In [61]:
img, txt = next(iter(train_dataset))


# Define the model

cnn model

In [62]:
from keras.applications import inception_v3
from tensorflow import keras
from tensorflow.keras import layers

def get_cnn_model():
    base_model = inception_v3.InceptionV3(
        input_shape=(*IMAGE_SIZE, 3),
        include_top=False,
        weights='imagenet'
        )

    # Freeze feature extractor layers
    base_model.trainable = False
    base_model_out = base_model.output
    base_model_out = layers.Reshape((-1, base_model_out.shape[-1]))(base_model_out) # shape: (batch_size, 1, max_image_feats=2048 for inceptionV3)
    cnn_model = keras.models.Model(base_model.input, base_model_out)

    return cnn_model

define transformer components

In [18]:
class PositionalEmbedding(tf.keras.layers.Layer):
    def __init__(self, max_length, vocab_size, d_model):
        super().__init__()
        self.pos_embedding = tf.keras.layers.Embedding(input_dim=max_length, output_dim=d_model)

        self.token_embedding = tf.keras.layers.Embedding(
            input_dim=vocab_size,
            output_dim=d_model,
            mask_zero=True)

        self.add = tf.keras.layers.Add()

    def call(self, seq):
        seq = self.token_embedding(seq) # (batch, seq, d_model)

        x = tf.range(tf.shape(seq)[1])  # (seq)
        x = x[tf.newaxis, :]  # (1, seq)
        x = self.pos_embedding(x)  # (1, seq, d_model)

        return self.add([seq,x])

Attention sub-layers

In [19]:
class BaseAttention(tf.keras.layers.Layer):
    def __init__(self, **kwargs):
        super().__init__()
        self.mha = tf.keras.layers.MultiHeadAttention(**kwargs)
        self.layernorm = tf.keras.layers.LayerNormalization()
        self.add = tf.keras.layers.Add()

class CausalSelfAttention(BaseAttention):
    def call(self, x):
        attn_output = self.mha(
            query=x,
            value=x,
            key=x,
            use_causal_mask = True)
        x = self.add([x, attn_output])
        x = self.layernorm(x)
        return x


class GlobalSelfAttention(BaseAttention):
    def call(self, x, training, mask=None, **kwargs):
        attn_output = self.mha(
            query=x,
            value=x,
            key=x,
            attention_mask=None,
            training=training,
            )
        x = self.add([x, attn_output])
        x = self.layernorm(x)
        return x


class CrossAttention(BaseAttention):
    def call(self, x, context, **kwargs):
        attn_output, attn_scores = self.mha(
            query=x,
            key=context,
            value=context,
            return_attention_scores=True)

        # Cache the attention scores for plotting later.
        self.last_attn_scores = attn_scores

        x = self.add([x, attn_output])
        x = self.layernorm(x)

        return x

feed-forward sub-layer

In [20]:
class FeedForward(tf.keras.layers.Layer):
    def __init__(self, embed_dim, dff, dropout_rate=0.1):
        super().__init__()
        self.seq = tf.keras.Sequential([
            tf.keras.layers.Dense(dff, activation='relu'),
            tf.keras.layers.Dense(embed_dim),
            tf.keras.layers.Dropout(dropout_rate)
        ])
        self.add = tf.keras.layers.Add()
        self.layer_norm = tf.keras.layers.LayerNormalization()

    def call(self, x):
        x = self.add([x, self.seq(x)])
        x = self.layer_norm(x)
        return x

## Encoder Block
Encoder layer

In [38]:
class Encoder(tf.keras.layers.Layer):
    def __init__(self, *, num_layers, embed_dim, dense_dim, num_heads,
                dropout_rate=0.1):
        super().__init__()

        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.num_layers = num_layers
        self.layernorm_1 = layers.LayerNormalization()
        self.dense_proj = layers.Dense(embed_dim, activation="relu")

        self.enc_layers = [
            GlobalSelfAttention(
                num_heads=num_heads, key_dim=embed_dim, dropout=dropout_rate
                )
            for _ in range(num_layers)]
        self.dropout = tf.keras.layers.Dropout(dropout_rate)

    def call(self, inputs):
        # `inputs` is token-IDs shape: (batch, 1, seq_len)
        inputs = self.layernorm_1(inputs)
        inputs = self.dense_proj(inputs)
        # Add dropout.
        x = self.dropout(inputs)

        for i in range(self.num_layers):
            x = self.enc_layers[i](x)

        return x  # Shape `(batch_size, 1, embed_dim)`.


Transformer decoder

In [None]:
class DecoderLayer(tf.keras.layers.Layer):
  def __init__(self, units, num_heads=1, dropout_rate=0.1):
    super().__init__()

    self.self_attention = CausalSelfAttention(num_heads=num_heads,
                                              key_dim=units,
                                              dropout=dropout_rate)
    self.cross_attention = CrossAttention(num_heads=num_heads,
                                          key_dim=units,
                                          dropout=dropout_rate)
    self.ff = FeedForward(units=units, dropout_rate=dropout_rate)


  def call(self, inputs, training=False):
    in_seq, out_seq = inputs

    # Text input
    out_seq = self.self_attention(out_seq)

    out_seq = self.cross_attention(out_seq, in_seq)

    self.last_attention_scores = self.cross_attention.last_attention_scores

    out_seq = self.ff(out_seq)

    return out_seq

class TransformerDecoderBlock(layers.Layer):

# Evaluate the model

In [None]:
# map an integer to a word
def word_for_id(integer, tokenizer):
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None

# generate a description for an image
def generate_desc(model, tokenizer, image_feats, max_length):
    image_feats = image_feats.reshape(1,-1)
    # seed the generation process
    in_text = 'startseq'
    # iterate over the whole length of the sequence
    for i in range(max_length):
        # integer encode input sequence
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        # pad input
        sequence = pad_sequences([sequence], maxlen=max_length, padding='post')
        # predict next word
        yhat = model.predict([image_feats, sequence], verbose=0)
        # convert probability to integer
        yhat = np.argmax(yhat)
        # map integer to word
        word = word_for_id(yhat, tokenizer)
        # stop if we cannot map the word
        if word is None:
            break
        # append as input for generating the next word
        in_text += ' ' + word
        # stop if we predict the end of the sequence
        if word == 'endseq':
            break
    return in_text

## BLEU Score

In [None]:
from nltk.translate.bleu_score import corpus_bleu
# evaluate the skill of the model
def evaluate_model(model, df, image_data, tokenizer, max_length):
    with mlflow.start_run():

        actual, predicted = list(), list()
        count = 0
        # step over the whole set
        for key, image_feats in image_data.items():
            count += 1
            if count % 200 == 0:
                print("  {:4.2f}% is done..".format(100*count/float(df.shape[0]/5)))
            # generate description
            yhat = generate_desc(model, tokenizer, image_feats, max_length)
            # append all the captions of a image file to a list
            caption_list = list()
            for desc in test_df.loc[df["filename"] == key, "caption"]:
                caption_list.append(desc)
            # store actual and predicted
            references = [d.split() for d in caption_list]
            actual.append(references)
            predicted.append(yhat.split())
        # calculate BLEU score
        bleu1 = corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0))
        bleu2 = corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0))
        bleu3 = corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0))
        bleu4 = corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25))
        print('BLEU-1: %f' % bleu1)
        print('BLEU-2: %f' % bleu2)
        print('BLEU-3: %f' % bleu3)
        print('BLEU-4: %f' % bleu4)

        mlflow.log_metric("BLEU-1", bleu1)
        mlflow.log_metric("BLEU-2", bleu2)
        mlflow.log_metric("BLEU-3", bleu3)
        mlflow.log_metric("BLEU-4", bleu4)

In [None]:
evaluate_model(model, test_df, test_img_feats, tokenizer, max_length)

  12.35% is done..
  24.71% is done..
  37.06% is done..
  49.41% is done..
  61.77% is done..
  74.12% is done..
  86.47% is done..
  98.83% is done..
