<a href="https://colab.research.google.com/github/Borg2/Image-captioning/blob/main/RNN_%26_Transfer_Learning_for_Image_Captioning_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Image Captioning

This is an image captioning system using encoder : VGG16 (CNN) for feature extraction and a decoder :LSTM (RNN) for caption generation on the Flickr8k dataset. This system will describe images in natural language, with a focus on using LSTM (RNN) model for the caption generation phase.

## importing the necessary libraries

In [11]:
import kagglehub
import numpy as np
import cv2
import os
import pandas as pd
import matplotlib.pyplot as plt
from keras.applications.vgg16 import VGG16
from tensorflow.keras.applications.vgg16 import preprocess_input
from keras.models import Model
from tokenizers import Tokenizer,normalizers
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer
from tokenizers.normalizers import Lowercase
from tokenizers.processors import TemplateProcessing
import tensorflow as tf
from keras.layers import Dense, LSTM , Embedding
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping

## dowloading the flickr8k dataset


In [2]:
# Download latest version
path = kagglehub.dataset_download("adityajn105/flickr8k")
print(path)

/root/.cache/kagglehub/datasets/adityajn105/flickr8k/versions/1


## Processing the images and captions

### Getting the images and captions path

In [3]:
images_path = os.path.join(path, "Images")
captions_path = os.path.join(path, "captions.txt")

captions_df = pd.read_csv(captions_path)
print (captions_df.head())

                       image  \
0  1000268201_693b08cb0e.jpg   
1  1000268201_693b08cb0e.jpg   
2  1000268201_693b08cb0e.jpg   
3  1000268201_693b08cb0e.jpg   
4  1000268201_693b08cb0e.jpg   

                                             caption  
0  A child in a pink dress is climbing up a set o...  
1              A girl going into a wooden building .  
2   A little girl climbing into a wooden playhouse .  
3  A little girl climbing the stairs to her playh...  
4  A little girl in a pink dress going into a woo...  


### Putting the captions into a dict with their corresponding image file name

In [4]:
captions = {}
for i, row in captions_df.iterrows():
    if row['image'] not in captions:
        captions[row['image']] = []
    captions[row['image']].append(row['caption'])


### Functions to process an image and the images folder


In [5]:
def process_image (image_path):
  image = cv2.imread(image_path)
  if image is not None:
      image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
      image = cv2.resize(image, (224, 224))
      image = preprocess_input(image)
  return image
def process_images_folder(image_path,captions):
  images = []
  captions_dict = {}
  for idx,image_file in enumerate(os.listdir(images_path)):
      image_path = os.path.join(images_path, image_file)
      image = process_image(image_path)
      if image is not None:
          images.append(image)
          captions_dict.setdefault(idx, []).extend(captions[image_file])
  return images,captions_dict

### Obtaining the images and their corresponding captions in order

In [6]:
images,captions_dict = process_images_folder(images_path,captions)
print("number of images:",len(images))
print("number of captions:",len(captions_dict))

number of images: 8091
number of captions: 8091


## Initializing the tokenizer and trainig it on the captions

The code creates a vocabulary for tokenizing captionsusing huggingfaces's tokenizer by using Byte Pair Encoding (BPE), a common subword tokenization technique. It prepares a list of captions with special tokens marking the start and end of sentences. The tokenizer learns a vocabulary based on the frequency of subwords in the dataset. Finally, it prints the size of the generated vocabulary.

In [12]:
# Creating the captions list with <s> as the start token and </s> as the end token
# Replacing periods ('.') in captions with the end token </s> to mark sentence endings
captions_list = ['<s>' + caption.replace('.', '</s>') for captions_group in captions_dict.values() for caption in captions_group]

# Initializing a Byte Pair Encoding (BPE) tokenizer
tokenizer = Tokenizer(BPE())

# Setting the normalizer to convert text to lowercase
tokenizer.normalizer = Lowercase()

# Setting the pre-tokenizer to split text into words based on whitespace
tokenizer.pre_tokenizer = Whitespace()

# Configuring the trainer for the BPE tokenizer
# - min_frequency=4: Tokens appearing less than 4 times will be ignored
# - special_tokens: Adding special tokens for unknown words ([UNK]), padding ([PAD]), and sentence boundaries (<s>, </s>)
trainer = BpeTrainer(min_frequency=4, special_tokens=["[UNK]", "[PAD]", "<s>", "</s>"])

# Training the tokenizer using the captions_list
# The captions_list iterator feeds sentences to the tokenizer for training
tokenizer.train_from_iterator(captions_list, trainer=trainer)

# Getting the vocabulary generated by the tokenizer
vocab = tokenizer.get_vocab()

# Printing the size of the vocabulary (number of unique tokens)
print(len(vocab))

5435


### Testing the tokenizer




In [13]:
print(tokenizer.encode("This is a test @@g,mai hello").ids,tokenizer.encode("This is a Test @@g,mai hello").tokens)

[858, 72, 30, 2625, 36, 11, 333, 38, 4066] ['this', 'is', 'a', 'test', 'g', ',', 'ma', 'i', 'hello']


### Enabling padding

enabling padding so all token sequenses are of the same length

In [13]:
tokenizer.enable_padding(pad_id= tokenizer.token_to_id("[PAD]"),pad_token="[PAD]")

### Tokenizing the captions

In [14]:
captions_tokenized = tokenizer.encode_batch(captions_list)

Extracting the caption ids seperately to able to process them

In [None]:
captions_sequenses = [capt.ids for i,capt in enumerate(captions_tokenized) ]
captions_sequenses = np.array(captions_sequenses)
print(captions_sequenses.shape)


## Initializing the encoder model (VGG16)


Using the VGG16 pretrained model for feature extraction from the images to obtain the image embeddings

In [6]:
model = VGG16(weights='imagenet',include_top=True,input_shape=(224,224,3))
encoder = Model(inputs=model.input, outputs=model.get_layer('fc2').output)

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels.h5
[1m553467096/553467096[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 0us/step


In [7]:
def extract_features(images,model):
    #images = np.expand_dims(image,axis=0)
    #image = preprocess_input(image)
    features = model.predict(images)
    return features

In [8]:
images = np.array(images)

In [9]:
images.shape


(8091, 224, 224, 3)

In [10]:
batch_size = 32  # Choose a batch size based on your hardware capacity
num_batches = len(images) // batch_size + (len(images) % batch_size != 0)
features=[]
for i in range(num_batches):
  batch_images = images[i * batch_size:(i + 1) * batch_size]
  feat = extract_features(batch_images,encoder)
  features.extend(feat.reshape(feat.shape[0], -1))
features = np.array(features)
print(features.shape)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 13s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24

In [34]:
class LSTM_Decoder(tf.keras.Model):
    def __init__(self , embedd_dim ,vocab_size, lstm_units , img_embedd_bottleneck , img_embedd_size):
        super(RNN_Decoder,self).__init__()
        self.embedd_dim = embedd_dim
        self.vocab_size = vocab_size
        self.lstm_units = lstm_units

        self.embedding = Embedding(vocab_size,embedd_dim)
        self.img_embedd_to_bottleneck = Dense (img_embedd_bottleneck,
                                               input_shape = (None,img_embedd_size),
                                               activation = 'elu')
        self.bottleneck_to_h0 = Dense (lstm_units,
                                       input_shape= (None,img_embedd_bottleneck),
                                       activation = 'elu')
        self.lstm = LSTM(lstm_units,return_sequences=True, return_state=True)
        self.fc = Dense(vocab_size,
                        input_shape = (None,lstm_units),
                        activation = 'softmax')

    def call(self , inputs):
        img_embedd , sentence_sequences = inputs

        img_bottleneck = self.img_embedd_to_bottleneck(img_embedd)

        sentence_embeddings = self.embedding(sentence_sequences)

        c0 = h0 = self.bottleneck_to_h0(img_bottleneck)

        #initial_state = tf.nn.rnn_cell.LSTMStateTuple(c0, h0)
        initial_state = [c0, h0]

        lstm_output,_,_= self.lstm(sentence_embeddings,
                                 initial_state = initial_state)

        output = self.fc(lstm_output)

        return output

In [15]:
print("caption before tokenization:",captions_list[0])
print("caption after tokenization:",captions_tokenized[0].tokens)
print("tokenized ids:",captions_tokenized[0].ids)

caption before tokenization: <s>A brown dog is carrying a toy in the snow </s>
caption after tokenization: ['<s>', 'a', 'brown', 'dog', 'is', 'carrying', 'a', 'toy', 'in', 'the', 'snow', '</s>', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']
tokenized ids: [2, 30, 159, 78, 72, 465, 30, 361, 56, 62, 163, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [18]:
captions_sequenses = [capt.ids for i,capt in enumerate(captions_tokenized) ]
captions_sequenses = np.array(captions_sequenses)
print(captions_sequenses.shape)

(40455, 42)


In [19]:
features = np.repeat(features, repeats=5, axis=0)
print(len(captions_sequenses))
print(len(features))

40455
40455


In [35]:
EMBEDD_DIM = 100
VOCAB_SIZE = len(vocab)
LSTM_UNITS = 256
IMG_EMBEDD_BOTTLENECK = 512
IMG_EMBEDD_SIZE = features.shape[1]


decoder = RNN_Decoder(EMBEDD_DIM,VOCAB_SIZE,LSTM_UNITS,IMG_EMBEDD_BOTTLENECK,IMG_EMBEDD_SIZE)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [21]:
# Split into training and temporary sets (80% train, 20% temp)
train_img, temp_img, train_seq, temp_seq = train_test_split(features, captions_sequenses, test_size=0.2, random_state=42)

# Split temporary set into validation and test (50% each of temp = 10% each of total)
val_img, test_img, val_seq, test_seq = train_test_split(temp_img, temp_seq, test_size=0.5, random_state=42)

In [22]:
def create_dataset(img_features, captions, batch_size=32):
    input_sequences = captions[:, :-1]  # Input sequence (shifted left)
    target_sequences = captions[:, 1:]  # Target sequence (shifted right)

    dataset = tf.data.Dataset.from_tensor_slices(((img_features, input_sequences), target_sequences))
    dataset = dataset.shuffle(buffer_size=1000).batch(batch_size).prefetch(tf.data.AUTOTUNE)
    return dataset

In [23]:
# Batch size
BATCH_SIZE = 32

# Create datasets
train_dataset = create_dataset(train_img, train_seq, batch_size=BATCH_SIZE)
val_dataset = create_dataset(val_img, val_seq, batch_size=BATCH_SIZE)
test_dataset = create_dataset(test_img, test_seq, batch_size=BATCH_SIZE)

In [36]:
decoder.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
    metrics=['accuracy']
)

decoder.fit(
    train_dataset,
    #sentence_sequence=train_seq[:, :-1],
    validation_data=val_dataset,
    epochs=10
)

Epoch 1/10
(None, 41, 100)
(None, 41, 100)
(None, 41, 100)
[1m1012/1012[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - accuracy: 0.7469 - loss: 1.8989(None, 41, 100)
[1m1012/1012[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 21ms/step - accuracy: 0.7470 - loss: 1.8984 - val_accuracy: 0.7902 - val_loss: 1.1790
Epoch 2/10
[1m1012/1012[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 21ms/step - accuracy: 0.7949 - loss: 1.1385 - val_accuracy: 0.8020 - val_loss: 1.0494
Epoch 3/10
[1m1012/1012[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 21ms/step - accuracy: 0.8049 - loss: 1.0200 - val_accuracy: 0.8079 - val_loss: 0.9903
Epoch 4/10
[1m1012/1012[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 21ms/step - accuracy: 0.8109 - loss: 0.9527 - val_accuracy: 0.8116 - val_loss: 0.9563
Epoch 5/10
[1m1012/1012[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 21ms/step - accuracy: 0.8148 - loss: 0.9049 - val_accuracy: 0.8141 - val_loss: 0.

<keras.src.callbacks.history.History at 0x7c02c132c2e0>

In [41]:
def predict_sequence(decoder, img_embedd, max_length, start_token=2):
    # Initialize the sequence with the start token
    sentence_sequence = np.array([[start_token]], dtype=np.int32)

    for _ in range(max_length):
        # Predict next token probabilities
        predictions = decoder([img_embedd, sentence_sequence])

        # Take the token with the highest probability
        next_token = tf.argmax(predictions[:, -1, :], axis=-1)

        # Append the predicted token to the sequence
        sentence_sequence = np.append(sentence_sequence, [[next_token[0]]], axis=1)

        # Stop if end token is generated (if defined, e.g., END_TOKEN = 2)
        if next_token[0] == 3:  # Replace with your end token index
            break

    return sentence_sequence[0]

In [42]:
image = cv2.imread('/content/Football_in_Bloomington,_Indiana,_1995.jpg')
if image is not None:
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    image = cv2.resize(image, (224, 224))
    image = preprocess_input(image)

image_features = extract_features(np.expand_dims(image, axis=0), feat_extractor)
sentence_sequence = predict_sequence(decoder, image_features, max_length=41)
print(tokenizer.decode(sentence_sequence))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
(1, 1, 100)
(1, 2, 100)
(1, 3, 100)
(1, 4, 100)
(1, 5, 100)
(1, 6, 100)
(1, 7, 100)
(1, 8, 100)
(1, 9, 100)
(1, 10, 100)
a man in a white uniform is playing soccer
