<a href="https://colab.research.google.com/github/Borg2/Image-captioning/blob/main/RNN_%26_Transfer_Learning_for_Image_Captioning_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Image Captioning

This is an image captioning system using VGG16 (CNN) for feature extraction and a simple Recurrent Neural Network (RNN) for caption generation on the Flickr8k dataset. This system will describe images in natural language, with a focus on using a basic RNN model for the caption generation phase.

## importing the necessary libraries

In [1]:
import kagglehub
import numpy as np
import cv2
import os
import pandas as pd
import matplotlib.pyplot as plt
from keras.applications.vgg16 import VGG16
from tensorflow.keras.applications.vgg16 import preprocess_input
from keras.models import Model
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer
from tokenizers.normalizers import Lowercase
from tokenizers.processors import TemplateProcessing

## dowloading the flickr8k dataset


In [2]:
# Download latest version
path = kagglehub.dataset_download("adityajn105/flickr8k")
print(path)

/root/.cache/kagglehub/datasets/adityajn105/flickr8k/versions/1


In [3]:
images_path = os.path.join(path, "Images")
captions_path = os.path.join(path, "captions.txt")

captions_df = pd.read_csv(captions_path)
print (captions_df.head())

                       image  \
0  1000268201_693b08cb0e.jpg   
1  1000268201_693b08cb0e.jpg   
2  1000268201_693b08cb0e.jpg   
3  1000268201_693b08cb0e.jpg   
4  1000268201_693b08cb0e.jpg   

                                             caption  
0  A child in a pink dress is climbing up a set o...  
1              A girl going into a wooden building .  
2   A little girl climbing into a wooden playhouse .  
3  A little girl climbing the stairs to her playh...  
4  A little girl in a pink dress going into a woo...  


In [4]:
captions = {}
for i, row in captions_df.iterrows():
    if row['image'] not in captions:
        captions[row['image']] = []
    captions[row['image']].append(row['caption'])


In [5]:
images = []
captions_dict = {}
for idx,image_file in enumerate(os.listdir(images_path)):
    image_path = os.path.join(images_path, image_file)
    image = cv2.imread(image_path)
    if image is not None:
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image = cv2.resize(image, (224, 224))
        image = preprocess_input(image)
        images.append(image)
        captions_dict.setdefault(idx, []).extend(captions[image_file])

print(len(images))
print(len(captions))


8091
8091


In [6]:
model = VGG16(weights='imagenet',include_top=True,input_shape=(224,224,3))
feat_extractor = Model(inputs=model.input, outputs=model.get_layer('fc2').output)

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels.h5
[1m553467096/553467096[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 0us/step


In [7]:
def extract_features(images,model):
    #images = np.expand_dims(image,axis=0)
    #image = preprocess_input(image)
    features = model.predict(images)
    return features

In [8]:
images = np.array(images)

In [9]:
images.shape


(8091, 224, 224, 3)

In [10]:
batch_size = 32  # Choose a batch size based on your hardware capacity
num_batches = len(images) // batch_size + (len(images) % batch_size != 0)
features=[]
for i in range(num_batches):
  batch_images = images[i * batch_size:(i + 1) * batch_size]
  feat = extract_features(batch_images,feat_extractor)
  features.extend(feat.reshape(feat.shape[0], -1))
features = np.array(features)
print(features.shape)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 14s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38

In [11]:
#Creating the captions list and adding start and end tokens (<s>,</s>)
captions_list = ['<s>' + caption.replace('.','</s>') for captions_group in captions_dict.values() for caption in captions_group]

tokenizer = Tokenizer(BPE())
tokenizer.normalizer = Lowercase()
tokenizer.pre_tokenizer = Whitespace()
trainer = BpeTrainer(min_frequency=4,special_tokens=["[UNK]", "[PAD]","<s>","</s>"])
tokenizer.train_from_iterator(captions_list, trainer=trainer)
vocab = tokenizer.get_vocab()
print(len(vocab))

5435


In [12]:
print(tokenizer.encode("This is a test @@g,mai hello").ids,tokenizer.encode("This is a Test @@g,mai hello").tokens)

[858, 72, 30, 2625, 36, 11, 333, 38, 4066] ['this', 'is', 'a', 'test', 'g', ',', 'ma', 'i', 'hello']


In [13]:
#enabling padding so all token sequenses are of the same length
tokenizer.enable_padding(pad_id= tokenizer.token_to_id("[PAD]"),pad_token="[PAD]")

In [14]:
captions_tokenized = tokenizer.encode_batch(captions_list)

In [15]:
print("caption before tokenization:",captions_list[0])
print("caption after tokenization:",captions_tokenized[0].tokens)
print("tokenized ids:",captions_tokenized[0].ids)

caption before tokenization: <s>A dog jumps out of a creek </s>
caption after tokenization: ['<s>', 'a', 'dog', 'jumps', 'out', 'of', 'a', 'creek', '</s>', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']
tokenized ids: [2, 30, 78, 277, 216, 85, 30, 1665, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [16]:
import tensorflow as tf
from keras.layers import Dense, LSTM , Embedding
from sklearn.model_selection import train_test_split

In [42]:
class RNN_Decoder(tf.keras.Model):
    def __init__(self , embedd_dim ,vocab_size, lstm_units , img_embedd_bottleneck , img_embedd_size):
        super(RNN_Decoder,self).__init__()
        self.embedd_dim = embedd_dim
        self.vocab_size = vocab_size
        self.lstm_units = lstm_units

        self.embedding = Embedding(vocab_size,embedd_dim)
        self.img_embedd_to_bottleneck = Dense (img_embedd_bottleneck,
                                               input_shape = (None,img_embedd_size),
                                               activation = 'elu')
        self.bottleneck_to_h0 = Dense (lstm_units,
                                       input_shape= (None,img_embedd_bottleneck),
                                       activation = 'elu')
        self.lstm = LSTM(lstm_units)
        self.fc = Dense(vocab_size,
                        input_shape = (None,lstm_units),
                        activation = 'softmax')

    def call(self , inputs):
        img_embedd , sentence_sequences = inputs

        img_bottleneck = self.img_embedd_to_bottleneck(img_embedd)

        sentence_embeddings = self.embedding(sentence_sequences)

        c0 = h0 = self.bottleneck_to_h0(img_bottleneck)

        initial_state = tf.nn.rnn_cell.LSTMStateTuple(c0, h0)

        lstm_output,_= self.lstm(sentence_embeddings,
                                 initial_state = initial_state)

        output = self.fc(lstm_output)

        return output






In [43]:
captions_sequenses = [capt.ids for i,capt in enumerate(captions_tokenized) ]
captions_sequenses = np.array(captions_sequenses)
print(captions_sequenses.shape)

(40455, 42)


In [None]:
features = np.repeat(features, repeats=5, axis=0)
print(len(captions_sequenses))
print(len(features))

In [40]:
EMBEDD_DIM = 100
VOCAB_SIZE = len(vocab)
LSTM_UNITS = 256
IMG_EMBEDD_BOTTLENECK = 512
IMG_EMBEDD_SIZE = features.shape[1]


decoder = RNN_Decoder(EMBEDD_DIM,VOCAB_SIZE,LSTM_UNITS,IMG_EMBEDD_BOTTLENECK,IMG_EMBEDD_SIZE)



  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [22]:
# Split into training and temporary sets (80% train, 20% temp)
train_img, temp_img, train_seq, temp_seq = train_test_split(features, captions_sequenses, test_size=0.2, random_state=42)

# Split temporary set into validation and test (50% each of temp = 10% each of total)
val_img, test_img, val_seq, test_seq = train_test_split(temp_img, temp_seq, test_size=0.5, random_state=42)

In [23]:
def create_dataset(img_features, captions, batch_size=32):
    input_sequences = captions[:, :-1]  # Input sequence (shifted left)
    target_sequences = captions[:, 1:]  # Target sequence (shifted right)

    dataset = tf.data.Dataset.from_tensor_slices(((img_features, input_sequences), target_sequences))
    dataset = dataset.shuffle(buffer_size=1000).batch(batch_size).prefetch(tf.data.AUTOTUNE)
    return dataset

In [24]:
# Batch size
BATCH_SIZE = 32

# Create datasets
train_dataset = create_dataset(train_img, train_seq, batch_size=BATCH_SIZE)
val_dataset = create_dataset(val_img, val_seq, batch_size=BATCH_SIZE)
test_dataset = create_dataset(test_img, test_seq, batch_size=BATCH_SIZE)

In [41]:
decoder.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
    metrics=['accuracy']
)

decoder.fit(
    train_dataset,
    #sentence_sequence=train_seq[:, :-1],
    validation_data=val_dataset,
    epochs=10
)

Epoch 1/10


1. The `call()` method of your layer may be crashing. Try to `__call__()` the layer eagerly on some test input first to see if it works. E.g. `x = np.random.random((3, 4)); y = layer(x)`
2. If the `call()` method is correct, then you may need to implement the `def build(self, input_shape)` method on your layer. It should create all variables used by the layer (e.g. by calling `layer.build()` on all its children layers).
Exception encountered: ''module 'keras._tf_keras.keras.layers' has no attribute 'LSTMStateTuple'''


AttributeError: Exception encountered when calling RNN_Decoder.call().

[1mmodule 'keras._tf_keras.keras.layers' has no attribute 'LSTMStateTuple'[0m

Arguments received by RNN_Decoder.call():
  • inputs=('tf.Tensor(shape=(None, 4096), dtype=float32)', 'tf.Tensor(shape=(None, 41), dtype=int64)')