<a href="https://colab.research.google.com/github/Elghandour-eng/Arabic_Image_Captioning/blob/main/Image_captioning_decoder%26encoder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **`Image Captioning`**

### 1- Importing Libraries

In [14]:
! pip install tensorflow

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [15]:
import pandas as pd     # for data manipulation and analysis
import numpy as np      # for scientific computing
import matplotlib.pyplot as plt     # for plotting graphs
import seaborn as sns     # for plotting graphs
import tensorflow as tf   #modeling
from tensorflow import keras  # modeling
import os # for managing directories and paths
import re # import regex
from tensorflow.keras.preprocessing.text import Tokenizer # import tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences # import pad_sequences

### 2- Load Data

In [2]:
# Download and unpack the Flickr8k captions and images into the current working
# directory, skipping an archive when its extracted content is already present.
work_dir = os.path.abspath('.')

captions_folders = '/flicker8k_text'
# NOTE(review): a later cell reads 'Flickr8k.token.txt' directly from the working
# directory, so the text archive presumably unpacks loose files rather than a
# folder. The token file is therefore accepted as proof that the captions exist.
captions_token_file = os.path.join(work_dir, 'Flickr8k.token.txt')

if not (os.path.exists(work_dir + captions_folders) or os.path.exists(captions_token_file)):
    # The archive name and the origin URL must refer to the same file.
    caption_zip = tf.keras.utils.get_file('Flickr8k_text.zip',
                                          cache_subdir=work_dir,
                                          origin='https://github.com/jbrownlee/Datasets/releases/download/Flickr8k/Flickr8k_text.zip',
                                          extract=True)
    # Remove the archive once extracted; only when the returned path is a regular file.
    if os.path.isfile(caption_zip):
        os.remove(caption_zip)

# Spelled like the directory used by the later cells ('Flicker8k_Dataset');
# the lookup is case-sensitive on Linux.
image_folders = '/Flicker8k_Dataset'
path = work_dir + image_folders

if not os.path.exists(path):
    image_zip = tf.keras.utils.get_file('Flickr8k_Dataset.zip',
                                        cache_subdir=work_dir,
                                        origin='https://github.com/jbrownlee/Datasets/releases/download/Flickr8k/Flickr8k_Dataset.zip',
                                        extract=True)
    # Remove the archive once extracted; only when the returned path is a regular file.
    if os.path.isfile(image_zip):
        os.remove(image_zip)
    

Downloading data from https://github.com/jbrownlee/Datasets/releases/download/Flickr8k/Flickr8k_Dataset.zip
Downloading data from https://github.com/jbrownlee/Datasets/releases/download/Flickr8k/Flickr8k_text.zip


In [3]:
class LoadData:
    """Readers for the caption token file and the image-name split files."""

    def load_text_doc(self, file_path):
        """Return the full content of the text file at ``file_path`` as one string."""
        with open(file_path, encoding='utf-8') as f:
            return f.read()

    def image_caption_dict(self, text):
        """Group captions by image name.

        Each useful line of ``text`` has the form
        ``<image file>#<caption number>\\t<caption>``. Lines without a tab or
        without a ``#`` in the first field are skipped.

        Args:
            text: Whole content of the caption file.

        Returns:
            dict mapping the image name (file name up to its first ``.``) to
            the list of its captions, in file order.

        Raises:
            ValueError: if the caption number is not an integer.
        """
        caption_mappings = {}

        for line in text.split('\n'):
            # maxsplit=1 keeps a caption intact even if it contains a tab itself.
            line_split = line.split('\t', 1)
            if len(line_split) < 2:
                continue
            image_meta, caption = line_split

            if '#' not in image_meta:
                continue
            # Only the trailing '#<n>' is the caption number.
            raw_image_name, caption_number = image_meta.rsplit('#', 1)
            image_name = raw_image_name.split('.')[0]  # drop the file extension

            if int(caption_number) == 0:
                # Caption #0 starts a fresh list for the image.
                caption_mappings[image_name] = [caption]
            else:
                # setdefault avoids a KeyError when caption #0 is missing or comes later.
                caption_mappings.setdefault(image_name, []).append(caption)

        return caption_mappings

    def train_img_names(self, file_path):
        """Return the image names listed in ``file_path``, one per non-blank line.

        Each name is the line's text up to its first ``.`` (i.e. without the
        file extension).
        """
        with open(file_path, encoding='utf-8') as f:
            lines = f.read().split('\n')

        # Blank (or whitespace-only) lines carry no image name.
        return [line.split('.')[0] for line in lines if line.strip()]

In [5]:
# Locations of the unpacked dataset files.
captions_file_path = '/content/Flickr8k.token.txt'
train_file_path = '/content/Flickr_8k.trainImages.txt'
dataset_dir = '/content/Flicker8k_Dataset'

data_loader = LoadData()

# Read the raw caption file, list the training image names, then group the
# captions by image name.
captions_txt = data_loader.load_text_doc(file_path=captions_file_path)
train_img_names = data_loader.train_img_names(file_path=train_file_path)
image_caption_dict = data_loader.image_caption_dict(text=captions_txt)

### 3- Data Preprocessing

In [6]:
class PreProcessimages :
    """Image loading and InceptionV3 feature extraction for the training images."""

    def load_image(self, path):
        """Read a JPEG file and prepare it as InceptionV3 input.

        Args:
            path: Path of the image file (string tensor or str).

        Returns:
            Tuple ``(image, path)``: the 3-channel image resized to 299x299 and
            passed through ``inception_v3.preprocess_input``, and the unchanged
            ``path``.
        """
        image = tf.io.read_file(path) # read the image from the path
        image = tf.image.decode_jpeg(image, channels=3) # decode the image to a tensor
        image = tf.image.resize(image, (299, 299)) # resize the image to the required size
        image = tf.keras.applications.inception_v3.preprocess_input(image) # preprocess the image

        return image, path

    def apply_inception_v3(self, ds_dir , train_img_names, batch_size=14):
        """Extract InceptionV3 features for every training image and save them.

        For each image ``<ds_dir>/<name>.jpg`` the features of the last layer of
        the headless InceptionV3 model are reshaped to ``(positions, channels)``
        and written with ``np.save`` using the image path as file name (NumPy
        appends ``.npy``).

        Args:
            ds_dir: Directory containing the ``.jpg`` images.
            train_img_names: Iterable of image names without extension.
            batch_size: Number of images pushed through the model at once.
        """
        from tqdm import tqdm # tqdm for progress bar

        # A set removes duplicate paths; sorting gives a stable processing order.
        training_img_paths = sorted({ds_dir + '/' + img_name + '.jpg' for img_name in train_img_names})
        if not training_img_paths:
            return # nothing to encode

        model = tf.keras.applications.InceptionV3(include_top=False, weights='imagenet')
        new_input = model.input # get the input layer
        hidden_layer = model.layers[-1].output # get the output layer
        image_features_extract_model = tf.keras.Model(new_input, hidden_layer) # create a new model

        img_ds = tf.data.Dataset.from_tensor_slices(training_img_paths) # create a dataset from the list of image paths
        img_ds = img_ds.map(self.load_image, num_parallel_calls=tf.data.experimental.AUTOTUNE).batch(batch_size) # load the images in batches

        for img, path in tqdm(img_ds):
            batch_features = image_features_extract_model(img)
            # Flatten the two spatial axes: (batch, h, w, channels) -> (batch, h*w, channels)
            batch_features = tf.reshape(batch_features,
                                        (batch_features.shape[0], -1, batch_features.shape[3]))

            # Save inside the batch loop so that every batch is written to disk,
            # not only the last one.
            for bf, p in zip(batch_features, path):
                path_of_feature = p.numpy().decode("utf-8")
                np.save(path_of_feature, bf.numpy()) # save the features to a file

In [7]:
# Run the InceptionV3 feature extraction over the training images.
img_preprocess = PreProcessimages()
img_preprocess.apply_inception_v3(ds_dir=dataset_dir, train_img_names=train_img_names)

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/inception_v3/inception_v3_weights_tf_dim_ordering_tf_kernels_notop.h5


100%|██████████| 429/429 [01:35<00:00,  4.50it/s]


In [24]:
import re # import regex

class PrepocessCaptions:
    """Caption cleaning, start/end tagging, tokenising and padding helpers."""

    def clean_captions_text(self ,parsed_img_captions_dict):
        """Normalise every caption of ``parsed_img_captions_dict`` in place.

        Captions are lower-cased, every run of characters other than ASCII
        letters and digits becomes a single space, and words that are one
        character long or not purely alphabetic are dropped.

        Args:
            parsed_img_captions_dict: dict mapping an image name to its list of
                captions; the lists are updated in place.
        """
        for captions_text in parsed_img_captions_dict.values():
            for i, caption in enumerate(captions_text):
                # Replace with a space (not '') so that words stay separated;
                # 'A-Z' (not 'A-z') so that [ \ ] ^ _ ` are removed as well.
                caption_nopunct = re.sub(r'[^a-zA-Z0-9]+', ' ', caption.lower())
                clean_words = [word for word in caption_nopunct.split()
                               if len(word) > 1 and word.isalpha()]
                # Store the cleaned caption; otherwise the pass has no effect.
                captions_text[i] = ' '.join(clean_words)

    def add_token(self, captions):
        """Return a new list with every caption wrapped in 'startseq ... endseq'.

        The input list is left untouched, so calling this repeatedly on the
        same source captions never stacks the markers.
        """
        return ['startseq ' + caption + ' endseq' for caption in captions]

    def img_subset_data_dict(self, img_dict, img_names):
        """Return ``{image name: tagged captions}`` for the images in ``img_names``.

        Args:
            img_dict: dict mapping image names to caption lists.
            img_names: Iterable of image names to keep.
        """
        wanted = set(img_names) # constant-time membership test
        data_dict = { img_name : self.add_token(captions) for img_name, captions
                     in img_dict.items() if img_name in wanted}
        return (data_dict)

    def flat_caps(self, data_dict):
        """Return the set of all distinct captions found in ``data_dict``."""
        return { caption for key, captions in data_dict.items() for caption in captions}

    def caps_max_word_len(self, caps):
        """Return the largest whitespace-separated word count among ``caps``."""
        return max(len(caption.split()) for caption in caps)

    def apply_tokenizer(self, data_dict):
        """Fit a Keras ``Tokenizer`` on the captions of ``data_dict``.

        Returns:
            Tuple ``(tokenizer, vocab_size, max_cap_len)`` where ``vocab_size``
            is ``len(tokenizer.word_index) + 1`` and ``max_cap_len`` is the
            longest caption length in words.
        """
        caps = self.flat_caps(data_dict)
        max_cap_len = self.caps_max_word_len(caps)

        tokenizer = Tokenizer()
        # Feed the captions in a fixed order so that the fitted vocabulary does
        # not depend on the iteration order of the set.
        tokenizer.fit_on_texts(sorted(caps))
        vocab_size = len(tokenizer.word_index) + 1

        return (tokenizer , vocab_size , max_cap_len)

    def pad_text(self, parsed_img_caps_dict, max_length):
        """Pad (after the end) one index sequence to ``max_length`` using ``pad_sequences``."""
        paddeed_cap_text = pad_sequences([parsed_img_caps_dict], maxlen=max_length, padding='post')[0]

        return (paddeed_cap_text)

    def data_prep(self, data_dict, img_dir ,tokenizer, max_len, vocab_size):
        """Build parallel lists of image paths and padded caption index sequences.

        Args:
            data_dict: dict mapping image names to caption lists.
            img_dir: Directory containing the ``.jpg`` images.
            tokenizer: Object providing ``texts_to_sequences``.
            max_len: Length every caption sequence is padded to.
            vocab_size: Unused here; kept for interface compatibility.

        Returns:
            Tuple ``(X, y)``: ``X[i]`` is the path ``<img_dir>/<name>.jpg`` and
            ``y[i]`` the padded index sequence of one caption of that image.
        """
        X, y = list(), list()

        for image_name, captions in data_dict.items():
            # Full image path (with separator), one entry per caption.
            image_path = os.path.join(img_dir, image_name + '.jpg')
            for caption in captions:
                word_idxs = tokenizer.texts_to_sequences([caption])[0]
                pad_idxs = self.pad_text(word_idxs, max_len)
                X.append(image_path)
                y.append(pad_idxs)

        return X, y

In [26]:
caps_preprocess = PrepocessCaptions()

# 1) run the caption cleaning pass over all parsed captions
caps_preprocess.clean_captions_text(parsed_img_captions_dict=image_caption_dict)

# 2) keep only the training images, with start/end markers added to their captions
data_dict_train = caps_preprocess.img_subset_data_dict(
    img_dict=image_caption_dict,
    img_names=train_img_names,
)

# 3) fit the tokenizer on the training captions
tokenizer, vocab_size, max_length = caps_preprocess.apply_tokenizer(data_dict=data_dict_train)

# 4) build the parallel training lists
train_X, train_y = caps_preprocess.data_prep(
    data_dict=data_dict_train,
    img_dir=dataset_dir,
    tokenizer=tokenizer,
    max_len=max_length,
    vocab_size=vocab_size,
)

In [30]:
def npy_loader(img_name, captions):
    """Load the saved array belonging to ``img_name`` and pass ``captions`` through.

    Args:
        img_name: UTF-8 encoded bytes; '.npy' is appended to the decoded text
            to obtain the file that is loaded.
        captions: Returned unchanged.

    Returns:
        Tuple ``(array loaded from '<img_name>.npy', captions)``.
    """
    feature_file = img_name.decode('utf-8') + '.npy'
    return np.load(feature_file), captions

def prepare_tf_dataset(Buffer_size, Batch_size, X_train, y_train):
    """Build the ``tf.data`` input pipeline from ``X_train`` and ``y_train``.

    Every ``(X_train[i], y_train[i])`` pair is passed through ``npy_loader``
    via ``tf.numpy_function`` (declared outputs: float32 and int32), then the
    elements are shuffled, batched and prefetched.

    Args:
        Buffer_size: Size of the shuffle buffer.
        Batch_size: Number of elements per batch.
        X_train: First components of the pairs, handed to ``npy_loader`` as ``img_name``.
        y_train: Second components of the pairs, handed to ``npy_loader`` as ``captions``.

    Returns:
        The resulting ``tf.data.Dataset``.
    """
    autotune = tf.data.experimental.AUTOTUNE

    def _load_pair(item1, item2):
        # Run the NumPy loader inside the TensorFlow pipeline.
        return tf.numpy_function(npy_loader, [item1, item2], [tf.float32, tf.int32])

    pairs = tf.data.Dataset.from_tensor_slices((X_train, y_train))
    loaded = pairs.map(_load_pair, num_parallel_calls=autotune)
    batched = loaded.shuffle(Buffer_size).batch(Batch_size)

    return batched.prefetch(buffer_size=autotune)