<a href="https://colab.research.google.com/github/AswinTony2001/FinalYearProject/blob/main/Caption_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
from pathlib import Path
import numpy as np

from PIL import Image
import tensorflow as tf
from tensorflow.keras.applications.inception_v3 import preprocess_input
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.inception_v3 import InceptionV3
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
def get_pretrained_inceptionV3():
    """Build an ImageNet-weighted InceptionV3 truncated at its second-to-last layer.

    Returns:
        A Keras ``Model`` that takes the same input as the full InceptionV3
        network and outputs the activations of ``layers[-2]`` instead of the
        final layer's output.
    """
    full_network = InceptionV3(weights='imagenet')
    penultimate_output = full_network.layers[-2].output
    return Model(full_network.input, penultimate_output)

**Import Libraries**

In [None]:
#@title import libraries
from PIL import Image
    
import json
import os, shutil
import random
import os
from pathlib import Path
import numpy as np
import tensorflow as tf
from tensorflow.keras.applications.inception_v3 import preprocess_input
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.inception_v3 import InceptionV3
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
! git clone https://huggingface.co/spaces/NicolasVana/image-captioning.git

Cloning into 'image-captioning'...
remote: Enumerating objects: 270, done.[K
remote: Counting objects: 100% (155/155), done.[K
remote: Compressing objects: 100% (138/138), done.[K
remote: Total 270 (delta 13), reused 155 (delta 13), pack-reused 115[K
Receiving objects: 100% (270/270), 12.71 MiB | 12.18 MiB/s, done.
Resolving deltas: 100% (14/14), done.
Filtering content: 100% (9/9), 319.66 MiB | 19.69 MiB/s, done.


**Model Definitions**

In [None]:

# Top-level directory created by cloning the `image-captioning` repository.
root = Path('image-captioning')

# Directories holding the .npy auxiliary files of each captioning variant.
aux_pre = root.joinpath('Inception', 'PretrainedInceptionLSTM')
aux_re = root.joinpath('Inception', 'RetrainedInceptionLSTM')

# Saved-model directories; the two LSTM models live next to their auxiliary files.
model_pre_path = aux_pre / 'Model'
model_re_path = aux_re / 'Model'
model_inception_path = root.joinpath('Inception', 'RetrainedInceptionFeatureExtraction', 'Model')



# Must create

def get_pretrained_inceptionV3():
    """Return InceptionV3 (ImageNet weights) with its last layer cut off.

    Note: a function with this exact name and body is also defined in an
    earlier cell of this notebook; running this cell simply rebinds the name.

    Returns:
        A Keras ``Model`` mapping the InceptionV3 input to the output of the
        network's second-to-last layer.
    """
    base = InceptionV3(weights='imagenet')
    # layers[-2] is the layer directly before the network's final layer.
    truncated = Model(base.input, base.layers[-2].output)
    return truncated

def fetch_auxiliary_files(type):
    """Load the auxiliary objects stored next to a captioning model.

    Args:
        type: Model variant, either ``'Pretrained Inception'`` or
            ``'Retrained Inception'``.

    Returns:
        tuple: ``(word2Index, index2Word, variable_params)`` -- the objects
        stored in ``word2Index.npy``, ``index2Word.npy`` and
        ``variable_params.npy`` inside the variant's auxiliary directory.

    Raises:
        ValueError: If ``type`` is not one of the two supported variants
            (previously the function silently returned ``None``, which only
            failed later when the caller unpacked the result).
    """
    if type == 'Pretrained Inception':
        aux_dir = aux_pre
    elif type == 'Retrained Inception':
        aux_dir = aux_re
    else:
        raise ValueError(
            f"Unknown model type {type!r}; expected 'Pretrained Inception' or 'Retrained Inception'"
        )

    def _load(file_name):
        # Each file holds a single pickled Python object wrapped in a 0-d array,
        # hence allow_pickle=True and .item().
        # NOTE(review): allow_pickle=True executes pickled code on load -- only
        # use with files from a trusted source.
        return np.load(aux_dir / file_name, allow_pickle=True).item()

    word2Index = _load("word2Index.npy")
    index2Word = _load("index2Word.npy")
    variable_params = _load("variable_params.npy")
    return word2Index, index2Word, variable_params

def fetch_model(type):
    """Load the image model and the captioning model for a variant.

    Args:
        type: Model variant, either ``'Pretrained Inception'`` or
            ``'Retrained Inception'``.

    Returns:
        tuple: ``(model_inc, caption_model)``. For the pretrained variant
        ``model_inc`` comes from ``get_pretrained_inceptionV3()``; for the
        retrained variant it is loaded from ``model_inception_path``. The
        captioning model is loaded from ``model_pre_path`` / ``model_re_path``.

    Raises:
        ValueError: If ``type`` is not one of the two supported variants
            (previously the function silently returned ``None``, which only
            failed later when the caller unpacked the result).
    """
    if type == 'Pretrained Inception':
        caption_model = tf.keras.models.load_model(model_pre_path)
        model_inc = get_pretrained_inceptionV3()
        return model_inc, caption_model
    if type == 'Retrained Inception':
        caption_model = tf.keras.models.load_model(model_re_path)
        model_inc = tf.keras.models.load_model(model_inception_path)
        return model_inc, caption_model
    raise ValueError(
        f"Unknown model type {type!r}; expected 'Pretrained Inception' or 'Retrained Inception'"
    )

def preprocess_image_inception(image, target_size=(299, 299)):
    """Turn a PIL image into a single-image batch for the Inception model.

    Args:
        image: PIL image. Converted to RGB if it is in another mode.
        target_size: ``(width, height)`` the image is resized to when it does
            not already have that size. Defaults to ``(299, 299)``, the size
            this function previously required the caller to supply.

    Returns:
        numpy.ndarray: Array of shape ``(1, height, width, 3)`` after the
        InceptionV3 ``preprocess_input`` transform has been applied.
    """
    if image.mode != "RGB":
        image = image.convert(mode="RGB")

    # Resize here instead of failing on a reshape when the caller passes an
    # image of a different size; already-resized images pass through untouched.
    target_size = tuple(target_size)
    if image.size != target_size:
        image = image.resize(size=target_size)

    x = np.expand_dims(np.array(image), axis=0)  # add the batch dimension
    return preprocess_input(x)

def extract_features(model, image):
    """Run ``model.predict`` on ``image`` with progress output disabled.

    Args:
        model: Object exposing a Keras-style ``predict(x, verbose=...)`` method.
        image: Input passed straight through to ``predict``.

    Returns:
        Whatever ``model.predict`` returns for ``image``.
    """
    return model.predict(image, verbose=0)

def generate_caption(model, features, max_len, word2Index, index2Word, beam_index = 3):
    """Produce a caption for ``features`` by delegating to ``beam_search``.

    All arguments are forwarded unchanged; ``beam_index`` (default 3) is the
    beam width handed to the search.

    Returns:
        The caption string returned by ``beam_search``.
    """
    return beam_search(model, features, max_len, word2Index, index2Word, beam_index)

def beam_search(model, features, max_len, word2Index, index2Word, beam_index):
    """Decode a caption with beam search.

    Args:
        model: Captioning model. ``model.predict([features, padded_seqs],
            verbose=0)`` must return one row of next-token scores per sequence.
        features: Image feature array; it is tiled ``beam_index`` times along
            axis 0 so each live hypothesis gets a row (assumes a single-row
            input -- TODO confirm against the feature extractor).
        max_len: Length sequences are padded to. A hypothesis that reaches this
            length is considered finished.
        word2Index: Token -> id mapping; must contain ``"startseq"``.
        index2Word: Id -> token mapping; the token ``"endseq"`` ends a hypothesis.
        beam_index: Beam width (number of hypotheses kept alive).

    Returns:
        str: Tokens of the highest-scoring finished hypothesis joined by
        spaces, without the leading start token and without ``endseq``.
        Empty string if no hypothesis finished.

    Raises:
        ValueError: If ``beam_index`` is smaller than 1.
    """
    if beam_index < 1:
        raise ValueError(f"beam_index must be >= 1, got {beam_index}")

    # Each hypothesis is a (token_ids, probability) pair.
    live = [([word2Index["startseq"]], 1.0)]
    finished = []
    live_seqs = beam_index  # remaining beam slots; shrinks as hypotheses finish
    features = np.tile(features, (beam_index, 1))

    # `live_seqs > 0` also guards the slices below: `[-0:]` would select everything.
    while live and live_seqs > 0:
        # One padded row per live hypothesis, shape (len(live), max_len).
        padded_seqs = pad_sequences([seq for seq, _ in live], maxlen=max_len, padding='post')
        preds = model.predict([features[:len(live)], padded_seqs], verbose=0)

        # Expand every live hypothesis with its `live_seqs` most likely next tokens.
        candidates = []
        for (seq, prob), pred in zip(live, preds):
            for w in np.argsort(pred)[-live_seqs:]:
                # float(): accumulate in double precision so long products of
                # float32 probabilities underflow far later.
                candidates.append((seq + [w], prob * float(pred[w])))

        # Keep the overall best `live_seqs` candidates (ascending sort, take the tail).
        candidates.sort(key=lambda cand: cand[1])
        candidates = candidates[-live_seqs:]

        # Move finished candidates out of the beam. The finished hypothesis itself
        # is removed (not whichever entry happens to be last), and a candidate that
        # both ends in `endseq` and reaches max_len is recorded only once.
        live = []
        for seq, prob in candidates:
            if index2Word[seq[-1]] == 'endseq':
                finished.append((seq[:-1], prob))
                live_seqs -= 1
            elif len(seq) >= max_len:
                finished.append((seq, prob))
                live_seqs -= 1
            else:
                live.append((seq, prob))

    if not finished:
        return ''

    # max() picks a winner even when every probability is 0, where the previous
    # `> 0` comparison left the best index undefined.
    best_seq, _ = max(finished, key=lambda hyp: hyp[1])

    # Convert to readable text, dropping the leading start token.
    final_caption = [index2Word[i] for i in best_seq]
    return ' '.join(final_caption[1:])

# # create target model directory
# model_dir = './models/'
# os.makedirs(model_dir, exist_ok=True)
#
# files_to_download = [
#     "config.json",
#     "flax_model.msgpack",
#     "merges.txt",
#     "special_tokens_map.json",
#     "tokenizer.json",
#     "tokenizer_config.json",
#     "vocab.json",
#     "preprocessor_config.json",
# ]

def _compile():
    """Open one bundled sample image and close it again.

    The prediction call that used to sit between the two steps is commented
    out, so this currently only checks that the sample file can be opened.
    """
    sample = Image.open('image-captioning/samples/ROCO_00929.jpg')
    # predict(sample)  -- disabled
    sample.close()


_compile()


sample_dir = 'image-captioning/samples/'

# Numeric ids parsed from the ROCO_*.jpg file names in the sample directory,
# preceded by the literal "None" entry.
sample_image_ids = ("None",) + tuple(
    int(file_name.replace('ROCO_', '').replace('.jpg', ''))
    for file_name in os.listdir(sample_dir)
    if file_name.startswith('ROCO_')
)

# Image ids shipped with the samples as a JSON file.
roco_ids_file = os.path.join(sample_dir, "Roco-img-ids.json")
with open(roco_ids_file, "r", encoding="UTF-8") as fp:
    roco_image_ids = json.load(fp)


def get_random_image_id():
    """Return one entry drawn at random from ``roco_image_ids``."""
    (picked,) = random.sample(roco_image_ids, k=1)
    return picked

In [None]:
# Which captioning pipeline to load.
model_type = 'Pretrained Inception' # 'Retrained Inception'

inception, lstm = fetch_model(model_type)
word2Index, index2Word, variable_params = fetch_auxiliary_files(model_type)
max_len = variable_params['max_caption_len']

# Caption one bundled sample image end to end.
sample_name = "ROCO_00001.jpg"
# Relative to the working directory, like every other path into the cloned
# repository in this notebook (the models are loaded from the relative `root`
# too), instead of the Colab-only absolute "/content/..." location.
sample_dir = "image-captioning/samples/"
sample_path = os.path.join(sample_dir, sample_name)

width, height = 299, 299
# The context manager closes the file handle once the resized copy exists.
with Image.open(sample_path) as image:
    resized = image.resize(size=(width, height))

preprocessed_img = preprocess_image_inception(resized)
features = extract_features(inception, preprocessed_img)
caption = generate_caption(lstm, features, max_len, word2Index, index2Word)



Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/inception_v3/inception_v3_weights_tf_dim_ordering_tf_kernels.h5


In [None]:
# Print the layer-by-layer summary of the image model returned by fetch_model.
inception.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 299, 299, 3  0           []                               
                                )]                                                                
                                                                                                  
 conv2d (Conv2D)                (None, 149, 149, 32  864         ['input_1[0][0]']                
                                )                                                                 
                                                                                                  
 batch_normalization (BatchNorm  (None, 149, 149, 32  96         ['conv2d[0][0]']                 
 alization)                     )                                                             

In [None]:
# (Stray expression `a` removed: the name is not defined in any cell of this
# notebook, so evaluating it raised NameError and aborted "Run All".)

In [None]:
# Print the layer-by-layer summary of the captioning model returned by fetch_model.
lstm.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 captions (InputLayer)          [(None, 341)]        0           []                               
                                                                                                  
 features (InputLayer)          [(None, 2048)]       0           []                               
                                                                                                  
 embedding (Embedding)          (None, 341, 300)     1341300     ['captions[0][0]']               
                                                                                                  
 dropout (Dropout)              (None, 2048)         0           ['features[0][0]']               
                                                                                            

In [None]:
def upload_files():
    """Upload files through the Colab widget and write them to the working directory.

    Returns:
        list: Names of the uploaded files (the keys of the mapping returned
        by ``files.upload()``).
    """
    # Imported here because google.colab is only needed (and only available)
    # when this helper is actually used in a Colab runtime.
    from google.colab import files

    uploaded = files.upload()
    for name, content in uploaded.items():
        # `with` guarantees the handle is flushed and closed; the previous
        # open(...).write(...) left that to garbage collection.
        with open(name, 'wb') as out_file:
            out_file.write(content)
    return list(uploaded.keys())
# Run the upload dialog and keep the list of uploaded file names.
files = upload_files()

Saving ImageCLEFmedCaption_2022_train_065404.jpg to ImageCLEFmedCaption_2022_train_065404.jpg


In [None]:
def predict(img_path):
    """Generate a caption for the image stored at ``img_path``.

    Relies on the notebook-level ``inception``, ``lstm``, ``max_len``,
    ``word2Index`` and ``index2Word`` objects set up in an earlier cell.

    Args:
        img_path: Path of the image file to caption.

    Returns:
        The caption returned by ``generate_caption``.
    """
    width, height = 299, 299
    # Close the file handle as soon as the resized copy exists; it was
    # previously left open for every predicted image.
    with Image.open(img_path) as image:
        resized = image.resize(size=(width, height))

    preprocessed_img = preprocess_image_inception(resized)
    features = extract_features(inception, preprocessed_img)
    caption = generate_caption(lstm, features, max_len, word2Index, index2Word)
    return caption

In [None]:
# Reference caption for the uploaded image, followed by the model's caption.
ref = 'brain and carotid artery magnetic resonance angiography show that the main vessel be normal'
pred = predict('ImageCLEFmedCaption_2022_train_065404.jpg')

print('Original Caption --> ', ref)
print('Predicted Caption -->', pred)

Original Caption -->  brain and carotid artery magnetic resonance angiography show that the main vessel be normal
Predicted Caption -->  computed tomography scan of the abdomen and pelvis showing a large mass in the right kidney


In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.translate.bleu_score import SmoothingFunction
def calculateBLEU(Actual_Answer,Generated_Answer):
    """Average the per-caption BLEU score over aligned caption lists.

    Args:
        Actual_Answer: Sequence of ground-truth captions. A caption may contain
            several acceptable alternatives separated by ``#``; the alternative
            with the highest score is the one that counts.
        Generated_Answer: Sequence of candidate captions; element ``i`` is
            scored against ``Actual_Answer[i]``.

    Returns:
        float: Mean of the per-caption scores, rounded to 4 decimals.
        ``0.0`` when ``Actual_Answer`` is empty (this used to raise
        ``ZeroDivisionError``).
    """
    caption_count = len(Actual_Answer)
    if caption_count == 0:
        return 0.0

    whitespace = re.compile(r'\s+')

    # Make sure the NLTK resources are present, downloading them on first use.
    try:
        nltk.data.find('tokenizers/punkt')
        stops = set(stopwords.words("english"))
    except LookupError:
        nltk.download('punkt')
        nltk.download('stopwords')
        stops = set(stopwords.words("english"))

    stemmer = SnowballStemmer("english")

    # Maps the punctuation characters , - ( ) to spaces.
    translator = str.maketrans(',-()', '    ')

    total_score = 0.0
    for i in range(caption_count):
        candidate_caption = Generated_Answer[i].strip()
        gt_caption = re.sub(whitespace, ' ', Actual_Answer[i]).strip()

        # split('#') yields the caption itself when there is no '#', so one code
        # path handles both single and multi-reference ground truths.
        total_score += max(
            calc_single_blue_score(candidate_caption, alternative, translator, stops, stemmer)
            for alternative in gt_caption.split("#")
        )

    return round(total_score / caption_count, 4)

def calc_single_blue_score(candidate_caption, gt_caption, translator, stops, stemmer):
    """Score one candidate caption against one ground-truth caption.

    Both captions are passed through ``translator``, tokenized with NLTK and
    stemmed before ``sentence_bleu`` is called with weights ``(1, 0, 0, 0)``.

    Args:
        candidate_caption: Generated caption text.
        gt_caption: Ground-truth caption text.
        translator: ``str.translate`` table applied to both captions first.
        stops: Stop-word set; accepted for interface compatibility but not
            used by this function.
        stemmer: Object whose ``stem(word)`` method is applied to every token.

    Returns:
        ``1`` when both captions are empty after tokenization, otherwise the
        value returned by ``sentence_bleu``; ``0.0`` if that computation raises
        ``ZeroDivisionError``.
    """
    # Split captions into individual words (punctuation replaced via translator), then stem.
    candidate_words = [
        stemmer.stem(word)
        for word in nltk.tokenize.word_tokenize(candidate_caption.translate(translator))
    ]
    gt_words = [
        stemmer.stem(word)
        for word in nltk.tokenize.word_tokenize(gt_caption.translate(translator))
    ]

    # Two empty captions count as a perfect match.
    if len(gt_words) == 0 and len(candidate_words) == 0:
        return 1

    try:
        return nltk.translate.bleu_score.sentence_bleu(
            [gt_words], candidate_words,
            smoothing_function=SmoothingFunction().method0, weights=(1, 0, 0, 0))
    except ZeroDivisionError:
        # Previously this path fell through to `return bleu_score` with the
        # variable unassigned (UnboundLocalError); score such pairs as 0.
        return 0.0

In [None]:
# Score the single reference/prediction pair from the cell above.
print('BLEU SCORE --> ',calculateBLEU([ref],[pred]))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


BLEU SCORE -->  0.1875


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
