#New model using InceptionV3



#Import Libraries

In [None]:
!pip install tensorflow==2.12.0
!pip install numpy
!pip install matplotlib
!pip install pillow
!pip install tqdm

import os
import string
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout, add
from tensorflow.keras.applications.inception_v3 import InceptionV3, preprocess_input
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.utils import to_categorical, plot_model
from tqdm import tqdm
import matplotlib.pyplot as plt
from PIL import Image




#Download Dataset

In [None]:
# Flickr8k Dataset (small, manageable)
!wget -q https://github.com/jbrownlee/Datasets/releases/download/Flickr8k/Flickr8k_Dataset.zip
!unzip -q Flickr8k_Dataset.zip -d images/

!wget -q https://github.com/jbrownlee/Datasets/releases/download/Flickr8k/Flickr8k_text.zip
!unzip -q Flickr8k_text.zip


#Load and Clean Captions

In [None]:
def load_doc(filename):
    """Read a whole text file and return its contents as a single string.

    Args:
        filename: Path of the text file to read.

    Returns:
        The complete contents of the file as ``str``.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default text encoding.
    with open(filename, 'r', encoding='utf-8') as file:
        return file.read()

def load_descriptions(doc):
    """Group caption lines by image id.

    Each usable line consists of whitespace-separated tokens: the first token
    names the image (everything from its first '.' onward is dropped to form
    the id) and the remaining tokens are the caption words.

    Args:
        doc: Full text of the caption file, one caption per line.

    Returns:
        dict mapping image id -> list of caption strings, in file order.
        Lines that are blank or carry no caption words are skipped, so an
        empty document yields an empty dict.
    """
    descriptions = dict()
    for line in doc.split('\n'):
        tokens = line.split()
        # A usable line needs the image token plus at least one caption word;
        # blank or truncated lines would otherwise raise IndexError or add
        # empty captions.
        if len(tokens) < 2:
            continue
        img_id = tokens[0].split('.')[0]
        img_desc = ' '.join(tokens[1:])
        descriptions.setdefault(img_id, []).append(img_desc)
    return descriptions

def clean_descriptions(descriptions):
    """Normalize every caption in place and wrap it in sequence markers.

    Each caption is lower-cased, stripped of ASCII punctuation, and reduced to
    its purely alphabetic words longer than one character. The result is
    stored back as 'startseq <words> endseq'.

    Args:
        descriptions: dict mapping image id -> list of caption strings.
            The lists are modified in place.

    Returns:
        None.
    """
    strip_punct = str.maketrans('', '', string.punctuation)
    for captions in descriptions.values():
        for idx, caption in enumerate(captions):
            words = caption.lower().translate(strip_punct).split()
            # Drop one-letter words and anything containing digits/symbols
            kept = [w for w in words if w.isalpha() and len(w) > 1]
            captions[idx] = f"startseq {' '.join(kept)} endseq"

# Read the raw caption file, group the captions per image, then normalize
# them in place.
filename = 'Flickr8k.token.txt'
doc = load_doc(filename)
descriptions = load_descriptions(doc)
clean_descriptions(descriptions)

# Show the cleaned captions of one known image as a sanity check
sample_id = '1000268201_693b08cb0e'
print(f"Sample descriptions for image {sample_id}:")
print(descriptions[sample_id])


Sample descriptions for image 1000268201_693b08cb0e:
['startseq child in pink dress is climbing up set of stairs in an entry way endseq', 'startseq girl going into wooden building endseq', 'startseq little girl climbing into wooden playhouse endseq', 'startseq little girl climbing the stairs to her playhouse endseq', 'startseq little girl in pink dress going into wooden cabin endseq']


#Extract Image Features Using Pretrained InceptionV3

In [None]:
def extract_features(directory, batch_size=32):
    """Compute an InceptionV3 feature vector for every image in a directory.

    The ImageNet-pretrained InceptionV3 is truncated at its second-to-last
    layer, and that layer's output is stored per image.

    Args:
        directory: Folder containing the image files.
        batch_size: Number of images sent through the network per predict
            call. Defaults to 32.

    Returns:
        dict mapping image id (file name up to its first '.') -> feature
        vector (one row of the truncated model's output).
    """
    model = InceptionV3(weights='imagenet')
    model = Model(model.input, model.layers[-2].output)  # remove last layer

    # Only regular, non-hidden image files: stray entries such as
    # sub-directories or OS metadata files would make load_img fail.
    # Sorting makes the processing order reproducible.
    image_exts = ('.jpg', '.jpeg', '.png')
    img_names = sorted(
        name for name in os.listdir(directory)
        if not name.startswith('.')
        and name.lower().endswith(image_exts)
        and os.path.isfile(os.path.join(directory, name))
    )

    features = dict()
    # Model.predict is designed for batches; calling it once per image inside
    # a loop is slow, so images are stacked and predicted batch by batch.
    for start in tqdm(range(0, len(img_names), batch_size)):
        batch_names = img_names[start:start + batch_size]
        batch = np.stack([
            img_to_array(load_img(os.path.join(directory, name), target_size=(299, 299)))
            for name in batch_names
        ])
        batch_features = model.predict(preprocess_input(batch), verbose=0)
        for name, feature in zip(batch_names, batch_features):
            features[name.split('.')[0]] = feature
    return features

# Run the extractor over the unzipped Flickr8k image folder
image_dir = 'images/Flicker8k_Dataset'
features = extract_features(image_dir)
print(f"Extracted features for {len(features)} images")


100%|██████████| 8091/8091 [52:23<00:00,  2.57it/s]

Extracted features for 8091 images





#Prepare Tokenizer and Sequences

In [None]:
# Collect every cleaned caption of every image into one flat list
all_captions = [caption for captions in descriptions.values() for caption in captions]

# Fit the word -> integer id mapping on the full caption corpus
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_captions)
# One more than the number of distinct words, so the largest word id is a valid class index
vocab_size = 1 + len(tokenizer.word_index)
print(f"Vocabulary Size: {vocab_size}")

# Longest caption, measured in whitespace-separated words
max_length = max(map(lambda caption: len(caption.split()), all_captions))
print(f"Maximum caption length: {max_length}")

def create_sequences(tokenizer, max_length, desc_list, photo_feature, vocab_size):
    """Expand one image's captions into (image, caption prefix, next word) samples.

    Every caption is encoded to integer ids. For each position i >= 1 the ids
    before i become a padded input sequence and the id at i becomes a one-hot
    target.

    Args:
        tokenizer: Fitted tokenizer providing ``texts_to_sequences``.
        max_length: Length every input sequence is padded to.
        desc_list: Captions belonging to a single image.
        photo_feature: Feature vector of that image, repeated for each sample.
        vocab_size: Number of classes of the one-hot target.

    Returns:
        Tuple of three numpy arrays: image features, padded prefixes, and
        one-hot next-word targets, aligned sample by sample.
    """
    image_inputs, text_inputs, targets = [], [], []
    for caption in desc_list:
        token_ids = tokenizer.texts_to_sequences([caption])[0]
        for split_at in range(1, len(token_ids)):
            prefix = pad_sequences([token_ids[:split_at]], maxlen=max_length)[0]
            next_word = to_categorical([token_ids[split_at]], num_classes=vocab_size)[0]
            image_inputs.append(photo_feature)
            text_inputs.append(prefix)
            targets.append(next_word)
    return np.array(image_inputs), np.array(text_inputs), np.array(targets)


Vocabulary Size: 8766
Maximum caption length: 34


#Data Generator

In [None]:
def data_generator(descriptions, features, tokenizer, max_length, vocab_size, batch_size):
    """Yield training batches forever, ``batch_size`` images at a time.

    For each group of images, every caption is expanded with
    ``create_sequences`` and the resulting samples are concatenated, so one
    batch holds all samples of up to ``batch_size`` images.

    Args:
        descriptions: dict mapping image id -> list of captions.
        features: dict mapping image id -> image feature vector. Must contain
            every key of ``descriptions``.
        tokenizer: Fitted tokenizer, passed through to ``create_sequences``.
        max_length: Padded length of the caption input sequences.
        vocab_size: Number of classes of the one-hot targets.
        batch_size: Number of images (not samples) per yielded batch.

    Yields:
        ``([image_features, caption_prefixes], next_word_targets)`` as numpy
        arrays.

    Raises:
        ValueError: On the first ``next()`` call, if ``descriptions`` is empty
            or ``batch_size`` is smaller than 1.
        KeyError: If an image id is missing from ``features``.
    """
    keys = list(descriptions.keys())
    n = len(keys)
    # Without these guards the endless loop below would spin without ever
    # yielding (empty data or a negative step), hanging the consumer.
    if n == 0:
        raise ValueError("descriptions is empty; there is nothing to generate batches from")
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")
    while True:
        for i in range(0, n, batch_size):
            X1, X2, y = [], [], []
            for key in keys[i:i + batch_size]:
                in_img, in_seq, out_word = create_sequences(
                    tokenizer, max_length, descriptions[key], features[key], vocab_size
                )
                X1.extend(in_img)
                X2.extend(in_seq)
                y.extend(out_word)
            yield [np.array(X1), np.array(X2)], np.array(y)


#Define Model

In [None]:
def define_model(vocab_size, max_length):
    """Build and compile the two-input captioning network.

    One branch projects a 2048-wide image feature vector to 256 units. The
    other embeds a caption prefix of ``max_length`` token ids and runs it
    through an LSTM. The two 256-wide outputs are summed and decoded into a
    softmax over the vocabulary.

    Args:
        vocab_size: Size of the embedding table and of the softmax output.
        max_length: Length of the caption input sequence.

    Returns:
        A compiled Keras ``Model`` (categorical cross-entropy loss, Adam).
    """
    # Image branch: dropout, then a dense projection to 256 units
    image_input = Input(shape=(2048,))
    image_dropped = Dropout(0.5)(image_input)
    image_dense = Dense(256, activation='relu')(image_dropped)

    # Caption branch: masked embedding -> dropout -> LSTM
    caption_input = Input(shape=(max_length,))
    caption_embedded = Embedding(vocab_size, 256, mask_zero=True)(caption_input)
    caption_dropped = Dropout(0.5)(caption_embedded)
    caption_state = LSTM(256)(caption_dropped)

    # Decoder: element-wise sum of both branches, then two dense layers
    merged = add([image_dense, caption_state])
    hidden = Dense(256, activation='relu')(merged)
    word_probs = Dense(vocab_size, activation='softmax')(hidden)

    captioner = Model(inputs=[image_input, caption_input], outputs=word_probs)
    captioner.compile(loss='categorical_crossentropy', optimizer='adam')
    return captioner

# Build the captioning network from the vocabulary size and maximum caption
# length computed in the tokenizer cell.
model = define_model(vocab_size, max_length)
# Print the layer-by-layer summary of the assembled model
model.summary()


#Train Model

In [None]:
batch_size = 64

# Filter descriptions to only include keys present in features
filtered_descriptions = {key: descriptions[key] for key in features.keys() if key in descriptions}
if not filtered_descriptions:
    raise ValueError("No image has both extracted features and captions; nothing to train on")

# Derive the step count from the dict the generator actually iterates over,
# rounding up so the final partial batch is included and one epoch is exactly
# one pass over the data.
steps = (len(filtered_descriptions) + batch_size - 1) // batch_size

generator = data_generator(filtered_descriptions, features, tokenizer, max_length, vocab_size, batch_size)
history = model.fit(generator, epochs=20, steps_per_epoch=steps, verbose=1)

NameError: name 'model' is not defined

#Save Model

In [None]:
# Write the model to 'image_caption_generator.h5' in the working directory
model.save('image_caption_generator.h5')


#Caption Generation Function

In [None]:
def word_for_id(integer, tokenizer):
    """Return the word that the tokenizer assigned to an integer id.

    Args:
        integer: Word id to look up.
        tokenizer: Object exposing ``word_index`` (word -> id) and, optionally,
            the reverse mapping ``index_word`` (id -> word).

    Returns:
        The matching word, or None when no word has that id.
    """
    # Prefer the reverse mapping: a direct lookup instead of scanning the
    # whole vocabulary for every generated word.
    index_word = getattr(tokenizer, 'index_word', None)
    if index_word:
        return index_word.get(integer)
    # Fallback for tokenizer-like objects that only provide word_index
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None

def generate_caption(model, tokenizer, photo_feature, max_length):
    """Greedily generate a caption for one image feature vector.

    Starting from 'startseq', the text so far is encoded and padded, the model
    predicts a distribution over the vocabulary, and the highest-scoring word
    is appended. Generation stops at 'endseq', at an id with no word, or after
    ``max_length`` words.

    Args:
        model: Captioning model taking ``[image_features, caption_prefix]``.
        tokenizer: Fitted tokenizer used for encoding and id -> word lookup.
        photo_feature: Feature vector of the image (numpy array).
        max_length: Padded sequence length and maximum number of words to add.

    Returns:
        The generated text, including the leading 'startseq' and, when it was
        produced, the trailing 'endseq'.
    """
    # Loop-invariant: shape the feature vector as a batch of one, once.
    # -1 lets the width follow the vector instead of hard-coding 2048.
    photo_batch = photo_feature.reshape(1, -1)
    in_text = 'startseq'
    for _ in range(max_length):
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], maxlen=max_length)
        yhat = model.predict([photo_batch, sequence], verbose=0)
        word = word_for_id(np.argmax(yhat), tokenizer)
        if word is None:
            break
        in_text += ' ' + word
        if word == 'endseq':
            break
    return in_text


#Test on a Sample Image

In [None]:
# Caption one image that has extracted features: the first key of `features`
sample_img_id = list(features)[0]
sample_feature = features[sample_img_id]

caption = generate_caption(model, tokenizer, sample_feature, max_length)
print("Generated Caption:", caption)


In [None]:
import pickle

# Serialize the {image id: feature vector} dict so it can be reloaded later
# without re-running the feature extraction.
with open('features.pkl', 'wb') as features_file:
    pickle.dump(features, features_file)

print("Image features saved to features.pkl")