%pip install tensorflow keras matplotlib nltk


https://www.kaggle.com/datasets/adityajn105/flickr8k
Dataset: Flickr8k (images with text captions), downloaded from the Kaggle link above.

In [None]:
from tensorflow.keras.applications.inception_v3 import InceptionV3, preprocess_input
from tensorflow.keras.preprocessing import image
from tensorflow.keras.models import Model
import numpy as np
import os

# Full InceptionV3 network with ImageNet weights.
model = InceptionV3(weights="imagenet")

# Feature extractor: same input as the full network, but the output is taken
# from the second-to-last layer instead of the final layer.
cnn_model = Model(
    inputs=model.input,
    outputs=model.layers[-2].output,
)

In [None]:
def extract_features(directory, extensions=('.jpg', '.jpeg', '.png', '.bmp', '.gif')):
    """Run every image file in ``directory`` through ``cnn_model``.

    Each image is loaded at 299x299, converted to an array, given a leading
    batch axis, passed through ``preprocess_input`` and then fed to
    ``cnn_model.predict``.

    Args:
        directory: Path of the folder to scan. Only its direct entries are
            considered; subfolders are not descended into.
        extensions: Filename suffixes (compared case-insensitively) that are
            treated as images. Entries with any other suffix, and entries
            that are not regular files, are skipped instead of being handed
            to the image loader.

    Returns:
        dict mapping each processed file name (not the full path) to the
        array returned by ``cnn_model.predict`` for that single-image batch.
        The batch axis added before prediction is kept in the stored array.
    """
    suffixes = tuple(ext.lower() for ext in extensions)
    features = {}
    # os.listdir returns entries in arbitrary order; sorting makes the
    # processing (and dict insertion) order the same on every run.
    for img_name in sorted(os.listdir(directory)):
        img_path = os.path.join(directory, img_name)
        # Skip subdirectories and stray non-image files (caption files,
        # .DS_Store, ...) that would otherwise make load_img raise.
        if not os.path.isfile(img_path):
            continue
        if not img_name.lower().endswith(suffixes):
            continue
        img = image.load_img(img_path, target_size=(299, 299))
        img = image.img_to_array(img)
        img = np.expand_dims(img, axis=0)  # add the batch axis expected by predict
        img = preprocess_input(img)
        # verbose=0: one predict call per image would otherwise print a
        # progress bar for every file.
        feature = cnn_model.predict(img, verbose=0)
        features[img_name] = feature
    return features


In [None]:
import string
import re

def clean_caption(caption):
    """Normalize a raw caption and wrap it in start/end markers.

    The caption is lower-cased, every run of whitespace (spaces, tabs,
    newlines) is collapsed to a single space, and every character that is
    not an ASCII letter or a space is removed. The remaining words are
    joined with single spaces between ``startseq`` and ``endseq``.

    Args:
        caption: Raw caption text.

    Returns:
        The cleaned caption string, e.g. ``"startseq a dog runs endseq"``.
        An empty or letter-free caption yields ``"startseq endseq"``.
    """
    lowered = caption.lower()
    # Convert tabs/newlines to spaces *before* stripping non-letters;
    # otherwise they are deleted and the neighbouring words get fused.
    spaced = re.sub(r"\s+", " ", lowered)
    letters_only = re.sub(r"[^a-zA-Z ]", "", spaced)
    # split() drops the empty pieces left behind by removed punctuation or
    # digits, so the result never contains double or trailing spaces.
    words = letters_only.split()
    return " ".join(["startseq", *words, "endseq"])

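# TODO: build `all_captions` here by applying clean_caption() to every raw caption;
# the tokenizer cell below expects it to be a list of cleaned caption strings.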

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Fit a word-level tokenizer on the cleaned captions.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_captions)  # all_captions: list of all cleaned captions

# One more than the number of distinct words seen by the tokenizer.
vocab_size = 1 + len(tokenizer.word_index)

# Length, in whitespace-separated words, of the longest caption.
max_length = max(map(lambda text: len(text.split()), all_captions))


In [None]:
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout, add

def define_model(vocab_size, max_length):
    """Build and compile the two-input captioning network.

    One branch takes a 2048-element image feature vector, the other a
    sequence of ``max_length`` word indices. Each branch is reduced to 256
    units, the two are summed element-wise, and the sum goes through a
    256-unit dense layer to a softmax over ``vocab_size`` entries.

    Args:
        vocab_size: Size of the embedding table and of the softmax output.
        max_length: Length of the word-index sequence input.

    Returns:
        A Keras ``Model`` with inputs ``[image_features, word_sequence]``,
        compiled with categorical cross-entropy loss and the Adam optimizer.
    """
    # Image branch: dropout followed by a projection to 256 units.
    image_input = Input(shape=(2048,))
    image_dropped = Dropout(0.5)(image_input)
    image_dense = Dense(256, activation='relu')(image_dropped)

    # Text branch: embedding (mask_zero=True) -> dropout -> LSTM.
    caption_input = Input(shape=(max_length,))
    embedded = Embedding(vocab_size, 256, mask_zero=True)(caption_input)
    embedded_dropped = Dropout(0.5)(embedded)
    caption_state = LSTM(256)(embedded_dropped)

    # Merge both 256-unit representations and predict over the vocabulary.
    merged = add([image_dense, caption_state])
    hidden = Dense(256, activation='relu')(merged)
    word_probs = Dense(vocab_size, activation='softmax')(hidden)

    caption_model = Model(inputs=[image_input, caption_input], outputs=word_probs)
    caption_model.compile(loss='categorical_crossentropy', optimizer='adam')
    return caption_model
