In [2]:
import os

# Set Kaggle API key directory (assuming kaggle.json is in the same folder as this notebook).
# This must happen BEFORE `import kaggle`: the kaggle package reads KAGGLE_CONFIG_DIR
# and authenticates while it is being imported, so setting it afterwards has no effect.
os.environ['KAGGLE_CONFIG_DIR'] = os.getcwd()

# Download the dataset only when it is not already on disk, so re-running the
# notebook does not fetch and unzip the archive again.
if os.path.isfile(os.path.join('flickr8k_data', 'captions.txt')):
    print("Dataset already present; skipping download.")
else:
    # Imported here so credentials are only required when a download is needed.
    import kaggle

    kaggle.api.dataset_download_files('adityajn105/flickr8k', path='flickr8k_data', unzip=True)

    print("Download and unzip complete.")


Dataset URL: https://www.kaggle.com/datasets/adityajn105/flickr8k
Download and unzip complete.


In [2]:
import pandas as pd

CAPTIONS_FILE = 'flickr8k_data/captions.txt'
IMAGE_DIR = 'flickr8k_data/Images'


# Read the caption table; it provides an 'image' and a 'caption' column.
df = pd.read_csv(CAPTIONS_FILE)

# Group the captions by image. Each caption is lower-cased, trimmed and wrapped
# in the 'startseq' / 'endseq' markers that delimit a sentence for the decoder.
descriptions = {}
for img, caption in df[['image', 'caption']].itertuples(index=False):
    caption = 'startseq ' + caption.lower().strip() + ' endseq'
    descriptions.setdefault(img, []).append(caption)

print(f" Captions loaded for {len(descriptions)} images.")



 Captions loaded for 8091 images.


In [3]:
!pip install tensorflow

Defaulting to user installation because normal site-packages is not writeable


In [4]:
from tensorflow.keras.applications.inception_v3 import InceptionV3, preprocess_input
from keras.preprocessing.image import load_img, img_to_array
from keras.models import Model
import numpy as np
import os
import pickle

# Build the image encoder: instantiate InceptionV3 with ImageNet weights and
# expose the output of its second-to-last layer, i.e. drop the final
# classification layer.
_inception_full = InceptionV3(weights='imagenet')
_penultimate_layer = _inception_full.layers[-2]
model_cnn = Model(inputs=_inception_full.input, outputs=_penultimate_layer.output)
del _inception_full, _penultimate_layer  # leave only `model_cnn` in the namespace

# Feature extractor function with limit
def extract_features(directory, limit=3000):
    """Encode the images found in `directory` with the module-level `model_cnn`.

    Args:
        directory: Folder whose entries are treated as image files.
        limit: Maximum number of directory entries to examine. Entries that
            fail to load still count towards the limit. ``None`` examines
            every entry.

    Returns:
        dict mapping each successfully processed file name to its flattened
        feature vector. Entries that raise while loading or predicting are
        reported on stdout and left out.
    """
    # os.listdir() returns entries in arbitrary order; sorting makes the subset
    # selected by `limit` identical on every run and on every machine.
    img_names = sorted(os.listdir(directory))
    if limit is not None:
        img_names = img_names[:max(limit, 0)]

    features = {}
    for img_name in img_names:
        filename = os.path.join(directory, img_name)
        try:
            # 299x299 is the input size used for InceptionV3 here.
            image = load_img(filename, target_size=(299, 299))
            image = img_to_array(image)
            image = np.expand_dims(image, axis=0)  # add the batch dimension
            image = preprocess_input(image)
            feature = model_cnn.predict(image, verbose=0)
            features[img_name] = feature.flatten()
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole run.
            print(f" Skipped {img_name}: {e}")
    return features

# Extract and save features, re-using the pickle from a previous run when it
# exists so that re-running the notebook does not repeat the slow CNN pass.
# Delete features.pkl to force a fresh extraction (e.g. after changing `limit`).
if os.path.exists('features.pkl'):
    with open('features.pkl', 'rb') as f:
        features = pickle.load(f)

    print(f" Loaded {len(features)} image features from file.")
else:
    features = extract_features(IMAGE_DIR, limit=3000)

    # Save to file
    with open('features.pkl', 'wb') as f:
        pickle.dump(features, f)

    print(f"Extracted features for {len(features)} images.")


Extracted features for 3000 images.


In [4]:
# Load previously saved features
import pickle

# Restore the feature dictionary written by the extraction step.
with open('features.pkl', 'rb') as feature_file:
    features = pickle.load(feature_file)

print(f" Loaded {len(features)} image features from file.")


 Loaded 3000 image features from file.


In [6]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Create tokenizer
def create_tokenizer(descriptions):
    """Fit a Keras ``Tokenizer`` on every caption in `descriptions`.

    Args:
        descriptions: dict mapping an image id to a list of caption strings.

    Returns:
        The fitted ``Tokenizer``.
    """
    captions = [text for texts in descriptions.values() for text in texts]
    fitted = Tokenizer()
    fitted.fit_on_texts(captions)
    return fitted

# Fit the tokenizer on all captions. The +1 leaves room for index 0, which the
# padded input sequences use as filler (the Embedding layer is built with
# mask_zero=True).
tokenizer = create_tokenizer(descriptions)
vocab_size = len(tokenizer.word_index) + 1
print(f"Vocabulary size: {vocab_size}")

# Max length of captions
def max_length(descriptions):
    """Return the word count of the longest caption in `descriptions`.

    Args:
        descriptions: dict mapping an image id to a list of caption strings.

    Returns:
        The largest number of whitespace-separated words in any caption, or 0
        when there are no captions at all.
    """
    # default=0 keeps max() from raising ValueError on an empty mapping (or on
    # one whose caption lists are all empty).
    return max(
        (len(desc.split()) for descs in descriptions.values() for desc in descs),
        default=0,
    )

# Longest caption in whitespace-separated words (the startseq/endseq markers
# are included); used as the fixed length of the padded text input.
max_len = max_length(descriptions)
print(f" Max caption length: {max_len}")


Vocabulary size: 8496
 Max caption length: 40


In [7]:
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Model
from keras.layers import Input, Dense, LSTM, Embedding, Dropout, add

# Create input-output pairs for training
def create_sequences(tokenizer, max_len, desc_list, photo, vocab_size):
    """Expand one image's captions into next-word training samples.

    Every caption is encoded with `tokenizer`; for each position after the
    first, the preceding tokens (padded to `max_len`) form the text input and
    the token at that position, one-hot encoded over `vocab_size` classes, is
    the target. `photo` is repeated once per sample.

    Returns:
        Tuple ``(X1, X2, y)`` of NumPy arrays: image inputs, padded text
        inputs and one-hot targets.
    """
    image_inputs, text_inputs, targets = [], [], []
    for caption in desc_list:
        token_ids = tokenizer.texts_to_sequences([caption])[0]
        for split in range(1, len(token_ids)):
            prefix = pad_sequences([token_ids[:split]], maxlen=max_len)[0]
            next_word = to_categorical([token_ids[split]], num_classes=vocab_size)[0]
            image_inputs.append(photo)
            text_inputs.append(prefix)
            targets.append(next_word)
    return np.array(image_inputs), np.array(text_inputs), np.array(targets)

# Define the captioning model
def define_model(vocab_size, max_len, feature_dim=2048, embedding_dim=256,
                 hidden_units=256, dropout_rate=0.5):
    """Build and compile the two-input captioning model.

    One branch projects the image feature vector, the other embeds the partial
    caption and runs it through an LSTM; both branch outputs are summed and
    decoded into a softmax over the vocabulary.

    Args:
        vocab_size: Number of output classes (and Embedding input size).
        max_len: Length of the padded text input.
        feature_dim: Length of the image feature vector (default 2048).
        embedding_dim: Size of the word embedding (default 256).
        hidden_units: Width of the image projection, the LSTM and the decoder
            layer. The two branches are added element-wise, so they share it.
        dropout_rate: Dropout applied on both branches (default 0.5).

    Returns:
        A compiled Keras ``Model`` taking ``[image_features, text_sequence]``.
    """
    # Image branch
    inputs1 = Input(shape=(feature_dim,))
    fe1 = Dropout(dropout_rate)(inputs1)
    fe2 = Dense(hidden_units, activation='relu')(fe1)

    # Text branch; mask_zero=True marks index 0 (padding) as masked
    inputs2 = Input(shape=(max_len,))
    se1 = Embedding(vocab_size, embedding_dim, mask_zero=True)(inputs2)
    se2 = Dropout(dropout_rate)(se1)
    se3 = LSTM(hidden_units)(se2)

    # Merge both branches and predict the next word
    decoder1 = add([fe2, se3])
    decoder2 = Dense(hidden_units, activation='relu')(decoder1)
    outputs = Dense(vocab_size, activation='softmax')(decoder2)

    model = Model(inputs=[inputs1, inputs2], outputs=outputs)
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    return model

# Build and compile the captioning network for this vocabulary size and caption length.
model = define_model(vocab_size, max_len)


In [8]:
EPOCHS = 15

available_imgs = list(features.keys())  # Only use images with features

# The training pairs are built and fitted one image at a time, so only one
# image's samples are held in memory at once.
for i in range(EPOCHS):
    for img_id in available_imgs:
        # An image without captions yields no training pairs.
        if img_id not in descriptions:
            continue
        X1, X2, y = create_sequences(
            tokenizer, max_len, descriptions[img_id], features[img_id], vocab_size
        )
        model.fit([X1, X2], y, epochs=1, verbose=0)
    print(f" Epoch {i+1}/{EPOCHS} complete.")

# Persist the trained model together with the tokenizer and features it was trained with
model.save('caption_model.h5')

import pickle
with open('tokenizer.pkl', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

with open('features.pkl', 'wb') as features_file:
    pickle.dump(features, features_file)

print(" Model, tokenizer, and features saved.")


 Epoch 1/15 complete.
 Epoch 2/15 complete.
 Epoch 3/15 complete.
 Epoch 4/15 complete.
 Epoch 5/15 complete.
 Epoch 6/15 complete.
 Epoch 7/15 complete.
 Epoch 8/15 complete.
 Epoch 9/15 complete.
 Epoch 10/15 complete.
 Epoch 11/15 complete.
 Epoch 12/15 complete.
 Epoch 13/15 complete.
 Epoch 14/15 complete.




 Epoch 15/15 complete.
 Model, tokenizer, and features saved.


In [6]:
# Save model, tokenizer and features.
# NOTE(review): this cell repeats the save step at the end of the training cell
# and needs `model`, `tokenizer` and `features` in memory. The NameError recorded
# in its output came from running it without `model` defined. Consider deleting it.
model.save('caption_model.h5')

import pickle
with open('tokenizer.pkl', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

with open('features.pkl', 'wb') as features_file:
    pickle.dump(features, features_file)

print(" Model, tokenizer, and features saved.")

NameError: name 'model' is not defined

In [2]:
# Generate caption for an image
def generate_caption(model, tokenizer, photo, max_len):
    """Greedily decode a caption for one image.

    Starting from 'startseq', the words generated so far are encoded, padded to
    `max_len` and passed to `model` together with `photo`; the highest-scoring
    word is appended. Decoding stops when 'endseq' is produced, when the
    predicted index has no word, or after `max_len` steps.

    Args:
        model: Trained captioning model taking ``[photo, sequence]``.
        tokenizer: Fitted tokenizer providing ``word_index`` and
            ``texts_to_sequences``.
        photo: Image features, already shaped as a batch of one for
            ``model.predict``.
        max_len: Padded sequence length; also the maximum number of words added.

    Returns:
        The generated text, including the leading 'startseq' marker (and the
        trailing 'endseq' marker when one was predicted).
    """
    # Build the index -> word lookup once instead of scanning the whole
    # vocabulary for every generated word.
    index_to_word = {index: w for w, index in tokenizer.word_index.items()}

    in_text = 'startseq'
    for _ in range(max_len):
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], maxlen=max_len)
        yhat = model.predict([photo, sequence], verbose=0)
        word = index_to_word.get(int(np.argmax(yhat)))
        if word is None:
            # The predicted index has no entry in the vocabulary.
            break
        in_text += ' ' + word
        if word == 'endseq':
            break
    return in_text


In [5]:
from IPython.display import Image, display

# Caption the first image in the feature dictionary (a fixed choice, not a random draw)
sample_image = list(features)[0]
# generate_caption expects a batch of one 2048-long feature vector
photo = features[sample_image].reshape(1, 2048)
caption = generate_caption(model, tokenizer, photo, max_len)

display(Image(os.path.join(IMAGE_DIR, sample_image)))
print(" Caption:", caption)


NameError: name 'model' is not defined