# 🧠 Image Captioning using InceptionV3 + LSTM
This notebook loads the Flickr8k dataset, extracts image features using InceptionV3, and trains a caption generation model using LSTM.

In [3]:
# ✅ Step 1: Setup
!pip install tensorflow keras matplotlib nltk pillow opencv-python
import os
import numpy as np
import string
import nltk
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.applications.inception_v3 import InceptionV3, preprocess_input
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from PIL import Image
import matplotlib.pyplot as plt
import pickle
nltk.download('punkt')
nltk.download('stopwords')



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\swap\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\swap\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
# ✅ Step 2: Load and clean captions
def load_captions(filename):
    with open(filename, 'r') as file:
        lines = file.readlines()
    captions = {}
    for line in lines:
        image_id, caption = line.strip().split('\t')
        image_id = image_id.split('#')[0]
        if image_id not in captions:
            captions[image_id] = []
        captions[image_id].append(caption)
    return captions

def clean_caption(caption):
    caption = caption.lower()
    caption = caption.translate(str.maketrans('', '', string.punctuation))
    caption = caption.split()
    caption = [word for word in caption if len(word) > 1]
    caption = ' '.join(caption)
    return caption

captions_file = './dataset/Flickr8k_text/Flickr8k.token.txt'
captions_dict = load_captions(captions_file)

cleaned_captions = {}
for img_id, caption_list in captions_dict.items():
    cleaned_list = [f"startseq {clean_caption(c)} endseq" for c in caption_list]
    cleaned_captions[img_id] = cleaned_list

all_captions = []
for cap_list in cleaned_captions.values():
    all_captions.extend(cap_list)

tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_captions)
vocab_size = len(tokenizer.word_index) + 1
print(f"Vocabulary Size: {vocab_size}")

Vocabulary Size: 8811


In [9]:
# ✅ Step 3: Extract image features using InceptionV3
base_model = InceptionV3(weights='imagenet')
model = Model(inputs=base_model.input, outputs=base_model.layers[-2].output)

def preprocess_image(image_path):
    img = load_img(image_path, target_size=(299, 299))
    img = img_to_array(img)
    img = np.expand_dims(img, axis=0)
    img = preprocess_input(img)
    return img

def encode_image(image_path):
    img = preprocess_image(image_path)
    feature_vector = model.predict(img)
    return np.reshape(feature_vector, (2048,))

image_dir = './dataset/Flicker8k_Dataset'

features_dict = {}
for img_name in os.listdir(image_dir):
    if img_name.endswith('.jpg'):
        features_dict[img_name] = encode_image(os.path.join(image_dir, img_name))

with open('image_features.pkl', 'wb') as f:
    pickle.dump(features_dict, f)
print("Image features saved.")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 93ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 86ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 89ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 88ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 91ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 88ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 87ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 89ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 89ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 91ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 79ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 87ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 88ms

In [10]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_captions)
import pickle

with open('tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)

print("✅ Tokenizer saved.")


✅ Tokenizer saved.
