<a href="https://colab.research.google.com/github/Arti-Kasaudhan/BackBencher-resturant/blob/main/image_Caption_generator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import os
import numpy as np
import tensorflow as tf
import pickle
from tensorflow.keras.applications.inception_v3 import InceptionV3, preprocess_input
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout, add
from tensorflow.keras.preprocessing.text import Tokenizer

# --- Configuration ---
# All paths are relative, so they resolve against the current working directory.
MODEL_DIR = 'models'                      # main() saves the trained caption model here
FEATURES_DIR = 'features'                 # main() pickles the extracted image features here
CAPTIONS_FILE = 'data/Flickr8k.token.txt'  # raw captions file read by main()
IMAGES_DIR = 'data/Flicker8k_Dataset'     # directory scanned by extract_features()
WORKING_DIR = 'working'                   # cleaned descriptions and the tokenizer go here

# Create directories if they don't exist
# NOTE: these run at import time, so importing this module creates the folders.
os.makedirs(MODEL_DIR, exist_ok=True)
os.makedirs(FEATURES_DIR, exist_ok=True)
os.makedirs(WORKING_DIR, exist_ok=True)


# --- Data Loading and Preprocessing ---

def load_doc(filename):
    """Return the entire contents of the text file at ``filename`` as one string.

    The file is opened in text mode with the platform default encoding.
    """
    with open(filename, 'r') as handle:
        return handle.read()

def load_descriptions(doc):
    """Parse a captions document into a mapping of image id -> list of captions.

    Each non-blank line is split on whitespace. The first token is the image
    reference (for example ``name.jpg#0``) and everything after it is the
    caption. The image id is the part of the first token before its first
    ``'.'``, so all captions of one image end up under the same key.

    Args:
        doc: Full text of the captions file, one caption per line.

    Returns:
        dict mapping image id (str) to a list of caption strings, in file order.
    """
    mapping = dict()
    for line in doc.split('\n'):
        tokens = line.split()
        # The length check is the original short-line guard; the token check
        # additionally skips whitespace-only lines, which previously crashed
        # with IndexError on tokens[0].
        if len(line) < 2 or not tokens:
            continue
        image_id = tokens[0].split('.')[0]
        caption = ' '.join(tokens[1:])
        mapping.setdefault(image_id, []).append(caption)
    return mapping

def clean_descriptions(descriptions):
    """Normalise every caption in ``descriptions`` in place.

    For each caption: lowercase every word, strip ASCII punctuation, then
    drop words that are a single character or contain non-alphabetic
    characters. The caption lists are mutated; nothing is returned.
    """
    # Translation table that deletes ASCII punctuation characters.
    strip_punct = str.maketrans('', '', '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~')
    for captions in descriptions.values():
        for idx, caption in enumerate(captions):
            kept = []
            for raw in caption.split():
                word = raw.lower().translate(strip_punct)
                if len(word) > 1 and word.isalpha():
                    kept.append(word)
            captions[idx] = ' '.join(kept)

def to_vocabulary(descriptions):
    """Return the set of distinct whitespace-separated words across all captions."""
    return {
        word
        for captions in descriptions.values()
        for caption in captions
        for word in caption.split()
    }

def save_descriptions(descriptions, filename):
    """Write the captions to ``filename``, one ``"<image_id> <caption>"`` per line.

    Lines are joined with ``'\\n'`` and no trailing newline is written.
    """
    # Build the full text first so nothing is written if joining fails.
    data = '\n'.join(
        key + ' ' + desc
        for key, desc_list in descriptions.items()
        for desc in desc_list
    )
    with open(filename, 'w') as out:
        out.write(data)

# --- Image Feature Extraction ---

def extract_features(directory):
    """Extract an InceptionV3 feature vector for every file in ``directory``.

    The ImageNet-pretrained InceptionV3 network is truncated at its
    second-to-last layer. Each image is resized to 299x299, run through
    ``preprocess_input`` and pushed through the truncated model one at a time.

    Args:
        directory: Path to a folder of image files.

    Returns:
        dict mapping image id (file name up to its first ``'.'``) to the
        array returned by ``model.predict`` for that single-image batch.
    """
    base_model = InceptionV3(weights='imagenet')
    # Drop the final classification layer and keep the layer before it.
    model = Model(inputs=base_model.inputs, outputs=base_model.layers[-2].output)
    # summary() prints the table itself and returns None, so it is not wrapped
    # in print() (that emitted a stray "None").
    model.summary()
    features = dict()
    for name in os.listdir(directory):
        filename = os.path.join(directory, name)
        # Sub-directories cannot be loaded as images; skip them rather than crash.
        if not os.path.isfile(filename):
            continue
        image = load_img(filename, target_size=(299, 299))
        image = img_to_array(image)
        # Add the leading batch dimension expected by predict().
        image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
        image = preprocess_input(image)
        feature = model.predict(image, verbose=0)
        # Same id convention as load_descriptions: text before the first '.'.
        image_id = name.split('.')[0]
        features[image_id] = feature
        print(f'> {name}')
    return features

def main():
    """Run caption preprocessing, feature extraction and model training.

    NOTE: a second ``main`` is defined later in this file and rebinds the
    name, so this earlier definition is shadowed once the module has been
    executed. It is kept working here (it has no caption-generation demo).

    Steps:
        1. Create a two-line demo captions file if ``CAPTIONS_FILE`` is missing.
        2. Load, clean and save the captions under ``WORKING_DIR``.
        3. Extract and pickle image features if ``IMAGES_DIR`` has content.
        4. Train and save the caption model if a features pickle exists.
    """
    print("Image Caption Generator")

    # --- Text Processing ---
    # Create a dummy captions file for demonstration
    if not os.path.exists(CAPTIONS_FILE):
        os.makedirs(os.path.dirname(CAPTIONS_FILE), exist_ok=True)
        with open(CAPTIONS_FILE, 'w') as f:
            f.write("1000268201_693b08cb0e.jpg#0\tA child in a pink dress is climbing up a set of stairs in an entry way .\n")
            f.write("1000268201_693b08cb0e.jpg#1\tA girl going into a wooden building .\n")

    doc = load_doc(CAPTIONS_FILE)
    descriptions = load_descriptions(doc)
    print(f'Loaded: {len(descriptions)} descriptions')
    clean_descriptions(descriptions)
    vocabulary = to_vocabulary(descriptions)
    print(f'Vocabulary Size: {len(vocabulary)}')
    cleaned_descriptions_path = os.path.join(WORKING_DIR, 'descriptions.txt')
    save_descriptions(descriptions, cleaned_descriptions_path)
    print(f"Cleaned descriptions saved to {cleaned_descriptions_path}")

    # --- Image Feature Extraction ---
    # In a real scenario, you would have the Flickr8k dataset images.
    # We'll simulate this by checking if the directory exists.
    if os.path.exists(IMAGES_DIR) and len(os.listdir(IMAGES_DIR)) > 0:
        features = extract_features(IMAGES_DIR)
        features_path = os.path.join(FEATURES_DIR, 'features.pkl')
        with open(features_path, 'wb') as f:
            pickle.dump(features, f)
        print(f"Extracted features saved to {features_path}")
    else:
        print("\n--- Skipping Feature Extraction ---")
        print(f"Image directory '{IMAGES_DIR}' not found or is empty.")
        print("Please download the Flickr8k dataset and place the images in the 'data/Flicker8k_Dataset' directory.")

    # --- Model Training ---
    # Load training data
    train_features_path = os.path.join(FEATURES_DIR, 'features.pkl')
    if os.path.exists(train_features_path):
        # load_set() returns a bare set of ids, which has no .keys(); the
        # training captions must be a dict wrapped with startseq/endseq, so
        # load them with load_clean_descriptions() for the ids parsed above.
        train_descriptions = load_clean_descriptions(cleaned_descriptions_path, set(descriptions))
        print('Descriptions: train=%d' % len(train_descriptions))
        train_features = load_photo_features(train_features_path, set(train_descriptions.keys()))
        print('Photos: train=%d' % len(train_features))

        # Prepare tokenizer
        tokenizer = create_tokenizer(train_descriptions)
        vocab_size = len(tokenizer.word_index) + 1
        print('Vocabulary Size: %d' % vocab_size)

        # Longest caption length. Must not be named ``max_length``: assigning
        # to that name makes it local and the call raised UnboundLocalError.
        max_len = max_length(train_descriptions)

        # Define and train the model
        model = create_model(vocab_size, max_len)

        # Create a data generator
        generator = data_generator(train_descriptions, train_features, tokenizer, max_len, vocab_size)

        # Fit model
        print("\n--- Starting Model Training ---")
        model.fit(generator, epochs=20, steps_per_epoch=len(train_descriptions), verbose=1)

        # Save model
        model.save(os.path.join(MODEL_DIR, 'image_caption_model.h5'))
        print("\n--- Model Training Complete ---")
    else:
        print("\n--- Skipping Model Training ---")
        print("Feature file not found. Please run feature extraction first.")

# --- Data Loading for Training ---

def load_set(filename):
    """Load the set of photo identifiers listed in ``filename``.

    Each non-blank line contributes one identifier: its first
    whitespace-separated token, truncated at the first ``'.'``. This handles
    both a plain list of file names (``name.jpg``) and a descriptions file
    whose lines look like ``"<image_id> <caption>"``. Previously the whole
    line was used for the latter, so no identifier ever matched.

    Args:
        filename: Path of the text file to read.

    Returns:
        set of identifier strings.
    """
    dataset = set()
    with open(filename, 'r') as file:
        for line in file:
            tokens = line.split()
            if not tokens:
                # Blank or whitespace-only line.
                continue
            dataset.add(tokens[0].split('.')[0])
    return dataset

def load_clean_descriptions(filename, dataset):
    """Load captions for the images in ``dataset`` and wrap them for training.

    Each non-blank line of ``filename`` is ``"<image_id> <caption words>"``.
    Captions whose image id is in ``dataset`` are kept and wrapped as
    ``"startseq <caption> endseq"``.

    Args:
        filename: Path of the cleaned descriptions file.
        dataset: Container of image ids to keep (membership-tested with ``in``).

    Returns:
        dict mapping image id to a list of wrapped caption strings.
    """
    descriptions = dict()
    with open(filename, 'r') as file:
        for line in file:
            tokens = line.split()
            if not tokens:
                # Blank lines or an empty file used to raise IndexError below.
                continue
            image_id, image_desc = tokens[0], tokens[1:]
            if image_id not in dataset:
                continue
            desc = 'startseq ' + ' '.join(image_desc) + ' endseq'
            descriptions.setdefault(image_id, []).append(desc)
    return descriptions

def load_photo_features(filename, dataset):
    """Load the pickled feature dict and keep only the entries in ``dataset``.

    Args:
        filename: Path of the pickle file holding ``{image_id: feature}``.
        dataset: Iterable of image ids to select.

    Returns:
        dict mapping each id in ``dataset`` to its stored feature.

    Raises:
        KeyError: if an id in ``dataset`` has no entry in the pickle.
    """
    # NOTE: pickle can execute arbitrary code on load; only use files this
    # program produced itself.
    # The context manager closes the handle (the bare open() leaked it).
    with open(filename, 'rb') as file:
        all_features = pickle.load(file)
    return {k: all_features[k] for k in dataset}

# --- Tokenizer and Sequence Creation ---

def create_tokenizer(descriptions):
    """Build a Keras ``Tokenizer`` fitted on every caption in ``descriptions``."""
    fitted = Tokenizer()
    # Flatten the {image_id: [captions]} dict into one list of caption strings.
    fitted.fit_on_texts(to_lines(descriptions))
    return fitted

def to_lines(descriptions):
    """Flatten ``{image_id: [captions]}`` into a single list of caption strings.

    Captions keep the dict's iteration order and each list's own order.
    """
    return [caption for captions in descriptions.values() for caption in captions]

def max_length(descriptions):
    """Return the word count of the longest caption in ``descriptions``.

    Raises ``ValueError`` (from ``max``) when there are no captions at all.
    """
    word_counts = (
        len(caption.split())
        for captions in descriptions.values()
        for caption in captions
    )
    return max(word_counts)

# --- Model Definition ---

def create_model(vocab_size, max_length):
    """Build and compile the two-input captioning model.

    One branch takes a 2048-length image feature vector, the other a padded
    word-index sequence of length ``max_length``. Both are projected to 256
    units, summed, and decoded into a softmax over ``vocab_size`` words.

    Args:
        vocab_size: Size of the output softmax and of the embedding table.
        max_length: Length of the padded input word sequence.

    Returns:
        The compiled Keras ``Model`` (categorical cross-entropy, Adam).
    """
    # Feature extractor model
    image_input = Input(shape=(2048,))
    image_dropped = Dropout(0.5)(image_input)
    image_dense = Dense(256, activation='relu')(image_dropped)
    # Sequence model; mask_zero=True makes index 0 (padding) be masked out
    text_input = Input(shape=(max_length,))
    text_embedded = Embedding(vocab_size, 256, mask_zero=True)(text_input)
    text_dropped = Dropout(0.5)(text_embedded)
    text_encoded = LSTM(256)(text_dropped)
    # Decoder model: merge both branches by element-wise addition
    merged = add([image_dense, text_encoded])
    decoded = Dense(256, activation='relu')(merged)
    outputs = Dense(vocab_size, activation='softmax')(decoded)
    # Tie it together [image, seq] [word]
    model = Model(inputs=[image_input, text_input], outputs=outputs)
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    # summary() prints the table itself and returns None; wrapping it in
    # print() emitted a stray "None".
    model.summary()
    return model

# --- Data Generator ---

def data_generator(descriptions, photos, tokenizer, max_length, vocab_size):
    """Yield training batches forever, one image's worth of samples per batch.

    Each yielded item is ``([image_features, input_sequences], next_words)``
    as produced by ``create_sequences`` for all captions of one image. The
    generator cycles over ``descriptions`` endlessly.
    """
    while True:
        for image_id in descriptions:
            captions = descriptions[image_id]
            # Stored features are indexed with [0] — presumably a (1, N)
            # batch-of-one array from feature extraction; verify against caller.
            photo_vector = photos[image_id][0]
            in_img, in_seq, out_word = create_sequences(
                tokenizer, max_length, captions, photo_vector, vocab_size
            )
            yield [in_img, in_seq], out_word

def create_sequences(tokenizer, max_length, desc_list, photo, vocab_size):
    """Turn one image's captions into (image, prefix, next-word) training samples.

    Every caption is encoded to word indices. For each split point ``i`` the
    first ``i`` indices (padded to ``max_length``) form the input and index
    ``i`` (one-hot over ``vocab_size``) forms the target. ``photo`` is
    repeated once per sample.

    Returns:
        Three numpy arrays: images, padded input sequences, one-hot targets.
    """
    images, prefixes, targets = [], [], []
    for caption in desc_list:
        encoded = tokenizer.texts_to_sequences([caption])[0]
        for split_at in range(1, len(encoded)):
            prefix = pad_sequences([encoded[:split_at]], maxlen=max_length)[0]
            target = to_categorical([encoded[split_at]], num_classes=vocab_size)[0]
            images.append(photo)
            prefixes.append(prefix)
            targets.append(target)
    return np.array(images), np.array(prefixes), np.array(targets)


# --- Caption Generation ---

def word_for_id(integer, tokenizer):
    """Return the word whose index in ``tokenizer.word_index`` equals ``integer``.

    Returns ``None`` when no word has that index.
    """
    matches = (
        candidate
        for candidate, idx in tokenizer.word_index.items()
        if idx == integer
    )
    return next(matches, None)

def generate_desc(model, tokenizer, photo, max_length):
    """Greedily generate a caption for ``photo``, one word at a time.

    Starting from ``'startseq'``, the text so far is encoded and padded to
    ``max_length``, the model predicts the next word, and the highest-scoring
    word is appended. Generation stops after ``max_length`` steps, when the
    predicted index maps to no word, or once ``'endseq'`` has been appended.

    Returns:
        The generated text, including ``'startseq'`` and any ``'endseq'``.
    """
    caption = 'startseq'
    for _ in range(max_length):
        encoded = tokenizer.texts_to_sequences([caption])[0]
        padded = pad_sequences([encoded], maxlen=max_length)
        scores = model.predict([photo, padded], verbose=0)
        best_index = np.argmax(scores)
        next_word = word_for_id(best_index, tokenizer)
        if next_word is None:
            # Predicted index has no vocabulary entry; stop here.
            break
        caption += ' ' + next_word
        if next_word == 'endseq':
            break
    return caption

def main():
    """Run the full pipeline: preprocess captions, extract features, train, demo.

    Steps:
        1. Create a two-line demo captions file if ``CAPTIONS_FILE`` is missing.
        2. Load, clean and save the captions under ``WORKING_DIR``.
        3. Extract and pickle image features if ``IMAGES_DIR`` has content.
        4. Train and save the model (plus its tokenizer) unless a saved model
           already exists.
        5. If model, tokenizer and features are all on disk, generate a
           caption for the first stored image feature.
    """
    print("Image Caption Generator")

    # --- Text Processing ---
    if not os.path.exists(CAPTIONS_FILE):
        os.makedirs(os.path.dirname(CAPTIONS_FILE), exist_ok=True)
        with open(CAPTIONS_FILE, 'w') as f:
            f.write("1000268201_693b08cb0e.jpg#0\tA child in a pink dress is climbing up a set of stairs in an entry way .\n")
            f.write("1000268201_693b08cb0e.jpg#1\tA girl going into a wooden building .\n")

    doc = load_doc(CAPTIONS_FILE)
    descriptions = load_descriptions(doc)
    print(f'Loaded: {len(descriptions)} descriptions')
    clean_descriptions(descriptions)
    vocabulary = to_vocabulary(descriptions)
    print(f'Vocabulary Size: {len(vocabulary)}')
    cleaned_descriptions_path = os.path.join(WORKING_DIR, 'descriptions.txt')
    save_descriptions(descriptions, cleaned_descriptions_path)
    print(f"Cleaned descriptions saved to {cleaned_descriptions_path}")

    # --- Image Feature Extraction ---
    if os.path.exists(IMAGES_DIR) and len(os.listdir(IMAGES_DIR)) > 0:
        features = extract_features(IMAGES_DIR)
        features_path = os.path.join(FEATURES_DIR, 'features.pkl')
        with open(features_path, 'wb') as f:
            pickle.dump(features, f)
        print(f"Extracted features saved to {features_path}")
    else:
        print("\n--- Skipping Feature Extraction ---")
        print(f"Image directory '{IMAGES_DIR}' not found or is empty.")
        print("Please download the Flickr8k dataset and place the images in the 'data/Flicker8k_Dataset' directory.")

    # --- Model Training ---
    train_features_path = os.path.join(FEATURES_DIR, 'features.pkl')
    model_path = os.path.join(MODEL_DIR, 'image_caption_model.h5')
    tokenizer_path = os.path.join(WORKING_DIR, 'tokenizer.pkl')

    if not os.path.exists(model_path):
        if os.path.exists(train_features_path):
            # Use the ids parsed above as the training set. load_set() on the
            # descriptions file treated each whole line as an id, so nothing
            # matched and the training set came out empty.
            train_descriptions = load_clean_descriptions(cleaned_descriptions_path, set(descriptions))
            print('Descriptions: train=%d' % len(train_descriptions))
            train_features = load_photo_features(train_features_path, set(train_descriptions.keys()))
            print('Photos: train=%d' % len(train_features))

            tokenizer = create_tokenizer(train_descriptions)
            # The bare name ``dump`` was never imported (NameError); use
            # pickle.dump and close the file deterministically.
            with open(tokenizer_path, 'wb') as f:
                pickle.dump(tokenizer, f)
            vocab_size = len(tokenizer.word_index) + 1
            print('Vocabulary Size: %d' % vocab_size)

            max_len = max_length(train_descriptions)

            model = create_model(vocab_size, max_len)

            generator = data_generator(train_descriptions, train_features, tokenizer, max_len, vocab_size)

            print("\n--- Starting Model Training ---")
            model.fit(generator, epochs=20, steps_per_epoch=len(train_descriptions), verbose=1)

            model.save(model_path)
            print("\n--- Model Training Complete ---")
        else:
            print("\n--- Skipping Model Training ---")
            print("Feature file not found. Please run feature extraction first.")
    else:
        print("\n--- Model already trained. Loading from disk. ---")

    # --- Caption Generation Example ---
    # The tokenizer is required as well: without it the demo cannot decode.
    if (os.path.exists(model_path)
            and os.path.exists(tokenizer_path)
            and os.path.exists(train_features_path)):
        print("\n--- Generating Caption for an Example Image ---")
        # NOTE: pickle can execute arbitrary code on load; these files are
        # expected to be the ones written by this script.
        with open(tokenizer_path, 'rb') as f:
            tokenizer = pickle.load(f)
        model = tf.keras.models.load_model(model_path)

        # Use the first available image feature as an example
        with open(train_features_path, 'rb') as f:
            all_features = pickle.load(f)
        if all_features:
            example_image_id = list(all_features.keys())[0]
            photo = all_features[example_image_id]

            # Read the sequence length from the model's second input (the
            # padded word sequence) instead of hard-coding 34, so it always
            # matches what the model was trained with.
            max_len = int(model.inputs[1].shape[1])
            description = generate_desc(model, tokenizer, photo, max_len)
            print(f"\nImage: {example_image_id}.jpg")
            print(f"Caption: {description}")
        else:
            print("No image features found to generate a caption.")
    else:
        print("\n--- Skipping Caption Generation ---")
        print("Trained model, tokenizer, or features not found.")


# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

Image Caption Generator
Loaded: 1 descriptions
Vocabulary Size: 18
Cleaned descriptions saved to working/descriptions.txt

--- Skipping Feature Extraction ---
Image directory 'data/Flicker8k_Dataset' not found or is empty.
Please download the Flickr8k dataset and place the images in the 'data/Flicker8k_Dataset' directory.

--- Skipping Model Training ---
Feature file not found. Please run feature extraction first.

--- Skipping Caption Generation ---
Trained model or features not found.
