
### **Image Captioning with Text to speech translation**

In [1]:
import os
import zipfile
import string
import numpy as np
import pandas as pd
from collections import defaultdict

In [2]:
# Paths to dataset files
images_zip = "Flickr8k_Dataset.zip"
text_zip = "Flickr8k_text.zip"

data_dir = "./Flickr8k_Dataset"
text_dir = "./Flickr8k_text"

In [3]:
# Extract images
if not os.path.exists(data_dir):
    with zipfile.ZipFile(images_zip, 'r') as zip_ref:
        zip_ref.extractall(".")

# Extract text files
if not os.path.exists(text_dir):
    with zipfile.ZipFile(text_zip, 'r') as zip_ref:
        zip_ref.extractall(".")

In [4]:
# Load captions file
captions_file = "./Flickr8k.token.txt"
with open(captions_file, "r") as file:
    captions = file.readlines()

# Process captions
def process_caption(caption):
    caption = caption.lower().strip()
    caption = caption.translate(str.maketrans("", "", string.punctuation))  # Remove punctuation
    caption = "startseq " + caption + " endseq"  # Add start and end tokens
    return caption

In [5]:
# Mapping images to captions
image_captions = defaultdict(list)
for line in captions:
    img, caption = line.split("\t")
    img = img.split("#")[0]
    processed_caption = process_caption(caption)
    image_captions[img].append(processed_caption)

# Select first 1000 images for training
image_list = sorted(image_captions.keys())[:1000]
image_captions = {img: image_captions[img] for img in image_list}

In [6]:
# Save to a DataFrame for easy access
df = pd.DataFrame([(img, cap) for img, caps in image_captions.items() for cap in caps], columns=["image", "caption"])

# Save processed captions to a CSV file
df.to_csv("processed_captions.csv", index=False)

In [7]:
import pickle
from collections import defaultdict
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.applications.resnet50 import preprocess_input
from tensorflow.keras.models import Model

In [8]:
# Feature Extraction using ResNet50
model = ResNet50(weights='imagenet')
model = Model(inputs=model.input, outputs=model.layers[-2].output)  # Remove last classification layer

features = {}

for img_name in image_list:
    img_path = os.path.join(data_dir, img_name)
    if os.path.exists(img_path):
        img = load_img(img_path, target_size=(224, 224))  # Resize image to 224x224
        img = img_to_array(img)
        img = np.expand_dims(img, axis=0)
        img = preprocess_input(img)
        feature = model.predict(img, verbose=0)
        features[img_name] = feature.flatten()

with open("captions.pkl", "wb") as f:
    pickle.dump(image_captions, f)

# Save extracted features
with open("image_features.pkl", "wb") as f:
    pickle.dump(features, f)

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels.h5
[1m102967424/102967424[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [9]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from collections import Counter

# Text Preprocessing
all_captions = [cap for caps in image_captions.values() for cap in caps]

# Build vocabulary
word_counts = Counter()
for caption in all_captions:
    word_counts.update(caption.split())

vocab_threshold = 5  # Keep words appearing at least 5 times
vocab = [word for word, count in word_counts.items() if count >= vocab_threshold]


In [10]:
# Tokenization
tokenizer = Tokenizer()
tokenizer.fit_on_texts(vocab)
vocab_size = len(tokenizer.word_index) + 1  # Include padding index

# Convert captions to sequences
sequences = tokenizer.texts_to_sequences(all_captions)

# Padding to ensure uniform length
max_length = max(len(seq) for seq in sequences)
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')

# Save tokenizer and processed sequences
with open("tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)

with open("padded_sequences.pkl", "wb") as f:
    pickle.dump(padded_sequences, f)

In [11]:
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import LSTM, Embedding, Dense, Dropout, Add, Input
from tensorflow.keras.utils import to_categorical
import tensorflow as tf

In [12]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip

# Load pre-trained GloVe embeddings
def load_glove_embeddings(filepath="glove.6B.200d.txt", embedding_dim=200):
    embeddings_index = {}
    with open(filepath, encoding="utf-8") as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index

glove_embeddings = load_glove_embeddings()

embedding_matrix = np.zeros((vocab_size, 200))
for word, i in tokenizer.word_index.items():
    embedding_vector = glove_embeddings.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

--2025-03-09 15:13:48--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2025-03-09 15:13:48--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2025-03-09 15:13:48--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [None]:
# Define Encoder-Decoder Model
embedding_dim = 200

def define_model():
    # Image feature input
    image_input = Input(shape=(2048,))
    image_fc = Dense(256, activation='relu')(image_input) 

    # Text input
    text_input = Input(shape=(max_length,))
    text_embedding = Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], trainable=False)(text_input)
    text_lstm = LSTM(256, return_sequences=True)(text_embedding)
    text_lstm = LSTM(256)(text_lstm)

    # Merging image and text features
    decoder = Add()([image_fc, text_lstm])
    decoder = Dense(256, activation='relu')(decoder)
    output = Dense(vocab_size, activation='softmax')(decoder)

    model = tf.keras.Model(inputs=[image_input, text_input], outputs=output)
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    return model

model = define_model()
print(model.summary())

None


In [14]:
!pip install gTTS

Collecting gTTS
  Downloading gTTS-2.5.4-py3-none-any.whl.metadata (4.1 kB)
Downloading gTTS-2.5.4-py3-none-any.whl (29 kB)
Installing collected packages: gTTS
Successfully installed gTTS-2.5.4


In [22]:
import random
import numpy as np
import pickle
import tensorflow as tf
from nltk.translate.bleu_score import corpus_bleu
from gtts import gTTS
import os
import cv2
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

# Load processed data
with open("tokenizer.pkl", "rb") as f:
    tokenizer = pickle.load(f)

with open("image_features.pkl", "rb") as f:
    image_features = pickle.load(f)

# Assuming image_captions is available from previous steps:
with open("captions.pkl", "wb") as f:
    pickle.dump(image_captions, f)

with open("captions.pkl", "rb") as f:
    captions = pickle.load(f)

vocab_size = len(tokenizer.word_index) + 1
max_length = 34

# Splitting data (80% train, 20% validation)
image_ids = list(captions.keys())
train_ids, val_ids = train_test_split(image_ids, test_size=0.2, random_state=42)

# **Fixed Data Generator**
def data_generator(image_ids, captions, tokenizer, max_length, batch_size=32):
    while True:
        random.shuffle(image_ids)
        for i in range(0, len(image_ids), batch_size):
            batch_ids = image_ids[i:i+batch_size]
            X_img, X_seq, y = [], [], []

            for img_id in batch_ids:
                if img_id not in image_features:
                    print(f"⚠️ Warning: {img_id} not found in image_features. Skipping...")
                    continue  # Skip missing image features

                img_feature = image_features[img_id]

                # Ensure image feature has correct shape
                if img_feature.shape != (2048,):
                    print(f"⚠️ Warning: {img_id} has incorrect shape {img_feature.shape}, expected (2048,). Skipping...")
                    continue  # Skip incorrectly shaped images

                for caption in captions[img_id]:
                    seq = tokenizer.texts_to_sequences([caption])[0]

                    for j in range(1, len(seq)):
                        in_seq, out_seq = seq[:j], seq[j]

                        in_seq = pad_sequences([in_seq], maxlen=max_length, padding='post')[0]
                        out_seq = to_categorical(out_seq, num_classes=vocab_size, dtype='float32')

                        X_img.append(img_feature)
                        X_seq.append(in_seq)
                        y.append(out_seq)

            # **Fix: Ensure batch is not empty before yielding**
            if len(X_img) == 0:
                print("⚠️ Skipping empty batch...")
                continue  # Skip empty batches

            yield (
                (np.array(X_img, dtype=np.float32), np.array(X_seq, dtype=np.int32)),
                np.array(y, dtype=np.float32)
            )

model = define_model()


In [23]:
batch_size = 32
# **Define Output Signature**
output_signature = (
    (
        tf.TensorSpec(shape=(None, 2048), dtype=tf.float32),
        tf.TensorSpec(shape=(None, max_length), dtype=tf.int32)
    ),
    tf.TensorSpec(shape=(None, vocab_size), dtype=tf.float32)
)

# **Create Training Dataset**
train_dataset = tf.data.Dataset.from_generator(
    lambda: data_generator(train_ids, captions, tokenizer, max_length, batch_size),
    output_signature=output_signature
)

# **Create Validation Dataset**
val_dataset = tf.data.Dataset.from_generator(
    lambda: data_generator(val_ids, captions, tokenizer, max_length, batch_size),
    output_signature=output_signature
)

In [None]:
# **Train Model**
history = model.fit(
    train_dataset,
    epochs=20,
    steps_per_epoch=max(1, len(train_ids) // batch_size),
    validation_data=val_dataset,
    validation_steps=max(1, len(val_ids) // batch_size),
    callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)]
)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
⚠️ Skipping empty batch...
⚠️ Skipping empty batch...
⚠️ Skipping empty batch...
⚠️ Skipping empty batch...
⚠️ Skipping empty batch...
⚠️ Skipping empty batch...
⚠️ Skipping empty batch...
⚠️ Skipping empty batch...
⚠️ Skipping empty batch...
⚠️ Skipping empty batch...
⚠️ Skipping empty batch...
⚠️ Skipping empty batch...
⚠️ Skipping empty batch...
⚠️ Skipping empty batch...
⚠️ Skipping empty batch...
⚠️ Skipping empty batch...
⚠️ Skipping empty batch...
⚠️ Skipping empty batch...
⚠️ Skipping empty batch...
⚠️ Skipping empty batch...
⚠️ Skipping empty batch...
⚠️ Skipping empty batch...
⚠️ Skipping empty batch...
⚠️ Skipping empty batch...
⚠️ Skipping empty batch...
⚠️ Skipping empty batch...
⚠️ Skipping empty batch...
⚠️ Skipping empty batch...
⚠️ Skipping empty batch...
⚠️ Skipping empty batch...
⚠️ Skipping empty batch...
⚠️ Skipping empty batch...
⚠️ Skipping empty batch...
⚠️ Skipping empty batch...
⚠️ Skipping empty

In [1]:
# Load Best Model
model.load_weights("best_model.h5")

# BLEU Score Evaluation
def evaluate_bleu(model, image_features, captions, tokenizer, max_length):
    actual, predicted = [], []
    for img_id in random.sample(list(image_features.keys()), 100):
        feature = image_features[img_id]
        y_pred = generate_caption(model, feature, tokenizer, max_length)
        actual.append([caption.split() for caption in captions[img_id]])
        predicted.append(y_pred.split())
    bleu_score = corpus_bleu(actual, predicted)
    print(f"BLEU Score: {bleu_score:.4f}")

def generate_caption(model, image_feature, tokenizer, max_length):
    in_text = 'startseq'
    for _ in range(max_length):
        seq = tokenizer.texts_to_sequences([in_text])[0]
        seq = pad_sequences([seq], maxlen=max_length)
        y_pred = model.predict([image_feature.reshape(1, 2048), seq], verbose=0)
        y_pred = np.argmax(y_pred)
        word = {index: word for word, index in tokenizer.word_index.items()}.get(y_pred)
        if word is None or word == 'endseq':
            break
        in_text += ' ' + word
    return in_text

evaluate_bleu(model, image_features, captions, tokenizer, max_length)

NameError: name 'model' is not defined

In [2]:
# Test with GIKI Images
def load_giki_image(image_path):
    image = cv2.imread(image_path)
    image = cv2.resize(image, (299, 299))  # Resize for InceptionV3
    image = image / 255.0
    return np.expand_dims(image, axis=0)

def extract_features(image_path, model):
    image = load_giki_image(image_path)
    feature = model.predict(image)
    return feature.reshape(-1)

cnn_model = tf.keras.applications.InceptionV3(include_top=False, pooling="avg")
giki_images = ["giki1.jpg", "giki2.jpg", "giki3.jpg"]

for img_path in giki_images:
    feature = extract_features(img_path, cnn_model)
    caption = generate_caption(model, feature, tokenizer, max_length)
    print(f"Generated Caption: {caption}")

    # Convert caption to voice
    tts = gTTS(text=caption, lang='en')
    audio_path = img_path.replace('.jpg', '.mp3')
    tts.save(audio_path)
    os.system(f"mpg321 {audio_path}")

    # Display Image
    image = cv2.imread(img_path)
    plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
    plt.axis('off')
    plt.title(caption)
    plt.show()

NameError: name 'tf' is not defined