In [41]:
import os
import pickle
import numpy as np
from tqdm.notebook import tqdm

import os
os.makedirs(working_directory, exist_ok=True)

from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout, add
from tensorflow.keras.utils import to_categorical, plot_model


PermissionError: [Errno 13] Permission denied: '/Image Cap'

In [42]:
# Set base and working directories to absolute paths
Base_DIR = '/home/ashutosh/Desktop/Image Cap'
WORKING_DIR = '/home/ashutosh/Desktop/Image Cap'
# Create working directory if it doesn't exist
os.makedirs(WORKING_DIR, exist_ok=True)


In [43]:
## Extract Image Features ##

model = VGG16()
# Restructure the model
model = Model(inputs=model.inputs, outputs=model.layers[-2].output)
# summarize the model
print(model.summary())

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 224, 224, 3)]     0         
                                                                 
 block1_conv1 (Conv2D)       (None, 224, 224, 64)      1792      
                                                                 
 block1_conv2 (Conv2D)       (None, 224, 224, 64)      36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, 112, 112, 64)      0         
                                                                 
 block2_conv1 (Conv2D)       (None, 112, 112, 128)     73856     
                                                                 
 block2_conv2 (Conv2D)       (None, 112, 112, 128)     147584    
                                                                 
 block2_pool (MaxPooling2D)  (None, 56, 56, 128)       0   

In [None]:
# Extract image features safely and save them as a pickle
features = {}
directory = os.path.join(Base_DIR, 'Images')

# Ensure directory exists
if not os.path.isdir(directory):
    raise ValueError(f"Image directory not found: {directory}")

# Filter for common image file extensions
image_files = [f for f in os.listdir(directory) if f.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.gif'))]
if len(image_files) == 0:
    raise ValueError(f"No images found in directory: {directory}. Please add images to the directory.")

print("Starting feature extraction...")
total_images = len(image_files)
for i, img_name in enumerate(image_files):
    img_path = os.path.join(directory, img_name)
    try:
        img = load_img(img_path, target_size=(224, 224))
    except Exception as e:
        print(f"Skipping '{img_name}': {e}")
        continue
    # Convert the image pixels to a numpy array
    image = img_to_array(img)
    # Reshape data for the model
    image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
    # Prepare the image for the VGG model
    image = preprocess_input(image)
    # Get feature vector
    feature = model.predict(image, verbose=0)
    # Get image ID (filename without extension)
    image_id = os.path.splitext(img_name)[0]
    # Store feature as a 1D array
    features[image_id] = feature.flatten()
    
    # Print progress every 100 images
    if (i + 1) % 100 == 0:
        print(f"Processed {i + 1}/{total_images} images")

# Save the extracted features to disk for later use
features_path = os.path.join(WORKING_DIR, 'features.pkl')
with open(features_path, 'wb') as f:
    pickle.dump(features, f)

print(f"Saved features for {len(features)} images to: {features_path}")


In [None]:
# Store Feature In a Pickle File #
pickle.dump(features, open(os.path.join(WORKING_DIR, 'features.pkl'), 'wb'))

In [44]:
#load the features back
features = pickle.load(open(os.path.join(WORKING_DIR, 'features.pkl'), 'rb'))

In [45]:
from tqdm import tqdm
import os

# Read the captions file
with open(os.path.join(Base_DIR, 'captions.txt'), 'r') as f:
    next(f)  # Skip header line if exists
    captions_doc = f.read()

# Create mapping of image to captions
mapping = {}

# Process lines
for line in tqdm(captions_doc.split('\n')):
    # Skip empty lines
    if len(line) < 2:
        continue

    # Split the line into image filename and caption
    tokens = line.split(',', 1)
    if len(tokens) < 2:
        continue

    image_id, caption = tokens[0], tokens[1]
    image_id = image_id.split('.')[0]

    # Store captions
    if image_id not in mapping:
        mapping[image_id] = []
    mapping[image_id].append(caption.strip())


100%|██████████| 40456/40456 [00:00<00:00, 541178.08it/s]


In [46]:
len(mapping)

8091

In [47]:
def clean(mapping):
    for key, captions in mapping.items():
        for i in range(len(captions)):
            caption = captions[i]
            # Convert to lowercase
            caption = caption.lower()
            # Remove punctuation
            caption = caption.replace('[A-Za-z]', '')
            
            caption = ''.join(char for char in caption if char.isalnum() or char.isspace())
            # Remove short words
            caption = ' '.join(word for word in caption.split() if len(word) > 1)
            # Add start and end tokens
            caption = 'startseq ' + caption + ' endseq'
            captions[i] = caption

In [48]:
# before preprocessing of text 
mapping['1000268201_693b08cb0e']

['A child in a pink dress is climbing up a set of stairs in an entry way .',
 'A girl going into a wooden building .',
 'A little girl climbing into a wooden playhouse .',
 'A little girl climbing the stairs to her playhouse .',
 'A little girl in a pink dress going into a wooden cabin .']

In [49]:
# Preprocess the captions
clean(mapping)

In [50]:
mapping['1000268201_693b08cb0e']

['startseq child in pink dress is climbing up set of stairs in an entry way endseq',
 'startseq girl going into wooden building endseq',
 'startseq little girl climbing into wooden playhouse endseq',
 'startseq little girl climbing the stairs to her playhouse endseq',
 'startseq little girl in pink dress going into wooden cabin endseq']

In [51]:
# tokenization the captions
tokenizer = Tokenizer()
tokenizer.fit_on_texts([caption for captions in mapping.values() for caption in captions])
vocab_size = len(tokenizer.word_index) + 1

In [52]:
vocab_size

8811

In [53]:
# maximum length of caption
max_length = max(len(caption.split()) for captions in mapping.values() for caption in captions)
max_length

34

In [54]:
# Train Test Split #
image_ids = list(mapping.keys())
split = int(0.9 * len(image_ids))
train = image_ids[:split]
test = image_ids[split:]

In [55]:
# Cereate datagenerator to get batches of data for training
def data_generator(data_keys ,mapping, features, tokenizer, max_length, batch_size):
    while True:
        np.random.shuffle(data_keys)
        for i in range(0, len(data_keys), batch_size):
            X1, X2, y = [], [], []
            n=0 
            batch_keys = data_keys[i:i + batch_size]
            for key in batch_keys:
                n += 1
                captions = mapping[key]
                feature = features[key]
                for caption in captions:
                    seq = tokenizer.texts_to_sequences([caption])[0]
                    for j in range(1, len(seq)):
                        in_seq, out_seq = seq[:j], seq[j]
                        in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
                        out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
                        X1.append(feature[key][0])
                        X2.append(in_seq)
                        y.append(out_seq)
                if n == batch_size:
                        X1, X2, y = np.array(X1), np.array(X2), np.array(y)
                        yield [X1, X2], y
                        X1, X2, y = [], [], []
                        n=0
                        

In [61]:
# model creation 

# image feature extractor model - Encoder 
inputs1 = Input(shape=(4096,))
fe1 = Dropout(0.4)(inputs1)
fe2 = Dense(256, activation='relu')(fe1)
# Sequence model -
inputs2 = Input(shape=(max_length,))
se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
se2 = Dropout(0.4)(se1)
se3 = LSTM(256)(se2)
# Decoder (feed both models)
decoder1 = add([fe2, se3])
decoder2 = Dense(256, activation='relu')(decoder1)
outputs = Dense(vocab_size, activation='softmax')(decoder2)


model = Model(inputs=[inputs1, inputs2], outputs=outputs)
model.compile(loss='categorical_crossentropy', optimizer='adam')

# model.summary()
plot_model(model,show_shapes=True)



You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model to work.


Collecting pydot
  Downloading pydot-4.0.1-py3-none-any.whl.metadata (11 kB)
Downloading pydot-4.0.1-py3-none-any.whl (37 kB)
Installing collected packages: pydot
Successfully installed pydot-4.0.1
