In [1]:
import os
import string
import re
import numpy as np
import matplotlib.pyplot as plt

from collections import Counter
from tqdm import tqdm


In [5]:
dataset_path = "Data/Images"
caption_path = "Data/captions.txt"

In [9]:
with open("Data/captions.txt", "r") as file:
    for i in range(5):
        print(repr(file.readline()))


'image,caption\n'
'1000268201_693b08cb0e.jpg,A child in a pink dress is climbing up a set of stairs in an entry way .\n'
'1000268201_693b08cb0e.jpg,A girl going into a wooden building .\n'
'1000268201_693b08cb0e.jpg,A little girl climbing into a wooden playhouse .\n'
'1000268201_693b08cb0e.jpg,A little girl climbing the stairs to her playhouse .\n'


In [10]:
import csv

def load_captions(filename):
    captions_dict = {}
    with open(filename, 'r') as file:
        reader = csv.reader(file)
        next(reader)  # skip header: image,caption
        for row in reader:
            if len(row) != 2:
                continue  # skip malformed lines
            image_id, caption = row
            image_id = image_id.strip()
            caption = caption.strip()

            if image_id not in captions_dict:
                captions_dict[image_id] = []
            captions_dict[image_id].append(caption)
    return captions_dict


In [11]:
caption_path = "Data/captions.txt"
captions_dict = load_captions(caption_path)
print(f"Loaded {len(captions_dict)} image captions.")


Loaded 8091 image captions.


In [12]:
import re

def clean_caption(caption):
    caption = caption.lower()
    caption = re.sub(r'[^\w\s]', '', caption)  # remove punctuation
    caption = re.sub(r'\s+', ' ', caption)     # remove extra spaces
    caption = caption.strip()
    return caption


In [13]:
for img_id, captions in captions_dict.items():
    cleaned = [clean_caption(c) for c in captions]
    captions_dict[img_id] = [f"<start> {c} <end>" for c in cleaned]


In [14]:
all_captions = []
for captions in captions_dict.values():
    all_captions.extend(captions)


In [15]:
from collections import Counter

word_counts = Counter()
for caption in all_captions:
    word_counts.update(caption.split())

# Filter out rare words (occurring less than 5 times)
threshold = 5
vocab = [word for word, count in word_counts.items() if count >= threshold]

# Add special tokens
vocab = ['<pad>', '<start>', '<end>', '<unk>'] + vocab
print(f"Vocabulary size: {len(vocab)}")


Vocabulary size: 2997


In [16]:
word_to_index = {word: i for i, word in enumerate(vocab)}
index_to_word = {i: word for word, i in word_to_index.items()}

# Save for later use
import numpy as np
np.save('Data/word_to_index.npy', word_to_index)
np.save('Data/index_to_word.npy', index_to_word)


In [17]:
caption_lengths = [len(caption.split()) for caption in all_captions]
MAX_LENGTH = max(caption_lengths)  # or use np.percentile(caption_lengths, 95)
print("Max caption length:", MAX_LENGTH)


Max caption length: 38


In [18]:
def caption_to_seq(caption, word_to_index, max_length):
    tokens = caption.split()
    seq = [word_to_index.get(word, word_to_index['<unk>']) for word in tokens]
    
    # Pad or truncate
    if len(seq) < max_length:
        seq += [word_to_index['<pad>']] * (max_length - len(seq))
    else:
        seq = seq[:max_length]
        
    return seq


In [19]:
all_sequences = []

for captions in captions_dict.values():
    for caption in captions:
        seq = caption_to_seq(caption, word_to_index, MAX_LENGTH)
        all_sequences.append(seq)

all_sequences = np.array(all_sequences)
print("All caption sequences shape:", all_sequences.shape)


All caption sequences shape: (40455, 38)


In [20]:
np.save("Data/captions_sequences.npy", all_sequences)


PRE PROCESSING IMAGE

In [21]:
import os
import numpy as np
from tqdm import tqdm
from tensorflow.keras.applications.inception_v3 import InceptionV3, preprocess_input
from tensorflow.keras.preprocessing import image
from tensorflow.keras.models import Model


In [22]:
# Load InceptionV3 and remove the final classification layer
base_model = InceptionV3(weights='imagenet')
model = Model(inputs=base_model.input, outputs=base_model.get_layer('avg_pool').output)


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/inception_v3/inception_v3_weights_tf_dim_ordering_tf_kernels.h5
[1m96112376/96112376[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 0us/step


In [23]:
def preprocess_image(img_path):
    img = image.load_img(img_path, target_size=(299, 299))
    img_array = image.img_to_array(img)
    img_array = np.expand_dims(img_array, axis=0)
    return preprocess_input(img_array)


In [24]:
image_dir = "Data/Images"  # your image folder

def extract_features(image_dir):
    features = {}
    for img_name in tqdm(os.listdir(image_dir)):
        img_path = os.path.join(image_dir, img_name)

        try:
            img_tensor = preprocess_image(img_path)
            feature_vector = model.predict(img_tensor, verbose=0)
            features[img_name] = feature_vector.flatten()
        except Exception as e:
            print(f"Error processing {img_name}: {e}")
    
    return features

features = extract_features(image_dir)
print(f"Extracted features for {len(features)} images.")


100%|██████████| 8091/8091 [30:58<00:00,  4.35it/s]

Extracted features for 8091 images.





In [25]:
np.save("Data/image_features.npy", features)
