In [1]:
import os
import re
from collections import defaultdict
import pickle

# Path to the captions text file; load_captions() expects one
# "image.jpg,Caption" pair per line, optionally preceded by a header line.
caption_file = 'data/captions.txt'

def load_captions(filename):
    """
    Load all image captions from a text file into a dictionary.

    Each data line is expected to look like ``image.jpg,Caption``. Only the
    first comma separates the image id from the caption, so a caption may
    itself contain commas. The file is not parsed as CSV: quote characters,
    if any, are kept verbatim in the caption.

    Skipped lines:
      * blank lines,
      * lines starting with ``image,caption`` (case-insensitive header),
      * lines that contain no comma at all.

    Args:
        filename: Path to the captions file, UTF-8 encoded (with or without
            a byte-order mark).

    Returns:
        A ``defaultdict(list)`` mapping each image id to the list of its
        captions, in the order they appear in the file.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    captions_dict = defaultdict(list)

    # 'utf-8-sig' transparently drops a leading BOM (and reads BOM-less files
    # exactly like 'utf-8'). With plain 'utf-8' the BOM stays glued to the
    # first line, so the header check fails and the header is stored as a
    # caption under the key '\ufeffimage'.
    with open(filename, 'r', encoding='utf-8-sig') as file:
        for line in file:
            line = line.strip()
            if not line or line.lower().startswith("image,caption"):
                continue  # Skip empty lines or header

            # Split at the first comma only; `separator` is empty when the
            # line has no comma, i.e. it is malformed.
            image_id, separator, caption = line.partition(',')
            if not separator:
                continue

            image_id = image_id.strip()
            caption = caption.strip().replace('\r', '')

            captions_dict[image_id].append(caption)

    return captions_dict


def clean_caption(caption):
    """
    Normalise a single caption.

    The caption is lowercased, every character that is neither an ASCII
    letter ``a``-``z`` nor whitespace is removed (punctuation, digits,
    accented letters, ...), and runs of whitespace are collapsed to a single
    space with no leading or trailing space.

    Args:
        caption: The raw caption string.

    Returns:
        The cleaned caption; an empty string if nothing but punctuation,
        digits or whitespace was present.
    """
    caption = caption.lower()
    # Keep whitespace while dropping everything else that is not a-z, so that
    # tabs/newlines still separate words instead of gluing them together.
    caption = re.sub(r"[^a-z\s]+", "", caption)
    # Removing a free-standing punctuation token ("dog , running") leaves two
    # adjacent spaces; split/join collapses every whitespace run and also
    # trims both ends.
    return " ".join(caption.split())

def clean_all_captions(captions_dict):
    """
    Apply cleaning to all captions in the dictionary.
    """
    cleaned = dict()
    for img_id, captions in captions_dict.items():
        cleaned[img_id] = [clean_caption(c) for c in captions]
    return cleaned

# Parse the raw caption file, then normalise every caption.
captions_dict = load_captions(caption_file)
cleaned_captions = clean_all_captions(captions_dict)

print(f"✅ Total Images Loaded: {len(cleaned_captions)}\n")

# Preview: show the first three images together with their cleaned captions.
for img, caps in list(cleaned_captions.items())[:3]:
    print(f"{img} → {caps}")

# Serialise the cleaned captions to disk with pickle, creating the target
# directory first if it does not exist yet.
output_path = 'pkl/cleaned_captions.pkl'
os.makedirs(os.path.dirname(output_path), exist_ok=True)

with open(output_path, 'wb') as f:
    pickle.dump(cleaned_captions, f)

print(f"\nCleaned captions saved to: {output_path}")

✅ Total Images Loaded: 8091

1000268201_693b08cb0e.jpg → ['a child in a pink dress is climbing up a set of stairs in an entry way', 'a girl going into a wooden building', 'a little girl climbing into a wooden playhouse', 'a little girl climbing the stairs to her playhouse', 'a little girl in a pink dress going into a wooden cabin']
1001773457_577c3a7d70.jpg → ['a black dog and a spotted dog are fighting', 'a black dog and a tricolored dog playing with each other on the road', 'a black dog and a white dog with brown spots are staring at each other in the street', 'two dogs of different breeds looking at each other on the road', 'two dogs on pavement moving toward each other']
1002674143_1b742ab4b8.jpg → ['a little girl covered in paint sits in front of a painted rainbow with her hands in a bowl', 'a little girl is sitting in front of a large painted rainbow', 'a small girl in the grass plays with fingerpaints in front of a white canvas with a rainbow on it', 'there is a girl with pigtai