In [1]:
# Step 1: Imports
import json
import os
import re
import unicodedata

import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm

In [2]:
# Step 2: File paths (edit if needed)
# All paths are relative, so they resolve against the kernel's current working directory.
# Caption annotations JSON, read in Step 3.
ANNOTATIONS_PATH = '../data/annotations/captions_train2017.json'
# Image directory — presumably where the train2017 images live; TODO confirm where it is consumed.
IMAGE_FOLDER = '../data/train2017'
# Destination of the image/caption CSV written in Step 7.
OUTPUT_CSV_PATH = '../data/captions.csv'

In [None]:
# Step 3: Load COCO annotations
# JSON is UTF-8 by specification. Pass the encoding explicitly so decoding does not depend on
# the platform's locale encoding (which is not UTF-8 on every system).
with open(ANNOTATIONS_PATH, 'r', encoding='utf-8') as f:
    coco_data = json.load(f)

# Quick sanity check on how many caption records were read.
print("Loaded", len(coco_data['annotations']), "captions.")

In [None]:
# Step 4: Create mapping: image_id → file_name
# Lookup table keyed by each image record's 'id', holding that record's 'file_name'.
# If an id appears more than once, the last record wins.
image_id_to_file = {
    image_record['id']: image_record['file_name']
    for image_record in coco_data['images']
}


In [None]:
# Step 5: Create list of image-caption pairs
# `image_paths[i]` and `captions[i]` describe the same annotation; both lists grow in lockstep.
image_paths = []
captions = []
skipped_count = 0  # annotations whose image_id has no usable file name in the lookup table

for ann in tqdm(coco_data['annotations']):
    img_id = ann['image_id']
    caption = ann['caption']
    file_name = image_id_to_file.get(img_id)

    # Annotations without a matching (non-empty) file name cannot be paired with an image.
    # Count them instead of dropping them silently so data loss is visible.
    if not file_name:
        skipped_count += 1
        continue

    # NOTE(review): the folder name is hard-coded here rather than derived from IMAGE_FOLDER;
    # the stored path is relative ('train2017/<file_name>'). Confirm what downstream readers expect.
    image_path = os.path.join('train2017', file_name)
    image_paths.append(image_path)
    captions.append(caption)

# Only report when something was actually dropped, so a clean run prints nothing extra.
if skipped_count:
    print(f"Skipped {skipped_count} captions with no matching image file name.")

In [None]:
# Step 6: Clean captions
def clean_caption(caption):
    """Normalize a raw caption and wrap it in sequence markers.

    Steps: lowercase, fold accented/compatibility characters to their ASCII
    base letters, drop every character that is not a-z, 0-9 or whitespace,
    collapse whitespace runs to a single space, trim, then wrap the result
    as ``"<start> ... <end>"``.

    Args:
        caption: Raw caption text (``str``).

    Returns:
        The cleaned caption surrounded by ``<start>`` and ``<end>`` markers.
        An empty or punctuation-only caption yields ``"<start>  <end>"``.
    """
    caption = caption.lower()
    # NFKD splits accented letters into base letter + combining mark; the ASCII-only filter
    # below then removes just the mark, so "café" becomes "cafe" instead of "caf".
    caption = unicodedata.normalize("NFKD", caption)
    caption = re.sub(r"[^a-z0-9\s]", "", caption)
    caption = re.sub(r"\s+", " ", caption)
    caption = caption.strip()
    caption = f"<start> {caption} <end>"
    return caption

# Run the cleaner over every caption, rebinding `captions` to the cleaned list.
# NOTE: because the input name is reused, re-running this cell without re-running Step 5
# cleans the already-cleaned text again and wraps it in a second pair of markers.
captions = list(map(clean_caption, captions))

In [None]:
# Step 7: Save to CSV
# One row per pair: 'image' holds the stored image path, 'caption' the text currently in `captions`.
caption_columns = {'image': image_paths, 'caption': captions}
df = pd.DataFrame(caption_columns)

# index=False keeps the pandas row index out of the file.
df.to_csv(OUTPUT_CSV_PATH, index=False)

print(f"✅ Saved {len(df)} image-caption pairs to {OUTPUT_CSV_PATH}")
# Preview the first rows as the cell's rich output.
df.head()