### Import the annotation file before running the code
### You can find the annotation file here: [annotation file](https://www.kaggle.com/datasets/deevanshik/annotation-file)

In [4]:
import json

# Read the annotation JSON into memory.
annotation_path = "/kaggle/input/annotation-file/instances_train2017_seen.json"
with open(annotation_path, "r") as annotation_file:
    coco_data = json.load(annotation_file)

# Lookup table: category id -> category name, built from the 'categories' list.
base_categories = dict(
    (category['id'], category['name']) for category in coco_data['categories']
)


In [13]:
# Sample up to 2k image ids at random from the annotation file.
import random

SAMPLE_SIZE = 2000  # number of image ids to draw
SAMPLE_SEED = 42    # fixed seed so a fresh-kernel re-run reproduces the same sample

# List of all the image ids in the file
ids = [img["id"] for img in coco_data["images"]]

# A dedicated Random instance makes the draw reproducible without touching the
# global `random` state. Clamping the size with min() avoids the ValueError that
# random.sample raises when the population is smaller than the requested sample.
sampled_image_ids = random.Random(SAMPLE_SEED).sample(ids, min(SAMPLE_SIZE, len(ids)))

In [7]:
import os

# Folder that receives the generated prompt files; an existing folder is reused.
output_dir = "output_texts/"
os.makedirs(name=output_dir, exist_ok=True)

In [14]:
import os
from collections import defaultdict

# Group annotations by image id once, so the per-image lookup below is a dict access
# instead of a scan over every annotation.
image_to_annotations = defaultdict(list)
for ann in coco_data['annotations']:
    image_to_annotations[ann['image_id']].append(ann)

# Category id -> category name lookup.
id_to_name = dict(base_categories)

# Write one prompt file per sampled image.
for image_id in sampled_image_ids:

    # .get() returns an empty list for images without annotations, without
    # inserting a new key into the defaultdict as indexing would.
    annotations = image_to_annotations.get(image_id, [])

    # Unique category ids present in this image.
    object_categories = {ann['category_id'] for ann in annotations}
    # Sort the ids so the prompt wording does not depend on set iteration order.
    object_names = [id_to_name[cat_id] for cat_id in sorted(object_categories)]

    # Construct the prompt
    if object_names:
        prompt = f"A photograph of {' and '.join(object_names)}"
    else:
        prompt = "An image without objects of interest."

    # Save the prompt in a .txt file named after the image id.
    # (The former `idx = idx+1` line was removed: `idx` was never defined, so it
    # raised NameError on a fresh kernel, and its value was never used.)
    txt_file_name = os.path.join(output_dir, f"{image_id}.txt")
    with open(txt_file_name, "w") as txt_file:
        txt_file.write(prompt)

# Show the prompt written for the last sampled image as a sanity check.
print("A sample of prompt is: ")
print(prompt)
print("Prompts saved as .txt files.")


A sample of prompt is: 
A photograph of person and car and bench
Prompts saved as .txt files.


In [15]:
import shutil

# Zip the prompt folder into output_texts.zip; the archive path returned by
# make_archive is the cell's displayed output.
shutil.make_archive(base_name="output_texts", format="zip", root_dir="output_texts")

'/kaggle/working/output_texts.zip'