In [3]:
import os
from collections import defaultdict
import pandas as pd
import time
from google import genai
from google.genai import types
from tqdm import tqdm
import csv

In [None]:
# Set up the API client.
# The key is read from the GEMINI_API_KEY environment variable so that it is never
# stored in (or committed with) the notebook. When the variable is unset the client
# receives an empty key, exactly as before.
client = genai.Client(api_key=os.environ.get("GEMINI_API_KEY", ""))

# Set the parent directory containing subfolders
parent_dir = "/Users/aryanrastogi/college/vr/vr_major/categorized_data"

In [5]:
# File suffixes (lowercase, leading dot included) that count as images
image_extensions = {'.jpg', '.jpeg', '.png', '.webp'}

# Maps each subfolder name to the number of image files directly inside it
folder_image_counts = defaultdict(int)

# Visit every immediate child of parent_dir and tally the images it holds
for entry_name in os.listdir(parent_dir):
    entry_path = os.path.join(parent_dir, entry_name)
    if not os.path.isdir(entry_path):
        continue  # plain files at the top level are not categories

    image_names = [
        name for name in os.listdir(entry_path)
        if os.path.splitext(name)[1].lower() in image_extensions
    ]
    folder_image_counts[entry_name] = len(image_names)

# Order the (folder, count) pairs from the largest folder to the smallest
sorted_folders = sorted(
    folder_image_counts.items(),
    key=lambda item: item[1],
    reverse=True,
)

# Report the ranking
print("Subfolders sorted by number of images:")
for folder_name, n_images in sorted_folders:
    print(f"{folder_name}: {n_images} images")


Subfolders sorted by number of images:
CELLULAR_PHONE_CASE: 63996 images
GROCERY: 3998 images
SHOES: 3129 images
HOME: 2513 images
CHAIR: 1717 images
HOME_FURNITURE_AND_DECOR: 1436 images
HOME_BED_AND_BATH: 1325 images
FINENECKLACEBRACELETANKLET: 1120 images
SOFA: 1000 images
HEALTH_PERSONAL_CARE: 969 images
FINEEARRING: 915 images
OFFICE_PRODUCTS: 857 images
TABLE: 837 images
PET_SUPPLIES: 815 images
ACCESSORY: 755 images
HANDBAG: 747 images
KITCHEN: 654 images
SPORTING_GOODS: 647 images
HARDWARE_HANDLE: 625 images
EARRING: 613 images
RUG: 591 images
LAMP: 543 images
LIGHT_BULB: 540 images
OUTDOOR_LIVING: 505 images
LIGHT_FIXTURE: 503 images
FINERING: 453 images
NECKLACE: 431 images
JANITORIAL_SUPPLY: 407 images
SUITCASE: 390 images
HAT: 352 images
TOOLS: 349 images
STOOL_SEATING: 344 images
OTTOMAN: 330 images
BEAUTY: 320 images
PORTABLE_ELECTRONIC_DEVICE_COVER: 310 images
FOOD_SERVICE_SUPPLY: 271 images
SHELF: 261 images
DRINKING_CUP: 257 images
PLANTER: 248 images
AUTO_ACCESSORY: 2

In [45]:
# Instruction text sent to the model with every image. It asks for 5-10 question/answer
# pairs, one per line, with the question and the single-word answer separated by ` ### `.
# process_category_images() splits the reply on that separator, so the format rules below
# and the parser must stay in sync.
BASE_PROMPT = """
**Role:** You are an expert AI assistant tasked with generating high-quality data for a Visual Question Answering (VQA) dataset.

**Input:** You will be provided with an image.

**Task:** Analyze the input image thoroughly and generate between 5 and 10 diverse, visually grounded question-answer pairs based *solely* on the image content.

**Strict Constraints:**
1.  **Question Phrasing & Length:** Phrase questions clearly and naturally, like a human would ask. Aim for grammatical completeness while remaining relatively concise (ideally 4-12 words).
2.  **Answer Format:** Each answer MUST be strictly a SINGLE WORD (e.g., 'red', 'car', 'three', 'yes', 'no', 'left', 'running', 'outdoor', 'sunny', 'sign_text_if_one_word'). Do NOT use phrases or sentences. Convert numerical counts to digits (e.g., '3'). Use 'yes' or 'no' for presence questions. Ensure answers are lowercase unless the answer is derived from text in the image (like a proper noun or acronym).
3.  **Quantity:** Generate 5 to 10 unique Q&A pairs.
4.  **Highly Important:** Limit yes/no questions to **no more than 2** per image.
5.  **Formatting for Parsing:**
    *   Place each Question-Answer pair on a **new line**.
    *   Separate the Question from its Answer using the exact separator sequence: ` ### ` (space, hash, hash, hash, space).
    *   **Crucially, do not include this separator sequence anywhere else** in the question or the answer itself.
    *   *Example of correct format:*
        ```
        What color is the main vehicle? ### red
        How many wheels are visible? ### 4
        Is the road wet? ### no
        ```
**Quality & Diversity Guidelines (Crucial):**
*   **Visual Grounding:** Absolutely critical. Both the question and the answer must be directly derivable from visible elements or obvious properties within the image. Do NOT use external knowledge or make complex inferences not explicitly visible.
*   **Answerability & Clarity:** Questions must be unambiguous and have a clear, verifiable single-word answer within the image. If a clear single-word answer isn't possible for a concept, do not ask that question.
*   **Natural Phrasing:** Phrase questions as complete, natural-sounding questions where appropriate, rather than minimal fragments (e.g., prefer 'What color are the leaves?' over just 'Leaf color?'). Ensure the question still logically leads to a single-word answer.
*   **Diversity:** Generate a mix of question types to test different visual understanding skills, *while adhering to the single-word answer constraint*. Aim to include several from this list where applicable and answerable with one word:
    *   **Object Identification**
    *   **Object Presence (Yes/No)**
    *   **Attribute Identification**
    *   **Counting**
    *   **Simple Spatial Relation/Location**
    *   **Scene Type/Setting**
    *   **Simple Action/Activity**
    *   **Text Recognition (OCR)**
"""

In [7]:
def prepare_prompt_from_image_path(image_path):
    """
    Build the full text prompt for one image: BASE_PROMPT plus the image's metadata.

    The image is expected to sit in a category folder that also contains a CSV
    named ``<folder name>.csv``. The first CSV row whose ``path`` column has
    exactly the same file name as the image supplies the metadata; every
    non-null column of that row is rendered as ``column=value`` and the pieces
    are joined with `` | ``.

    Args:
        image_path: Path to the image (absolute or relative). Only the path is
            inspected here; the image file itself is not opened.

    Returns:
        str: The stripped BASE_PROMPT followed by the metadata line and the
        closing instruction.

    Raises:
        FileNotFoundError: If the category CSV is missing.
        ValueError: If no CSV row refers to this image's file name.
    """
    image_path = os.path.abspath(image_path)
    image_filename = os.path.basename(image_path)

    # Get category name from parent directory
    category_dir = os.path.dirname(image_path)
    category_name = os.path.basename(category_dir)
    csv_path = os.path.join(category_dir, f"{category_name}.csv")

    if not os.path.exists(csv_path):
        raise FileNotFoundError(f"Expected CSV file {category_name}.csv not found in: {category_dir}")

    df = pd.read_csv(csv_path)

    def _basename(cell):
        # Non-string cells (e.g. NaN for a missing path) can never name a file.
        if not isinstance(cell, str):
            return None
        # Accept both separators so the match does not depend on how the CSV was produced.
        return cell.replace('\\', '/').rsplit('/', 1)[-1]

    # Compare whole file names. A suffix test would let "1.jpg" match "ab/a1.jpg"
    # and attach another image's metadata to this prompt.
    is_match = df['path'].map(lambda cell: _basename(cell) == image_filename).astype(bool)
    matched_row = df[is_match]

    if matched_row.empty:
        raise ValueError(f"No matching metadata row found for image: {image_filename}")

    row_data = matched_row.iloc[0]
    # Skip empty cells so the prompt is not padded with "column=nan" noise.
    metadata_str = " | ".join(f"{col}={row_data[col]}" for col in df.columns if pd.notna(row_data[col]))

    full_prompt = BASE_PROMPT.strip() + f"\n\n**Image Metadata:** {metadata_str}\n\n**Instruction:** Now, analyze the provided image carefully and generate 5-10 Q&A pairs following ALL the above instructions precisely. Use the ` ### ` separator exactly as specified. Ensure questions sound relatively natural and every answer is strictly one word and directly verifiable from the image."

    return full_prompt


def generate_questions_for_image(image_path):
    """
    Send one image together with its metadata-enriched prompt to the model.

    Args:
        image_path: Path to the image file. Its bytes are uploaded inline and its
            extension decides the MIME type declared to the API.

    Returns:
        The ``text`` attribute of the API response: the raw reply, which the
        prompt asks to be formatted as ``question ### answer`` lines.
    """
    # Declare the real image format; the pipeline feeds .png files as well as JPEGs.
    mime_by_extension = {
        '.jpg': 'image/jpeg',
        '.jpeg': 'image/jpeg',
        '.png': 'image/png',
        '.webp': 'image/webp',
    }
    extension = os.path.splitext(image_path)[1].lower()
    # Unknown extensions keep the previous behaviour of being sent as JPEG.
    mime_type = mime_by_extension.get(extension, 'image/jpeg')

    with open(image_path, 'rb') as f:
        image_bytes = f.read()

    prompt = prepare_prompt_from_image_path(image_path)
    response = client.models.generate_content(
        model='gemini-2.0-flash',
        contents=[
            types.Part.from_bytes(
                data=image_bytes,
                mime_type=mime_type,
            ), prompt]
    )
    return response.text



def _parse_qa_pairs(response_text, max_pairs=10):
    """Extract up to ``max_pairs`` (question, answer) tuples from the raw model reply."""
    pairs = []
    for line in response_text.strip().split('\n'):
        question, separator, remainder = line.partition('###')
        # Require exactly one separator: reject lines without it, runs of four or
        # more hashes, and lines where the separator appears a second time.
        if not separator or remainder.startswith('#') or '###' in remainder:
            continue
        # Stripping here tolerates replies that omit the spaces around the separator.
        question, answer = question.strip(), remainder.strip()
        if question and answer:
            pairs.append((question, answer))
    return pairs[:max_pairs]


def process_category_images(category_dir, output_csv, max_images=4000, delay_seconds=4):
    """
    Generate Q&A pairs for the images under a category directory and append them to a CSV.

    Each processed image becomes one row: ``image_path`` followed by
    ``question_1..question_10`` and ``answer_1..answer_10`` (unused slots stay
    empty). Images whose path already appears in ``output_csv`` are skipped, so
    an interrupted run can be resumed by calling the function again.

    Args:
        category_dir: Directory searched recursively for .jpg/.jpeg/.png files
            (extension matched case-insensitively).
        output_csv: CSV file to append to; created with a header row when it is
            missing or empty.
        max_images: Upper bound on the number of discovered images considered
            per call. ``None`` removes the limit.
        delay_seconds: Pause after each successful request, used as a simple
            rate limit.

    Returns:
        None. Failures for individual images are printed and the loop moves on.
    """
    # Resume support: remember what earlier runs already wrote. A zero-byte file
    # (e.g. left by a run that died before the header) is treated as a fresh file.
    has_existing_rows = os.path.exists(output_csv) and os.path.getsize(output_csv) > 0
    if has_existing_rows:
        processed_images = set(pd.read_csv(output_csv)['image_path'].tolist())
    else:
        processed_images = set()

    # Collect every image below the category directory.
    image_suffixes = ('.jpg', '.jpeg', '.png')
    image_paths = [
        os.path.join(subdir, file)
        for subdir, _, files in os.walk(category_dir)
        for file in files
        if file.lower().endswith(image_suffixes)
    ]

    # Cap the amount of work per category.
    image_paths = image_paths[:max_images]

    # Define column headers
    question_cols = [f"question_{i}" for i in range(1, 11)]
    answer_cols = [f"answer_{i}" for i in range(1, 11)]
    fieldnames = ['image_path'] + question_cols + answer_cols

    # Explicit UTF-8 keeps non-ASCII answers (e.g. text read from the image)
    # writable on every platform and readable by pd.read_csv on resume.
    with open(output_csv, 'a', newline='', encoding='utf-8') as f_out:
        writer = csv.DictWriter(f_out, fieldnames=fieldnames)

        if not has_existing_rows:
            writer.writeheader()  # Header only when starting a new file

        for img_path in tqdm(image_paths, desc=f"Processing {category_dir}", ncols=100):
            if img_path in processed_images:
                continue

            try:
                response_text = generate_questions_for_image(img_path)
                if response_text is None:
                    # Reported below like any other per-image failure; no row is
                    # written, so the image is retried on the next run.
                    raise ValueError("model returned no text")

                row = {'image_path': img_path}
                for i, (q, a) in enumerate(_parse_qa_pairs(response_text), start=1):
                    row[f'question_{i}'] = q
                    row[f'answer_{i}'] = a

                writer.writerow(row)
                # Push the row to disk now so an interrupted kernel loses nothing.
                f_out.flush()
                processed_images.add(img_path)
                time.sleep(delay_seconds)  # Rate limit

            except Exception as e:
                print(f"Error processing {img_path}: {e}")


In [77]:
# Category to process: one subfolder of parent_dir. Change the folder name for each run.
category_dir = os.path.join(parent_dir, 'RING')
# Output file for Q&A data; it sits next to the categorized_data folder and is appended to.
output_csv = os.path.join(os.path.dirname(parent_dir), 'VR_QA_data3.csv')
process_category_images(category_dir, output_csv)

Processing /Users/aryanrastogi/college/vr/vr_major/categorized_data/RING: 100%|█| 122/122 [00:19<00:


In [37]:
# Load a generated Q&A CSV (path is relative to the notebook's working directory)
# and show its (rows, columns) size as a quick progress check.
# NOTE(review): this reads VR_QA_data2.csv, while the processing cell writes to
# VR_QA_data3.csv — confirm this is the file you mean to inspect.
df = pd.read_csv("VR_QA_data2.csv")
df.shape

(1501, 21)

In [None]:
DESK: 162 images   --DONE
VITAMIN: 152 images  --DONE
COFFEE: 152 images --DONE
COMPUTER_ADD_ON: 151 images  --DONE

BOOT: 125 images  --DONE
ABIS_DRUGSTORE: 123 images  --DONE
CLOCK: 123 images --DONE
RING: 122 images -- DONE