In [1]:
import os
import json
from tqdm import tqdm
from PIL import Image

def find_missing_or_invalid_images(json_path: str, base_path: str) -> tuple:
    """
    Split the items of a JSON/JSONL dataset by whether their image is usable.

    For every item, the value stored under "image" is joined onto ``base_path``;
    the resulting file must exist and pass ``PIL.Image.verify`` to be kept.

    Args:
        json_path (str): Path to a ``.json`` file (loaded with ``json.load``) or a
            ``.jsonl`` file (one JSON document per non-blank line).
        base_path (str): Base directory to be prepended to the "image" path in each item.

    Returns:
        tuple: ``(missing_or_invalid_images, new_data)``.
            ``missing_or_invalid_images`` is the list of relative image paths that
            are missing on disk or failed verification; ``new_data`` is the list
            of items whose image exists and verified cleanly.
            Items with no (or an empty) "image" value appear in neither list.

    Raises:
        ValueError: If ``json_path`` ends with neither ``.json`` nor ``.jsonl``.
    """
    # Reject unsupported inputs up front instead of failing later on an
    # unbound ``data`` variable.
    if json_path.endswith(".jsonl"):
        with open(json_path, "r", encoding="utf-8") as f:
            # Skip blank lines (e.g. a trailing newline at end of file).
            data = [json.loads(line) for line in f if line.strip()]
    elif json_path.endswith(".json"):
        with open(json_path, "r", encoding="utf-8") as f:
            data = json.load(f)
    else:
        raise ValueError(
            f"Unsupported file type (expected .json or .jsonl): {json_path}"
        )
    print(f"Loaded {len(data)} items from {json_path}")

    missing_or_invalid_images = []
    new_data = []

    for item in tqdm(data, total=len(data), desc="Checking images"):
        image_relative_path = item.get("image", "")
        if not image_relative_path:
            # No image reference: nothing to check, and the item is not kept.
            continue
        image_full_path = os.path.join(base_path, image_relative_path)

        if not os.path.exists(image_full_path):
            missing_or_invalid_images.append(image_relative_path)
            continue

        try:
            with Image.open(image_full_path) as img:
                img.verify()  # verify doesn't load full image but checks integrity
        except Exception:
            # Any failure to open or verify is treated as "invalid image";
            # the broad catch is deliberate so one bad file cannot abort the scan.
            missing_or_invalid_images.append(image_relative_path)
        else:
            new_data.append(item)

    return missing_or_invalid_images, new_data


In [2]:
# Validate the images referenced by llava_food_dataset_sample.json.
json_path = "/home/aikusrv04/aiku/small_korean_vlm/data/korean_food/llava_food_dataset_sample.json"
base_directory = "/home/aikusrv04/aiku/small_korean_vlm/data"
# The function returns (bad_paths, valid_items). Unpack both: testing the
# tuple itself would always be truthy and the loop would print two lists.
missing_or_corrupt, new_data = find_missing_or_invalid_images(json_path, base_directory)
if missing_or_corrupt:
    print("Missing or invalid images:")
    for img in missing_or_corrupt:
        print(img)
    print(f"Total missing or invalid images: {len(missing_or_corrupt)}")
    print(f"Total valid items: {len(new_data)}")
else:
    print("All images exist and are valid.")


Loaded 300 items from /home/aikusrv04/aiku/small_korean_vlm/data/korean_food/llava_food_dataset_sample.json


Checking images: 100%|██████████| 300/300 [00:00<00:00, 5278.02it/s]

All images exist and are valid.





In [56]:
# Validate the images referenced by korean_object_captioning.json.
json_path = "/home/aikusrv04/aiku/small_korean_vlm/data/korean_object/korean_object_captioning.json"
base_directory = "/home/aikusrv04/aiku/small_korean_vlm/data"
missing_or_corrupt, new_data = find_missing_or_invalid_images(json_path, base_directory)

if not missing_or_corrupt:
    print("All images exist and are valid.")
else:
    # List every offending relative path, then the summary counts.
    print("Missing or invalid images:")
    for img in missing_or_corrupt:
        print(img)
    print(f"Total missing or invalid images: {len(missing_or_corrupt)}")
    print(f"Total valid items: {len(new_data)}")


Loaded 2364 items from /home/aikusrv04/aiku/small_korean_vlm/data/korean_object/korean_object_captioning.json


Checking images: 100%|██████████| 2364/2364 [00:00<00:00, 3314.53it/s]

All images exist and are valid.





In [57]:
# Validate the images referenced by korean_ocr_education_sampled.json.
json_path = "/home/aikusrv04/aiku/small_korean_vlm/data/korean_ocr_education/korean_ocr_education_sampled.json"
base_directory = "/home/aikusrv04/aiku/small_korean_vlm/data"
missing_or_corrupt, new_data = find_missing_or_invalid_images(json_path, base_directory)

if not missing_or_corrupt:
    print("All images exist and are valid.")
else:
    # Only the header and the counts are reported; individual paths are not listed.
    print("Missing or invalid images:")
    print(f"Total missing or invalid images: {len(missing_or_corrupt)}")
    print(f"Total valid items: {len(new_data)}")


Loaded 51485 items from /home/aikusrv04/aiku/small_korean_vlm/data/korean_ocr_education/korean_ocr_education_sampled.json


Checking images: 100%|██████████| 51485/51485 [04:41<00:00, 182.97it/s]

All images exist and are valid.





In [52]:
# Validate the images referenced by korean_visualization_qa_train.json.
json_path = "/home/aikusrv04/aiku/small_korean_vlm/data/korean_visualization_qa/korean_visualization_qa_train.json"
base_directory = "/home/aikusrv04/aiku/small_korean_vlm/data"
missing_or_corrupt, new_data = find_missing_or_invalid_images(json_path, base_directory)

if not missing_or_corrupt:
    print("All images exist and are valid.")
else:
    # Only the header and the counts are reported; individual paths are not listed.
    print("Missing or invalid images:")
    print(f"Total missing or invalid images: {len(missing_or_corrupt)}")
    print(f"Total valid items: {len(new_data)}")


Loaded 28267 items from /home/aikusrv04/aiku/small_korean_vlm/data/korean_visualization_qa/korean_visualization_qa_train.json


Checking images: 100%|██████████| 28267/28267 [00:09<00:00, 2905.04it/s]

All images exist and are valid.





In [53]:
# Validate the images referenced by korean_image_single_conversation.json.
json_path = "/home/aikusrv04/aiku/small_korean_vlm/data/korean_image/korean_image_single_conversation.json"
base_directory = "/home/aikusrv04/aiku/small_korean_vlm/data"
missing_or_corrupt, new_data = find_missing_or_invalid_images(json_path, base_directory)

if not missing_or_corrupt:
    print("All images exist and are valid.")
else:
    # Only the header and the counts are reported; individual paths are not listed.
    print("Missing or invalid images:")
    print(f"Total missing or invalid images: {len(missing_or_corrupt)}")
    print(f"Total valid items: {len(new_data)}")


Loaded 22765 items from /home/aikusrv04/aiku/small_korean_vlm/data/korean_image/korean_image_single_conversation.json


Checking images: 100%|██████████| 22765/22765 [00:02<00:00, 8734.84it/s]

All images exist and are valid.





In [9]:
# Validate the images referenced by text_in_the_wild_final.json.
json_path = "/home/aikusrv04/aiku/small_korean_vlm/data/korean_text_in_the_wild_ocr/text_in_the_wild_final.json"
base_directory = "/home/aikusrv04/aiku/small_korean_vlm/data"
missing_or_corrupt, new_data = find_missing_or_invalid_images(json_path, base_directory)

if not missing_or_corrupt:
    print("All images exist and are valid.")
else:
    # Only the header and the counts are reported; individual paths are not listed.
    print("Missing or invalid images:")
    print(f"Total missing or invalid images: {len(missing_or_corrupt)}")
    print(f"Total valid items: {len(new_data)}")


Loaded 98726 items from /home/aikusrv04/aiku/small_korean_vlm/data/korean_text_in_the_wild_ocr/text_in_the_wild_final.json


Checking images: 100%|██████████| 98726/98726 [00:22<00:00, 4346.03it/s]

All images exist and are valid.





In [51]:
# Rebuild the filtered item list for this specific dataset inside the cell.
# Relying on whatever `new_data` is left in the kernel would overwrite this
# file with another dataset's items when the notebook is run top to bottom.
qa_json_path = "/home/aikusrv04/aiku/small_korean_vlm/data/korean_visualization_qa/korean_visualization_qa_train.json"
qa_base_directory = "/home/aikusrv04/aiku/small_korean_vlm/data"
qa_missing_or_corrupt, qa_valid_items = find_missing_or_invalid_images(qa_json_path, qa_base_directory)
print(f"Dropping {len(qa_missing_or_corrupt)} items; keeping {len(qa_valid_items)}")

# NOTE: the dataset file is replaced in place. Items without an "image" value
# are not part of the returned valid items, so they are dropped as well.
# Dump to a sibling temp file first and swap it in afterwards, so a failed
# dump cannot leave the dataset truncated.
qa_tmp_path = qa_json_path + ".tmp"
with open(qa_tmp_path, "w", encoding="utf-8") as f:
    json.dump(qa_valid_items, f, ensure_ascii=False, indent=4)
os.replace(qa_tmp_path, qa_json_path)