In [None]:
#@title Config
import os
import json

# Mount Google Drive unless /content/drive already exists. The google.colab
# import is kept inside the branch so it only runs when a mount is needed.
if not os.path.exists('/content/drive'):
    from google.colab import drive
    drive.mount('/content/drive')

# Root folder of the dataset on the mounted Drive.
DATASET_ROOT = '/content/drive/MyDrive/project/AEGIS/Dataset'
# Category names; each is used as a sub-folder name directly under DATASET_ROOT.
CATEGORIES = ['swoon', 'vandalism', 'assault', 'burglary', 'dump']
# Maps the Korean label folder names to the English class1 values
# ('이상' -> abnormal, '의심' -> suspicious, '정상' -> normal).
CLASS1_MAP = {'이상': 'abnormal', '의심': 'suspicious', '정상': 'normal'}
# Destination of the generated JSON file (written inside the dataset root).
OUTPUT_PATH = os.path.join(DATASET_ROOT, 'dataset.json')

# Upper bound on the number of generated entries (Colab form field).
MAX_ENTRY = 0  #@param {type:"integer"}
# 0 = no limit (all entries); a positive value = generate at most that many entries

In [None]:
#@title Run: Listing → Generate → Save JSON
from itertools import islice

def entry_generator(*, images_per_entry=8):
    """Walk the dataset folder tree and lazily yield dataset entries.

    Layout that is traversed::

        <DATASET_ROOT>/<category>/image/<folder>/<class sub-folder>/<images>

    Categories come from ``CATEGORIES`` (in list order), folders are visited
    in sorted order, and class sub-folders are looked up by the keys of
    ``CLASS1_MAP`` (in mapping order). A class sub-folder produces an entry
    only when it holds exactly ``images_per_entry`` files with a ``.jpg``,
    ``.jpeg`` or ``.png`` extension (case-insensitive).

    Skipped items:
        * a category whose ``image`` directory is missing (a ``[SKIP]`` line
          is printed);
        * folders whose name is in the exclusion set;
        * class sub-folders that do not exist (silently);
        * class sub-folders with a wrong, non-zero image count (a ``[SKIP]``
          line with the count is printed); empty ones are skipped silently.

    Args:
        images_per_entry: Exact number of images a class sub-folder must
            contain to be emitted. Defaults to 8, the previously hard-coded
            value.

    Yields:
        dict: ``images`` (sorted list of image file names, without any
        directory part), ``class1`` (value from ``CLASS1_MAP``), ``class2``
        (the category name) and ``summary`` (always an empty string).
    """
    # Folder names that are never treated as sample folders
    # ('전체' translates to "all"; the others are notebook checkpoint dirs).
    excluded = {'전체', 'ipynb_checkpoints', '.ipynb_checkpoints'}
    image_suffixes = ('.jpg', '.jpeg', '.png')

    for category in CATEGORIES:
        image_base = os.path.join(DATASET_ROOT, category, 'image')
        # isdir rather than exists: a stray *file* at this path must be
        # skipped, otherwise os.listdir below raises NotADirectoryError.
        if not os.path.isdir(image_base):
            print(f'[SKIP] Path not found: {image_base}')
            continue

        # Test the cheap name check first so excluded names cost no stat call.
        folders = sorted(
            name for name in os.listdir(image_base)
            if name not in excluded
            and os.path.isdir(os.path.join(image_base, name))
        )

        for folder_name in folders:
            folder_path = os.path.join(image_base, folder_name)

            for sub_name, class1_value in CLASS1_MAP.items():
                sub_path = os.path.join(folder_path, sub_name)
                # Same reasoning as above: only real directories are listed.
                if not os.path.isdir(sub_path):
                    continue

                images = sorted(
                    name for name in os.listdir(sub_path)
                    if name.lower().endswith(image_suffixes)
                )

                if len(images) != images_per_entry:
                    # Report only non-empty mismatches; empty folders are
                    # dropped without noise.
                    if images:
                        print(f'[SKIP] {category}/{folder_name}/{sub_name} ({len(images)}장)')
                    continue

                yield {
                    'images': images,
                    'class1': class1_value,
                    'class2': category,
                    'summary': '',
                }

# Consume the generator; a positive MAX_ENTRY caps the number of entries,
# anything else (stop=None) drains it completely.
entry_limit = MAX_ENTRY if MAX_ENTRY > 0 else None
dataset = list(islice(entry_generator(), entry_limit))

# --- Report ---
limit_note = f' (MAX_ENTRY={MAX_ENTRY})' if MAX_ENTRY > 0 else ' (all)'
print(f'\nGenerated entries: {len(dataset)}' + limit_note)

# Per-class1 totals, always printed (even when zero).
for class1_label in ('abnormal', 'suspicious', 'normal'):
    class1_total = len([d for d in dataset if d['class1'] == class1_label])
    print(f'  - {class1_label}: {class1_total}')

# Per-category totals, printed only for categories that produced entries.
for cat in CATEGORIES:
    cnt = len([d for d in dataset if d['class2'] == cat])
    if cnt <= 0:
        continue
    print(f'  - {cat}: {cnt}')

# --- Save ---
serialized = json.dumps(dataset, ensure_ascii=False, indent=2)
with open(OUTPUT_PATH, 'w', encoding='utf-8') as f:
    f.write(serialized)

print(f'\nJSON saved: {OUTPUT_PATH}')
size_kb = os.stat(OUTPUT_PATH).st_size / 1024
print(f'File size: {size_kb:.1f} KB')

# --- Preview ---
print('\n--- Preview (first 2 entries) ---')
preview = dataset[:2]
print(json.dumps(preview, ensure_ascii=False, indent=2))