In [1]:
import os

# Point every Hugging Face cache location at /workspace. This runs before any
# Hugging Face library is imported further down in the notebook.
os.environ.update({
    # Root directory for all Hugging Face data.
    "HF_HOME": "/workspace/.cache/huggingface",
    # Hub cache: model / tokenizer / dataset repository snapshots.
    "HF_HUB_CACHE": "/workspace/.cache/huggingface/hub",
    # Arrow cache used by the `datasets` library.
    "HF_DATASETS_CACHE": "/workspace/.cache/huggingface/datasets",
    # (Optional) assets cache, e.g. preprocessing artifacts.
    "HF_ASSETS_CACHE": "/workspace/.cache/huggingface/assets",
})

In [None]:
#@title HF login (read access)
HF_READ_TOKEN = ""  #@param {type:"string"}
# With Colab's secret store: from google.colab import userdata; HF_READ_TOKEN = userdata.get('HF_TOKEN')

import os
from getpass import getpass

from huggingface_hub import login

# Resolve the read token from the form field, then the HF_READ_TOKEN environment
# variable, then a hidden prompt. The token is passed to the Python API instead of
# being interpolated into a shell command, so it never shows up in the process
# list and an empty form field no longer produces a malformed command line.
hf_read_token = (
    HF_READ_TOKEN
    or os.environ.get("HF_READ_TOKEN")
    or getpass("Hugging Face read token: ")
)
login(token=hf_read_token)

In [3]:
from datasets import load_dataset

# Hub repository that holds the source dataset.
dataset_repo = "AIX-01/aegis-datasets-no-summary-original-only"

# Load the repository; later cells read its "train" split.
dataset = load_dataset(path=dataset_repo)

Resolving data files:   0%|          | 0/20 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/20 [00:00<?, ?it/s]

In [4]:
# Show the first record of the train split to inspect the fields of one example.
dataset["train"][0]

{'image_names': ['100-1_cam01_swoon01_place02_day_spring_000194.jpg',
  '100-1_cam01_swoon01_place02_day_spring_000195.jpg',
  '100-1_cam01_swoon01_place02_day_spring_000196.jpg',
  '100-1_cam01_swoon01_place02_day_spring_000197.jpg',
  '100-1_cam01_swoon01_place02_day_spring_000198.jpg',
  '100-1_cam01_swoon01_place02_day_spring_000199.jpg',
  '100-1_cam01_swoon01_place02_day_spring_000200.jpg',
  '100-1_cam01_swoon01_place02_day_spring_000201.jpg'],
 'images': [<PIL.PngImagePlugin.PngImageFile image mode=RGB size=640x360>,
  <PIL.PngImagePlugin.PngImageFile image mode=RGB size=640x360>,
  <PIL.PngImagePlugin.PngImageFile image mode=RGB size=640x360>,
  <PIL.PngImagePlugin.PngImageFile image mode=RGB size=640x360>,
  <PIL.PngImagePlugin.PngImageFile image mode=RGB size=640x360>,
  <PIL.PngImagePlugin.PngImageFile image mode=RGB size=640x360>,
  <PIL.PngImagePlugin.PngImageFile image mode=RGB size=640x360>,
  <PIL.PngImagePlugin.PngImageFile image mode=RGB size=640x360>],
 'class1': 'a

In [5]:
from datasets import DatasetDict

# Steps 1-2: shuffle the original train split with a fixed seed, then hold out
# exactly 150 rows as the test split (same seed for both steps).
splits = (
    dataset["train"]
    .shuffle(seed=42)
    .train_test_split(test_size=150, seed=42)
)

# Step 3: collect both parts in a DatasetDict (train: 3000 rows, test: 150 rows).
ds_split = DatasetDict({name: splits[name] for name in ("train", "test")})

In [6]:
# Display the train/test DatasetDict built in the previous cell.
ds_split

DatasetDict({
    train: Dataset({
        features: ['image_names', 'images', 'class1', 'class2', 'summary'],
        num_rows: 3000
    })
    test: Dataset({
        features: ['image_names', 'images', 'class1', 'class2', 'summary'],
        num_rows: 150
    })
})

In [7]:
"""
Count how often each possible class1 / class2 value occurs in every split.
"""
from collections import Counter

# Label values expected in the class1 / class2 columns.
class1_labels = ["abnormal", "normal", "suspicious"]
class2_labels = ["assault", "burglary", "dump", "swoon", "vandalism"]


def _print_column_counts(header, counts, known_labels):
    """Print `header`, one count line per known label, then any unexpected labels."""
    print(header)
    for lbl in known_labels:
        print(f"  {lbl}: {counts.get(lbl, 0)}")

    # Values outside the expected vocabulary are reported instead of being
    # dropped silently, so the printed numbers always account for every row.
    # key=str keeps the sort well-defined even for mixed types such as None.
    for lbl in sorted(set(counts) - set(known_labels), key=str):
        print(f"  {lbl!r} (unexpected): {counts[lbl]}")


def print_label_counts(ds_split, split_names=("train", "test")):
    """Print per-split occurrence counts of the class1 and class2 labels.

    Args:
        ds_split: Mapping from split name to an object whose "class1" and
            "class2" items are iterables of label values.
        split_names: Names of the splits to report, in print order. Defaults
            to ("train", "test").

    Returns:
        None. The report is written to standard output.

    Raises:
        KeyError: If a requested split is missing from `ds_split`, or if a
            dict-like split lacks the "class1" or "class2" key.
    """
    for split_name in split_names:
        ds = ds_split[split_name]

        # Tally both columns before printing so a missing column fails early,
        # without leaving a half-printed section behind.
        c1_counts = Counter(ds["class1"])
        c2_counts = Counter(ds["class2"])

        print(f"===== {split_name} =====")
        _print_column_counts("[class1]", c1_counts, class1_labels)
        _print_column_counts("[class2]", c2_counts, class2_labels)
        print()

# Example usage
print_label_counts(ds_split)

===== train =====
[class1]
  abnormal: 998
  normal: 999
  suspicious: 1003
[class2]
  assault: 606
  burglary: 594
  dump: 600
  swoon: 602
  vandalism: 598

===== test =====
[class1]
  abnormal: 52
  normal: 51
  suspicious: 47
[class2]
  assault: 24
  burglary: 36
  dump: 30
  swoon: 28
  vandalism: 32



In [8]:
import os
from getpass import getpass

from huggingface_hub import login

# Log in to the Hugging Face Hub for the upload below. The token is read from
# the HF_WRITE_TOKEN environment variable, falling back to a hidden prompt, so
# the credential is never stored in the notebook source.
hf_write_token = os.environ.get("HF_WRITE_TOKEN") or getpass("Hugging Face write token: ")
login(token=hf_write_token)

print('Hugging Face Hub에 성공적으로 로그인했습니다.')

# Upload the train/test DatasetDict to a private dataset repository. The call
# stays the last expression so the returned commit info is displayed.
push_repo_id = "AIX-01/aegis-datasets-no-summary-original-only-test-size-150"
ds_split.push_to_hub(push_repo_id, private=True)

Uploading the dataset shards:   0%|          | 0/19 [00:00<?, ? shards/s]

Map:   0%|          | 0/158 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Map:   0%|          | 0/158 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Map:   0%|          | 0/158 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Map:   0%|          | 0/158 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Map:   0%|          | 0/158 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Map:   0%|          | 0/158 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Map:   0%|          | 0/158 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Map:   0%|          | 0/158 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Map:   0%|          | 0/158 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Map:   0%|          | 0/158 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Map:   0%|          | 0/158 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Map:   0%|          | 0/158 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Map:   0%|          | 0/158 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Map:   0%|          | 0/158 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Map:   0%|          | 0/158 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Map:   0%|          | 0/158 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Map:   0%|          | 0/158 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Map:   0%|          | 0/157 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Map:   0%|          | 0/157 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Map:   0%|          | 0/150 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

CommitInfo(commit_url='https://huggingface.co/datasets/AIX-01/aegis-datasets-no-summary-original-only-test-size-150/commit/e9dd3554881e99173fcfc69d929524b492ecc4be', commit_message='Upload dataset', commit_description='', oid='e9dd3554881e99173fcfc69d929524b492ecc4be', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/AIX-01/aegis-datasets-no-summary-original-only-test-size-150', endpoint='https://huggingface.co', repo_type='dataset', repo_id='AIX-01/aegis-datasets-no-summary-original-only-test-size-150'), pr_revision=None, pr_num=None)