# Dataset Preparation - File Renaming and Class Distribution

Notebook ini digunakan untuk merapikan struktur dataset YOLO dengan cara melakukan rename file gambar dan label agar konsisten dengan daftar kelas di data.yaml. Selain itu, notebook ini juga menghitung jumlah gambar per split (train, valid, test) serta distribusi instance per kelas. Hasil analisis ini membantu memastikan dataset siap digunakan untuk training model computer vision secara lebih terorganisir dan transparan.

# Import Libraries

In [None]:
import os
import yaml

# Rename File

In [None]:
# path dataset
base_dir = r"emotion-8"
yaml_path = os.path.join(base_dir, "data.yaml")

# load data.yaml
with open(yaml_path, "r") as f:
    data_yaml = yaml.safe_load(f)
class_names = data_yaml["names"]

# Renaming function
def rename_split(split):
    img_dir = os.path.join(base_dir, split, "images")
    label_dir = os.path.join(base_dir, split, "labels")

    for i, filename in enumerate(os.listdir(img_dir), start=1):
        if filename.lower().endswith((".jpg", ".jpeg", ".png")):
            base_old = os.path.splitext(filename)[0]
            old_img = os.path.join(img_dir, filename)
            old_lbl = os.path.join(label_dir, base_old + ".txt")

            prefix = "unknown"
            if os.path.exists(old_lbl):
                with open(old_lbl, "r") as f:
                    first_line = f.readline().strip().split()
                    if first_line:
                        cls_id = int(first_line[0])
                        prefix = class_names[cls_id]

            # new filename
            new_base = f"{prefix}_{i:04d}"
            new_img = os.path.join(img_dir, new_base + ".jpg")
            new_lbl = os.path.join(label_dir, new_base + ".txt")

            os.rename(old_img, new_img)
            if os.path.exists(old_lbl):
                os.rename(old_lbl, new_lbl)

            print(f"{filename} → {new_base}.jpg (+label)")

# Process all splits
for split in ["train", "valid", "test"]:
    rename_split(split)

-2-_jpg.rf.6767a85fde7c4b976956444a092687f2.jpg → romantic_0001.jpg (+label)
-2-_jpg.rf.82d41f1b92382490fe80ee818e58d37c.jpg → romantic_0002.jpg (+label)
-3-_jpg.rf.fe4ec07893bcba63f0a4722a82407ad7.jpg → romantic_0003.jpg (+label)
-4-_jpg.rf.3546e338f8c0fe5b38c0591ddd5cc183.jpg → romantic_0004.jpg (+label)
-5-_jpg.rf.5321b35a8854bb847cf61dd7c5a5d02a.jpg → romantic_0005.jpg (+label)
-7-_jpg.rf.fbc4f73b2a51fe690e9777503f9efa90.jpg → romantic_0006.jpg (+label)
-unhappy-miss-good-chance-dressed-casually-isolated-yellow-wall_273609-37534_jpg.rf.356635b7d72caf21f462326fcfcab8ee.jpg → unknown_0007.jpg (+label)
-wallpaper-preview_jpg.rf.4f854c6c907a39d29bdf29b6b805990c.jpg → rock_0008.jpg (+label)
-_jpg.rf.9ae60c8b43c4346a18d69720601d61e4.jpg → romantic_0009.jpg (+label)
000000_jpg.rf.d426d2cc73c2e5f253ddb64aae969c0d.jpg → romantic_0010.jpg (+label)
00b19536c4709b2f4b20389a8f6943ee84a57fc95371cc28c9e61146_jpg.rf.fbc7afefb05d0e1c204c254a926403fd.jpg → unknown_0011.jpg (+label)
00ba443d2e8b93220

In [11]:
# Counting function
def count_images(split):
    img_dir = os.path.join(base_dir, split, "images")
    img_count = sum(
        1 for file in os.listdir(img_dir)
        if file.lower().endswith((".jpg", ".jpeg", ".png"))
    )
    print(f"{split.upper()}")
    print(f"Total gambar: {img_count}")
    return img_count

# Count images in all splits
total = 0
for split in ["train", "valid", "test"]:
    total += count_images(split)

print("\nTOTAL KESELURUHAN")
print(f"Total semua gambar: {total}")

TRAIN
Total gambar: 1427
VALID
Total gambar: 225
TEST
Total gambar: 183

TOTAL KESELURUHAN
Total semua gambar: 1835
