# README

### dataset folder refactor

1. train / val / test set의 비율 정해주면,  

2. 아래 폴더구조에서
  ```
  dataset/
  ├── class1/
  ├── class2/
  └── class3/
  ```

3. 아래 폴더구조로 바꿔서 이미지들을 이동
  ```
  dataset/
  ├── train/
  │   ├── class1/
  │   ├── class2/
  │   └── class3/
  ├── val/
  │   ├── class1/
  │   ├── class2/
  │   └── class3/
  └── test/
      ├── class1/
      ├── class2/
      └── class3/
  ```


# CODE

In [1]:
import os
import shutil
from glob import glob
import numpy as np

In [2]:
# Mount Google Drive at /content/drive so the dataset paths used in the cells
# below resolve. Requires the Google Colab runtime (google.colab package).
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [10]:
def print_folder_structure_and_counts(base_path):
    """Print the folder tree under ``base_path`` with per-folder file counts.

    One line is printed per directory, indented four spaces per nesting
    level, in the form ``<name>/ - <n> files``. Only files directly inside a
    directory are counted; files in sub-directories are reported on the
    sub-directory's own line.

    Args:
        base_path: Root directory to walk. A trailing path separator is
            accepted.
    """
    # Normalise first: with a trailing separator os.path.basename() of the
    # root would be empty and the depth of every child would be off by one.
    base_path = os.path.normpath(base_path)
    for root, _dirs, files in os.walk(base_path):
        # Derive depth from the relative path instead of string replacement,
        # which miscounts when the base path text reappears deeper in the tree.
        rel = os.path.relpath(root, base_path)
        level = 0 if rel == os.curdir else rel.count(os.sep) + 1
        indent = ' ' * 4 * level
        print(f"{indent}{os.path.basename(root)}/ - {len(files)} files")

ver 1: copy files from the source folder (S) into a separate destination folder (D)

In [3]:
def split_data_copy(source_folder, target_folder, train_ratio=0.7, val_ratio=0.15, test_ratio=0.15):
    """Copy a per-class dataset into ``train``/``val``/``test`` sub-folders.

    ``source_folder`` is expected to contain one sub-directory per class.
    For every class the files are shuffled and copied to
    ``target_folder/<set>/<class>/``. The source files are left untouched.

    For ``n`` files in a class, ``train`` receives ``int(n * train_ratio)``
    files, ``val`` receives ``int(n * val_ratio)`` files and ``test``
    receives the remainder.

    Args:
        source_folder: Directory whose immediate sub-directories are classes.
        target_folder: Directory in which the split tree is created.
        train_ratio: Fraction of each class assigned to ``train``.
        val_ratio: Fraction of each class assigned to ``val``.
        test_ratio: Fraction of each class assigned to ``test``.

    Raises:
        ValueError: If a ratio is negative or the ratios do not sum to 1.
    """
    sets = ['train', 'val', 'test']
    ratios = [train_ratio, val_ratio, test_ratio]

    # Validate before touching the filesystem. A tolerance is required because
    # valid float splits such as 0.7 + 0.2 + 0.1 do not sum to exactly 1.0.
    if any(r < 0 for r in ratios) or abs(sum(ratios) - 1.0) > 1e-9:
        raise ValueError(
            f"split ratio error: ratios must be non-negative and sum to 1, got {ratios}"
        )

    classes = [d for d in os.listdir(source_folder) if os.path.isdir(os.path.join(source_folder, d))]

    for set_name in sets:
        for cls in classes:
            os.makedirs(os.path.join(target_folder, set_name, cls), exist_ok=True)

    for cls in classes:
        # Keep regular files only (shutil.copy cannot copy a directory), and
        # sort so that seeding np.random gives a reproducible split.
        files = sorted(
            f for f in glob(os.path.join(source_folder, cls, '*')) if os.path.isfile(f)
        )
        np.random.shuffle(files)
        split_a = int(len(files) * ratios[0])
        split_b = split_a + int(len(files) * ratios[1])

        buckets = {
            'train': files[:split_a],
            'val': files[split_a:split_b],
            'test': files[split_b:],  # remainder absorbs rounding losses
        }
        for set_name, bucket in buckets.items():
            for file in bucket:
                shutil.copy(file, os.path.join(target_folder, set_name, cls, os.path.basename(file)))

In [4]:
# Source: Drive folder holding one sub-folder per class.
source_dir = '/content/drive/MyDrive/datasets_storage/test'
# Destination: the train/val/test tree is created under this folder.
target_dir = '/content/drive/MyDrive/dataset'

# Fractions of each class assigned to train / val / test.
train_ratio, val_ratio, test_ratio = 0.7, 0.15, 0.15

# Copy the files into the split layout; the source folder is not modified.
split_data_copy(source_dir, target_dir, train_ratio, val_ratio, test_ratio)

In [11]:
# Print the resulting tree with per-folder file counts to check the split.
print_folder_structure_and_counts(target_dir)

dataset/ - 0 files
    train/ - 0 files
        fresh/ - 343 files
        rotten/ - 343 files
    val/ - 0 files
        fresh/ - 73 files
        rotten/ - 73 files
    test/ - 0 files
        fresh/ - 74 files
        rotten/ - 74 files


ver 2: in place (move the files into train/val/test inside the same dataset folder)

In [None]:
def split_data_inplace(directory, classes, train_ratio=0.7, val_ratio=0.15, test_ratio=0.15):
    """Reorganise ``directory`` into ``train``/``val``/``test`` by moving files.

    For every class name in ``classes`` the files in ``directory/<class>/``
    are shuffled and moved to ``directory/<set>/<class>/``. The original
    class folder is removed afterwards if nothing is left in it.

    For ``n`` files in a class, ``train`` receives ``int(n * train_ratio)``
    files, ``val`` receives ``int(n * val_ratio)`` files and ``test``
    receives the remainder.

    Args:
        directory: Dataset root containing one sub-directory per class.
        classes: Names of the class sub-directories to split. Entries named
            ``train``, ``val`` or ``test`` are ignored.
        train_ratio: Fraction of each class assigned to ``train``.
        val_ratio: Fraction of each class assigned to ``val``.
        test_ratio: Fraction of each class assigned to ``test``.

    Raises:
        ValueError: If a ratio is negative or the ratios do not sum to 1.
    """
    sets = ['train', 'val', 'test']
    ratios = [train_ratio, val_ratio, test_ratio]

    # Validate before moving anything: this operation is destructive. The
    # tolerance allows valid float splits such as 0.7 + 0.2 + 0.1.
    if any(r < 0 for r in ratios) or abs(sum(ratios) - 1.0) > 1e-9:
        raise ValueError(
            f"split ratio error: ratios must be non-negative and sum to 1, got {ratios}"
        )

    # When the class list is built by listing `directory` after a previous
    # run it contains the split folders themselves; treating those as classes
    # would try to move them into their own sub-folders.
    classes = [cls for cls in classes if cls not in sets]

    for set_name in sets:
        set_path = os.path.join(directory, set_name)
        os.makedirs(set_path, exist_ok=True)
        for cls in classes:
            os.makedirs(os.path.join(set_path, cls), exist_ok=True)

    for cls in classes:
        class_path = os.path.join(directory, cls)
        # Regular files only; sorted so that seeding np.random gives a
        # reproducible split.
        files = sorted(
            f for f in glob(os.path.join(class_path, '*')) if os.path.isfile(f)
        )
        np.random.shuffle(files)
        split_a = int(len(files) * ratios[0])
        split_b = split_a + int(len(files) * ratios[1])

        buckets = {
            'train': files[:split_a],
            'val': files[split_a:split_b],
            'test': files[split_b:],  # remainder absorbs rounding losses
        }
        for set_name, bucket in buckets.items():
            for file in bucket:
                shutil.move(file, os.path.join(directory, set_name, cls, os.path.basename(file)))

        # glob('*') skips hidden files and sub-folders are not moved, so the
        # class folder may legitimately be non-empty; only remove it if empty.
        if not os.listdir(class_path):
            os.rmdir(class_path)

In [None]:
# Dataset root to reorganise in place (placeholder: set to a real path).
directory = 'path/to/your/dataset'

# Every sub-folder is treated as a class, except the split folders themselves:
# after a first run `directory` contains train/val/test, and re-running this
# cell must not feed those back in as classes.
split_names = {'train', 'val', 'test'}
classes = [
    d for d in os.listdir(directory)
    if os.path.isdir(os.path.join(directory, d)) and d not in split_names
]

# Fractions of each class assigned to train / val / test.
train_ratio, val_ratio, test_ratio = 0.7, 0.15, 0.15

split_data_inplace(directory, classes, train_ratio, val_ratio, test_ratio)