The initial and processed datasets below can be found on kaggle:
- robin-lite-raw https://kaggle.com/datasets/1cf97eb6d7b07e1b05fb9795c904d444476223eba6f53a4ab1031e6296d07193
- robin-base https://www.kaggle.com/datasets/bahiskaraananda/robin-base 

In [7]:
import os
import shutil
import random
from PIL import Image
from tqdm import tqdm
import numpy as np

In [8]:
raw_data_path = '/kaggle/input/robin-lite/robin-lite-raw'
output_path = '/kaggle/working/base-robin'

os.makedirs(output_path, exist_ok=True)
target_size = (224, 224)
samples_per_class = 3000  #downsampling

# Data preprocessing and downsampling

In [9]:
def preprocess_and_save_image(image_path, output_dir, target_size):
    try:
        img = Image.open(image_path)
        
        if img.mode != 'RGB':
            img = img.convert('RGB')
        img = img.resize(target_size)
        img.save(os.path.join(output_dir, os.path.basename(image_path)), format='JPEG')
        
    except Exception as e:
        print(f"Error processing {image_path}: {e}")

for class_name in os.listdir(raw_data_path):
    class_dir = os.path.join(raw_data_path, class_name)
    output_class_dir = os.path.join(output_path, class_name)
    os.makedirs(output_class_dir, exist_ok=True)
    image_paths = [os.path.join(class_dir, img) for img in os.listdir(class_dir) if img.endswith(('png', 'jpg', 'jpeg'))]
    
    if len(image_paths) > samples_per_class:
        image_paths = random.sample(image_paths, samples_per_class)
    for img_path in tqdm(image_paths, desc=f"Processing {class_name}"):
        preprocess_and_save_image(img_path, output_class_dir, target_size)

for class_name in os.listdir(output_path):
    class_dir = os.path.join(output_path, class_name)
    print(f"Class {class_name}: {len(os.listdir(class_dir))} images")

shutil.make_archive('/kaggle/working/processed_robin_lite', 'zip', output_path)
print("Preprocessed dataset zipped and ready for upload!")

Processing elektronik: 100%|██████████| 3000/3000 [00:25<00:00, 117.03it/s]
Processing logam: 100%|██████████| 3000/3000 [00:28<00:00, 104.24it/s]
Processing plastik: 100%|██████████| 3000/3000 [00:29<00:00, 101.44it/s]
Processing buah_sayuran: 100%|██████████| 3000/3000 [00:20<00:00, 145.46it/s]
Processing makanan: 100%|██████████| 3000/3000 [00:20<00:00, 145.15it/s]
Processing daun: 100%|██████████| 3000/3000 [00:30<00:00, 98.50it/s] 
Processing tekstil: 100%|██████████| 3000/3000 [00:40<00:00, 73.83it/s]
Processing medis: 100%|██████████| 3000/3000 [00:30<00:00, 98.73it/s] 
Processing kaca: 100%|██████████| 3000/3000 [00:24<00:00, 122.41it/s]
Processing kertas: 100%|██████████| 3000/3000 [00:22<00:00, 133.49it/s]


Class kaca: 3000 images
Class buah_sayuran: 3000 images
Class elektronik: 3000 images
Class plastik: 3000 images
Class medis: 3000 images
Class daun: 3000 images
Class tekstil: 3000 images
Class logam: 3000 images
Class makanan: 3000 images
Class kertas: 3000 images
Preprocessed dataset zipped and ready for upload!


In [10]:
from sklearn.model_selection import train_test_split

input_path = "/kaggle/input/robin-lite-v1/processed_robin_lite"
output_path = "/kaggle/working/robin-base/"

train_ratio = 0.7
val_ratio = 0.2
test_ratio = 0.1

for split in ['train', 'val', 'test']:
    split_path = os.path.join(output_path, split)
    os.makedirs(split_path, exist_ok=True)

for class_name in os.listdir(input_path):
    class_path = os.path.join(input_path, class_name)
    if os.path.isdir(class_path):
        images = [os.path.join(class_path, img) for img in os.listdir(class_path) if img.endswith(('.jpg', '.png'))]
        
        train_files, temp_files = train_test_split(images, test_size=(val_ratio + test_ratio), random_state=42)
        val_files, test_files = train_test_split(temp_files, test_size=(test_ratio / (val_ratio + test_ratio)), random_state=42)
        
        for split, files in zip(['train', 'val', 'test'], [train_files, val_files, test_files]):
            class_output_dir = os.path.join(output_path, split, class_name)
            os.makedirs(class_output_dir, exist_ok=True)
            for file in files:
                shutil.copy(file, class_output_dir)

print("Dataset split completed!")

Dataset split completed!


# Dataset splitting

In [11]:
for split in ['train', 'val', 'test']:
    split_path = os.path.join(output_path, split)
    print(f"{split} set:")
    for class_name in os.listdir(split_path):
        class_dir = os.path.join(split_path, class_name)
        print(f"  {class_name}: {len(os.listdir(class_dir))} images")

train set:
  kaca: 2099 images
  buah_sayuran: 2099 images
  elektronik: 2099 images
  plastik: 2099 images
  medis: 2099 images
  daun: 2099 images
  tekstil: 2099 images
  logam: 2099 images
  makanan: 2099 images
  kertas: 2099 images
val set:
  kaca: 600 images
  buah_sayuran: 600 images
  elektronik: 600 images
  plastik: 600 images
  medis: 600 images
  daun: 600 images
  tekstil: 600 images
  logam: 600 images
  makanan: 600 images
  kertas: 600 images
test set:
  kaca: 301 images
  buah_sayuran: 301 images
  elektronik: 301 images
  plastik: 301 images
  medis: 301 images
  daun: 301 images
  tekstil: 301 images
  logam: 301 images
  makanan: 301 images
  kertas: 301 images


In [None]:
output_path = "/kaggle/working/"
shutil.make_archive('/kaggle/working/robin-base', 'zip', output_path)