In [27]:
# ===============================
# Dataset Split (Memory Safe)
# ===============================

import os
import random

# Dataset root
BASE_PATH = "/kaggle/input"
DATASET_PATH = os.path.join(BASE_PATH, os.listdir(BASE_PATH)[0])
DATA_ROOT = os.path.join(DATASET_PATH, os.listdir(DATASET_PATH)[0])

# Ratios
TRAIN_RATIO = 0.7
TEST_RATIO = 0.2
EVAL_RATIO = 0.1

random.seed(42)

# Storage
train, test, eval_ = [], [], []

# Create splits (NO file writing)
for cls in os.listdir(DATA_ROOT):
    class_path = os.path.join(DATA_ROOT, cls)
    if not os.path.isdir(class_path):
        continue

    images = os.listdir(class_path)
    random.shuffle(images)

    n = len(images)
    t1 = int(n * TRAIN_RATIO)
    t2 = int(n * (TRAIN_RATIO + TEST_RATIO))

    train.extend([(cls, img) for img in images[:t1]])
    test.extend([(cls, img) for img in images[t1:t2]])
    eval_.extend([(cls, img) for img in images[t2:]])

# Summary output
print("✅ Dataset split completed (in-memory only)")
print(f"Train samples: {len(train)}")
print(f"Test samples : {len(test)}")
print(f"Eval samples : {len(eval_)}")

# Optional preview
print("\nSample training entries:")
print(train[:5])

✅ Dataset split completed (in-memory only)
Train samples: 8974
Test samples : 2563
Eval samples : 1287

Sample training entries:
[('planet', 'planet_page_11_image_16_3_SwinIR_large.png'), ('planet', 'planet_page_3_image_8_1_SwinIR_large.png'), ('planet', 'planet_page_8_image_6_aug2_SwinIR_large.png'), ('planet', 'planet_page_20_image_8_SwinIR_large.png'), ('planet', 'planet_page_11_image_20_0_SwinIR_large.png')]


### Dataset Splitting Summary

- The dataset was split into:
  - 70% training
  - 20% testing
  - 10% evaluation

- Each class is preserved across all splits.
- This structure ensures fair training and unbiased evaluation.
- The implementation is reusable and adaptable for any dataset.