In [13]:
import os
import zipfile
import shutil
import numpy as np
from tqdm import tqdm
from PIL import Image

In [15]:
# Archive name expected inside RAW_DIR after the Kaggle download.
DATASET_ZIP = 'leaf-images.zip'
# Size passed to Image.resize() for every processed image.
TARGET_SIZE = (128, 128)

# All data lives under <working directory>/content/data/{raw,clean}.
current_dir = os.getcwd()
RAW_DIR, CLEAN_DIR = (
    os.path.join(current_dir, "content", "data", stage)
    for stage in ("raw", "clean")
)

In [17]:
# Create the raw and clean data folders; exist_ok=True keeps the cell re-runnable.
for data_dir in (RAW_DIR, CLEAN_DIR):
    os.makedirs(data_dir, exist_ok=True)

If you are using Google Colab, you need to upload your `kaggle.json` file first:

In [None]:
# Colab-only cell: google.colab cannot be imported outside a Colab runtime,
# which is why this import lives here instead of the top import cell.
from google.colab import files

# Opens the browser file picker; the result is keyed by the stored file name.
uploaded = files.upload()

# Fail early with a readable message when the upload is not literally named
# 'kaggle.json' (for example, a renamed duplicate). Otherwise shutil.move would
# either raise an opaque error or install a stale file left by an earlier run.
if 'kaggle.json' not in uploaded:
    raise FileNotFoundError(
        f"Expected an uploaded file named 'kaggle.json', got: {sorted(uploaded)}"
    )

os.makedirs('/root/.kaggle', exist_ok=True)
shutil.move('kaggle.json', '/root/.kaggle/kaggle.json')
# Restrict the API token to owner read/write.
os.chmod('/root/.kaggle/kaggle.json', 0o600)

Alternatively, if you are running this notebook locally, place your `kaggle.json` in `~/.kaggle/` and run this cell instead:

In [25]:
from pathlib import Path

# Expected location of the Kaggle API token on a local machine.
kaggle_token = str(Path.home() / '.kaggle/kaggle.json')

if os.path.exists(kaggle_token):
    # Restrict the token to owner read/write.
    os.chmod(kaggle_token, 0o600)
else:
    print("Please upload your kaggle.json to ~/.kaggle/")

In [None]:
!kaggle datasets download -d ichhadhari/leaf-images -p {RAW_DIR}

In [None]:
# Unpack the downloaded archive in place, inside RAW_DIR.
archive_path = os.path.join(RAW_DIR, DATASET_ZIP)
with zipfile.ZipFile(archive_path, mode='r') as archive:
    archive.extractall(path=RAW_DIR)

In [None]:
# Folder the images are read from.
# NOTE(review): assumes the archive unpacks into 'leaf-images' — confirm.
INPUT_IMG_DIR = os.path.join(RAW_DIR, 'leaf-images')

image_extensions = ('.jpg', '.jpeg', '.png', '.bmp')

# Sort the listing: os.listdir() returns entries in arbitrary order, and the
# output name is derived from each file's position in this list. Sorting keeps
# the source -> img_XXXX mapping stable across re-runs.
img_files = sorted(
    f for f in os.listdir(INPUT_IMG_DIR) if f.lower().endswith(image_extensions)
)

print(f"Found {len(img_files)} images")

# Count only the images that were actually written, so the summary is accurate
# when some files fail to decode.
saved_count = 0
for idx, fname in tqdm(enumerate(img_files), total=len(img_files)):
    img_path = os.path.join(INPUT_IMG_DIR, fname)
    try:
        # The context manager closes the source file handle; convert() and
        # resize() return new in-memory images that remain usable afterwards.
        with Image.open(img_path) as img:
            img_resized = img.convert("RGB").resize(TARGET_SIZE)
        clean_filename = f"img_{idx:04d}.png"
        img_resized.save(os.path.join(CLEAN_DIR, clean_filename))
        saved_count += 1
    except Exception as e:
        # Best effort: report the bad file and continue with the rest.
        print(f"❌ Failed to process {fname}: {e}")

print(f"\nNormalized and saved {saved_count} of {len(img_files)} images to: {CLEAN_DIR}")