In [1]:
import os
import shutil
import cv2
import hashlib
from pathlib import Path

from PIL import Image
from tqdm import tqdm
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import kagglehub


In [2]:
# Project structure (educational style)
PROJECT_ROOT = Path(".")
DATA_DIR = PROJECT_ROOT / "data" / "leapgestrecog"

RAW_DIR = DATA_DIR / "raw"
CLEAN_DIR = DATA_DIR / "clean"

IMG_SIZE = (224, 224)
BATCH_SIZE = 32
VAL_SPLIT = 0.2

RAW_DIR.mkdir(parents=True, exist_ok=True)



In [3]:
print("Downloading dataset...")
path = kagglehub.dataset_download("gti-upm/leapgestrecog")
print("Downloaded to:", path)

# Dataset usually contains 'leapGestRecog'
if os.path.isdir(os.path.join(path, "leapGestRecog")):
    src = os.path.join(path, "leapGestRecog")
else:
    src = path

print("Using source:", src)


Downloading dataset...
Using Colab cache for faster access to the 'leapgestrecog' dataset.
Downloaded to: /kaggle/input/leapgestrecog
Using source: /kaggle/input/leapgestrecog/leapGestRecog


In [4]:
# Copy raw data once
if not any(RAW_DIR.iterdir()):
    shutil.copytree(src, RAW_DIR, dirs_exist_ok=True)
    print("Raw data copied")

# Create clean data copy
if not CLEAN_DIR.exists():
    shutil.copytree(RAW_DIR, CLEAN_DIR)
    print("Clean data folder created")
else:
    print("Clean data already exists")


Raw data copied
Clean data folder created


In [12]:
def remove_corrupted(folder: Path):
    print("\n Removing corrupted images...")
    for cls in tqdm(os.listdir(folder)):
        cls_path = folder / cls
        if not cls_path.is_dir():
            continue

        for file in os.listdir(cls_path):
            img_path = cls_path / file

            if not img_path.is_file():
                continue

            try:
                img = cv2.imread(str(img_path))
                if img is None:
                    img_path.unlink()
            except:
                img_path.unlink()


In [18]:
def fix_format(folder: Path, size=(224, 224)):
    print("\nStandardizing images...")
    for cls in tqdm(os.listdir(folder)):
        cls_path = folder / cls
        if not cls_path.is_dir():
            continue

        for file in os.listdir(cls_path):
            img_path = cls_path / file

            if not img_path.is_file():
                continue

            try:
                img = Image.open(img_path).convert("RGB")
                img = img.resize(size)
                img.save(img_path)
            except:
                img_path.unlink()


In [19]:
def remove_duplicates(folder: Path):
    print("\nRemoving duplicates...")
    seen = set()

    for cls in tqdm(os.listdir(folder)):
        cls_path = folder / cls
        if not cls_path.is_dir():
            continue

        for file in os.listdir(cls_path):
            img_path = cls_path / file

            if not img_path.is_file():
                continue

            with open(img_path, "rb") as f:
                h = hashlib.md5(f.read()).hexdigest()

            if h in seen:
                img_path.unlink()
            else:
                seen.add(h)


In [20]:
def denoise(folder: Path):
    print("\nDenoising images...")
    for cls in tqdm(os.listdir(folder)):
        cls_path = folder / cls
        if not cls_path.is_dir():
            continue

        for file in os.listdir(cls_path):
            img_path = cls_path / file

            if not img_path.is_file():
                continue

            try:
                img = cv2.imread(str(img_path))
                img = cv2.fastNlMeansDenoisingColored(img, None, 10, 10, 7, 21)
                cv2.imwrite(str(img_path), img)
            except:
                pass


In [21]:
def show_balance(folder: Path):
    """Show number of images per class."""
    print("\nClass Balance:")
    for cls in sorted(os.listdir(folder)):
        cls_path = folder / cls
        if cls_path.is_dir():
            print(f"{cls}: {len(os.listdir(cls_path))}")

In [22]:
# Run cleaning pipeline
remove_corrupted(CLEAN_DIR)
fix_format(CLEAN_DIR, IMG_SIZE)
remove_duplicates(CLEAN_DIR)
denoise(CLEAN_DIR)
show_balance(CLEAN_DIR)



 Removing corrupted images...


100%|██████████| 10/10 [00:00<00:00, 2158.01it/s]



Standardizing images...


100%|██████████| 10/10 [00:00<00:00, 2330.69it/s]



Removing duplicates...


100%|██████████| 10/10 [00:00<00:00, 2201.16it/s]



Denoising images...


100%|██████████| 10/10 [00:00<00:00, 2776.58it/s]


Class Balance:
00: 10
01: 10
02: 10
03: 10
04: 10
05: 10
06: 10
07: 10
08: 10
09: 10



