In [None]:
import os
import shutil
import cv2
import hashlib
from pathlib import Path

from PIL import Image
from tqdm import tqdm
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import kagglehub


In [None]:
# Preprocessing settings
IMG_SIZE = (224, 224)  # target size handed to fix_format by the cleaning cell
BATCH_SIZE = 32
VAL_SPLIT = 0.2

# Project structure (educational style): everything lives under ./data/leapgestrecog
PROJECT_ROOT = Path(".")
DATA_DIR = PROJECT_ROOT.joinpath("data", "leapgestrecog")
RAW_DIR = DATA_DIR.joinpath("raw")      # copy of the downloaded dataset
CLEAN_DIR = DATA_DIR.joinpath("clean")  # copy that the cleaning steps operate on

# Only RAW_DIR is created here. CLEAN_DIR is not: a later cell creates it with
# shutil.copytree, and only when it does not exist yet.
RAW_DIR.mkdir(parents=True, exist_ok=True)



In [None]:
print("üì• Downloading dataset...")
path = kagglehub.dataset_download("gti-upm/leapgestrecog")
print("Downloaded to:", path)

# Dataset usually contains 'leapGestRecog'
if os.path.isdir(os.path.join(path, "leapGestRecog")):
    src = os.path.join(path, "leapGestRecog")
else:
    src = path

print("Using source:", src)


In [None]:
# Fill RAW_DIR from the downloaded source only while RAW_DIR is still empty
if next(RAW_DIR.iterdir(), None) is None:
    shutil.copytree(src, RAW_DIR, dirs_exist_ok=True)
    print("‚úÖ Raw data copied")

# Create the working copy that the cleaning steps operate on (first run only)
if CLEAN_DIR.exists():
    print("‚ÑπÔ∏è Clean data already exists")
else:
    shutil.copytree(RAW_DIR, CLEAN_DIR)
    print("‚úÖ Clean data folder created")


In [None]:
def remove_corrupted(folder: Path):
    """Delete every file under ``folder/<class>/`` that OpenCV cannot decode.

    Top-level entries of ``folder`` that are not directories are ignored.
    Inside a class folder, sub-directories are skipped (they are not images
    and cannot be removed with ``unlink``).

    Args:
        folder: Root directory containing one sub-directory per class.

    Returns:
        None. Unreadable files are deleted from disk.
    """
    print("\nüßπ Removing corrupted images...")
    for cls in tqdm(os.listdir(folder)):
        cls_path = folder / cls
        if not cls_path.is_dir():
            continue
        for file in os.listdir(cls_path):
            img_path = cls_path / file
            # imread yields None for a directory as well; deleting it via
            # unlink would raise, so leave directories alone.
            if img_path.is_dir():
                continue
            try:
                readable = cv2.imread(str(img_path)) is not None
            except Exception:
                # `Exception`, not a bare except: a kernel interrupt must stop
                # the loop instead of deleting the file being examined.
                readable = False
            if not readable:
                img_path.unlink(missing_ok=True)

In [None]:
def fix_format(folder: Path, size=(224, 224)):
    """Convert every image under ``folder/<class>/`` to RGB and resize it in place.

    Each file is opened with PIL, converted to RGB, resized to ``size`` and
    saved back to the same path. A file that cannot be opened, converted or
    saved is deleted. Sub-directories inside a class folder are skipped.

    Args:
        folder: Root directory containing one sub-directory per class.
        size: Target size passed to ``Image.resize``.

    Returns:
        None. Files are rewritten (or deleted) on disk.
    """
    print("\nüé® Standardizing images...")
    for cls in tqdm(os.listdir(folder)):
        cls_path = folder / cls
        if not cls_path.is_dir():
            continue
        for file in os.listdir(cls_path):
            img_path = cls_path / file
            # A directory is not an image, and the delete-on-failure path
            # below would raise on it, so skip it up front.
            if img_path.is_dir():
                continue
            try:
                # Close the source file before overwriting the same path.
                with Image.open(img_path) as opened:
                    img = opened.convert("RGB").resize(size)
                img.save(img_path)
            except Exception:
                # `Exception`, not a bare except: a kernel interrupt must stop
                # the loop instead of deleting the file being processed.
                img_path.unlink(missing_ok=True)

In [None]:
def remove_duplicates(folder: Path):
    """Remove byte-identical files under ``folder/<class>/`` using MD5 hashes.

    Class folders and the files inside them are visited in sorted name order,
    so the first file in that order is the one kept and the outcome does not
    depend on the arbitrary order ``os.listdir`` returns. The set of seen
    hashes is shared across all class folders, so a file whose bytes already
    appeared in an earlier class is removed as well.

    Args:
        folder: Root directory containing one sub-directory per class.

    Returns:
        None. Duplicate files are deleted from disk.
    """
    print("\nüóëÔ∏è Removing duplicates...")
    seen = set()
    for cls in tqdm(sorted(os.listdir(folder))):
        cls_path = folder / cls
        if not cls_path.is_dir():
            continue
        for file in sorted(os.listdir(cls_path)):
            img_path = cls_path / file
            if img_path.is_dir():
                continue
            # MD5 is used only as a content fingerprint here, not for security.
            digest = hashlib.md5(img_path.read_bytes()).hexdigest()
            if digest in seen:
                img_path.unlink(missing_ok=True)
            else:
                seen.add(digest)

In [None]:
def denoise(folder: Path, h=10, h_color=10, template_window=7, search_window=21):
    """Apply OpenCV's coloured non-local-means denoising to each image in place.

    Every file under ``folder/<class>/`` is read with ``cv2.imread``, filtered
    with ``cv2.fastNlMeansDenoisingColored`` and written back to the same
    path. Files that cannot be read, filtered or written are left untouched
    and counted; a one-line summary is printed if any were skipped.

    Args:
        folder: Root directory containing one sub-directory per class.
        h: Third positional argument of ``fastNlMeansDenoisingColored``.
        h_color: Fourth positional argument of the same call.
        template_window: Fifth positional argument of the same call.
        search_window: Sixth positional argument of the same call.

    Returns:
        None. Files are rewritten on disk.
    """
    print("\n‚ú® Denoising images...")
    skipped = 0
    for cls in tqdm(os.listdir(folder)):
        cls_path = folder / cls
        if not cls_path.is_dir():
            continue
        for file in os.listdir(cls_path):
            img_path = cls_path / file
            if img_path.is_dir():
                continue
            written = False
            try:
                img = cv2.imread(str(img_path))
                # imread signals an unreadable file by returning None; do not
                # hand that to the filter.
                if img is not None:
                    img = cv2.fastNlMeansDenoisingColored(
                        img, None, h, h_color, template_window, search_window
                    )
                    written = bool(cv2.imwrite(str(img_path), img))
            except Exception:
                # Best effort per file, but `Exception` rather than a bare
                # except so a kernel interrupt still stops this slow loop.
                written = False
            if not written:
                skipped += 1
    if skipped:
        print(f"Skipped {skipped} file(s) that could not be denoised")

In [None]:
def show_balance(folder: Path):
    """Print the number of files in each class folder, sorted by class name.

    Only sub-directories of ``folder`` are treated as classes. Within a class
    folder, nested directories are not counted, so the figure reflects files
    only.

    Args:
        folder: Root directory containing one sub-directory per class.

    Returns:
        None. One ``<class>: <count>`` line is printed per class.
    """
    print("\nüìä Class Balance:")
    for cls in sorted(os.listdir(folder)):
        cls_path = folder / cls
        if cls_path.is_dir():
            n_files = sum(1 for entry in cls_path.iterdir() if not entry.is_dir())
            print(f"{cls}: {n_files}")

In [None]:
# Run the cleaning pipeline on the working copy, one step after another.
# NOTE(review): every step rewrites CLEAN_DIR in place, so re-running this cell
# applies denoise again to images that were already denoised.
remove_corrupted(folder=CLEAN_DIR)            # drop files OpenCV cannot read
fix_format(folder=CLEAN_DIR, size=IMG_SIZE)   # RGB + resize, saved in place
remove_duplicates(folder=CLEAN_DIR)           # hashes the re-saved files
denoise(folder=CLEAN_DIR)                     # filter and overwrite each image
show_balance(folder=CLEAN_DIR)                # per-class counts after cleaning
