In [None]:
# Install required packages
!pip install rembg onnxruntime pillow numpy opencv-python albumentations tqdm psutil

import os
import cv2
import numpy as np
from tqdm import tqdm
import albumentations as A
import random
from PIL import Image
import shutil
import zipfile  # Add zipfile module for creating zip archives
import gc
import psutil  # For monitoring memory usage

# Import rembg after installation
from rembg import remove, new_session

# Input and output paths (Kaggle notebook layout).
# input_dir: source dataset; main() treats each sub-directory as one class.
input_dir = "/kaggle/input/indian-sign-language-self-creation/isl_dataset"
# output_dir: segmented + augmented images are written here, one folder per class.
output_dir = "/kaggle/working/augmented_isl_dataset"
# background_dir: pool of replacement backgrounds scanned by load_background_images().
background_dir = "/kaggle/input/house-rooms-streets-image-dataset/kaggle_room_street_data/house_data"
# temp_dir: scratch directory emptied by clean_temp_files().
temp_dir = "/kaggle/working/temp"

# Function to clean memory
def clean_memory():
    """Run a garbage-collection pass, then print this process's resident memory in MB."""
    gc.collect()
    # RSS of the current interpreter process, in bytes.
    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    print(f"Memory usage: {rss_bytes / 1024 / 1024:.2f} MB")

# Function to check available disk space
def check_disk_space(min_required_mb=500, path="/kaggle/working"):
    """Report free disk space and tell whether it meets a minimum.

    Args:
        min_required_mb: Minimum amount of free space, in megabytes, that
            counts as "enough".
        path: Any path on the filesystem to inspect. Defaults to the Kaggle
            working directory, which is what every existing caller relies on.

    Returns:
        True if at least ``min_required_mb`` MB are free on the filesystem
        containing ``path``, otherwise False.

    Raises:
        OSError: If ``path`` does not exist or cannot be queried.
    """
    free_mb = shutil.disk_usage(path).free / (1024 * 1024)
    print(f"Available disk space: {free_mb:.2f} MB")
    return free_mb >= min_required_mb

# Create the output and scratch directories up front (no error if they already exist)
os.makedirs(output_dir, exist_ok=True)
os.makedirs(temp_dir, exist_ok=True)

# Create the rembg session once at module level; remove_background() reuses it
# for every image instead of building a new session per call.
print("Loading segmentation model...")
# You can try different models: "u2net_human_seg" (human-specific) or "u2net" (general)
model_name = "u2net_human_seg"  # Human-specific segmentation model
# Alternative kept for experimentation:
# model_name = "birefnet-portrait"
session = new_session(model_name)
print(f"Using segmentation model: {model_name}")

# Function to load background images
def load_background_images(bg_dir, max_images=100):
    """Collect paths of usable background images found directly in *bg_dir*.

    A file qualifies when its name ends in .jpg/.jpeg/.png (case-insensitive)
    and the text before its first underscore is exactly one of
    'bed', 'din', 'kitchen' or 'living'. Only paths are gathered; no image
    data is read here.

    Args:
        bg_dir: Directory to scan (not recursive).
        max_images: Scanning stops as soon as this many paths were collected.

    Returns:
        List of matching file paths, in ``os.listdir`` order.
    """
    print("Loading background images...")
    image_suffixes = ('.jpg', '.jpeg', '.png')
    room_prefixes = ['bed', 'din', 'kitchen', 'living']
    selected = []

    for name in os.listdir(bg_dir):
        if not name.lower().endswith(image_suffixes):
            continue

        # Names without an underscore carry no room prefix and are skipped.
        head, separator, _ = name.partition('_')
        if not separator or head not in room_prefixes:
            continue

        selected.append(os.path.join(bg_dir, name))
        # The cap is tested after appending, so at least one match is kept.
        if len(selected) >= max_images:
            break

    print(f"Found {len(selected)} valid background images")
    return selected

# Collect up to 1000 background image *paths*. No pixel data is read here;
# remove_background() reads one randomly chosen file per processed image.
background_images = load_background_images(background_dir, max_images=1000)
clean_memory()

# Function for histogram equalization to normalize brightness
def equalize_brightness(image):
    """
    Normalize global brightness by histogram-equalizing the HSV value channel.

    Hue and saturation are passed through unchanged; only the V channel is
    equalized, and the result is converted back to BGR.

    Args:
        image: Input image as numpy array (BGR format)

    Returns:
        BGR image with an equalized value channel
    """
    hue, saturation, value = cv2.split(cv2.cvtColor(image, cv2.COLOR_BGR2HSV))
    equalized_hsv = cv2.merge([hue, saturation, cv2.equalizeHist(value)])
    return cv2.cvtColor(equalized_hsv, cv2.COLOR_HSV2BGR)

# Function for unsharp masking to sharpen images
def unsharp_mask(image, kernel_size=(5, 5), sigma=1.0, amount=1.0, threshold=0):
    """
    Apply unsharp masking to sharpen an image.

    The sharpened image is ``(amount + 1) * image - amount * blurred``,
    clipped to [0, 255] and returned as uint8.

    Args:
        image: Input image as numpy array
        kernel_size: Size of Gaussian blur kernel
        sigma: Standard deviation of Gaussian blur
        amount: Weight of sharpening effect (higher = sharper)
        threshold: Minimum absolute difference between the image and its
            blurred version for sharpening to be applied; pixels below it
            keep their original value. 0 disables the check.

    Returns:
        Sharpened image (uint8)
    """
    # Create the blurred version of the image
    blurred = cv2.GaussianBlur(image, kernel_size, sigma)

    # Boost the high-frequency detail; the float factors promote the
    # arithmetic to floating point, so nothing overflows before clipping.
    sharpened = float(amount + 1) * image - float(amount) * blurred
    sharpened = np.clip(sharpened, 0, 255).astype(np.uint8)

    if threshold > 0:
        # Take the difference in float: subtracting uint8 arrays directly
        # wraps around (3 - 5 == 254), which hid every pixel that is darker
        # than its blurred neighbourhood from the low-contrast mask.
        difference = np.abs(image.astype(np.float32) - blurred.astype(np.float32))
        low_contrast_mask = difference < threshold
        sharpened[low_contrast_mask] = image[low_contrast_mask]

    # No explicit `del` of the temporaries: the mask only exists when
    # threshold > 0, so deleting it unconditionally raised UnboundLocalError
    # for the default threshold. Locals are released on return anyway.
    return sharpened

# Function for adaptive brightness normalization using CLAHE
def adaptive_brightness_normalization(image, clip_limit=2.0, tile_grid_size=(8, 8)):
    """
    Normalize brightness locally with CLAHE (Contrast Limited Adaptive
    Histogram Equalization) applied to the lightness channel in LAB space.

    Args:
        image: Input image (BGR format)
        clip_limit: Threshold for contrast limiting
        tile_grid_size: Size of grid for histogram equalization

    Returns:
        BGR image whose L channel has been passed through CLAHE
    """
    lightness, chan_a, chan_b = cv2.split(cv2.cvtColor(image, cv2.COLOR_BGR2LAB))

    # Only lightness is equalized; the two colour channels are reused as-is.
    equalizer = cv2.createCLAHE(clipLimit=clip_limit, tileGridSize=tile_grid_size)
    normalized_lab = cv2.merge([equalizer.apply(lightness), chan_a, chan_b])

    return cv2.cvtColor(normalized_lab, cv2.COLOR_LAB2BGR)

def _random_solid_background(like):
    """Return an array shaped and typed like *like*, filled with one random RGB colour."""
    color = [random.randint(0, 255) for _ in range(3)]
    background = np.empty_like(like)
    background[:] = color  # broadcast the 3-element colour over every pixel
    return background


def _load_scene_background(bg_path, width, height):
    """Read *bg_path*, resize it to (width, height), equalize brightness and return it as RGB."""
    background = cv2.imread(bg_path)
    if background is None:
        raise ValueError(f"Could not load background image: {bg_path}")
    background = cv2.resize(background, (width, height))
    # equalize_brightness works on BGR, so convert to RGB only at the very end.
    return cv2.cvtColor(equalize_brightness(background), cv2.COLOR_BGR2RGB)


def remove_background(image, background_images):
    """Segment the person out of *image* and paste them onto a new background.

    Steps: CLAHE brightness normalization, unsharp masking, rembg
    segmentation (using the module-level ``session``), mask clean-up by
    morphological closing plus one dilation, desaturation of pixels inside a
    fixed HSV "skin" range, and compositing over a randomly chosen
    background (or a random solid colour when none can be loaded).
    The original image dimensions are preserved.

    Args:
        image: Input image as a BGR numpy array (as returned by cv2.imread).
        background_images: List of background image file paths; may be empty.

    Returns:
        RGB uint8 numpy array. If any step fails, the error is printed and
        the unprocessed input is returned converted to RGB, so the caller
        can treat every return value as RGB.
    """
    original = image  # kept untouched for the failure fallback

    try:
        # Choose between standard histogram equalization and adaptive CLAHE
        use_adaptive = True  # Set to False to use standard equalization
        if use_adaptive:
            image = adaptive_brightness_normalization(image, clip_limit=2.0, tile_grid_size=(8, 8))
        else:
            image = equalize_brightness(image)

        # Apply unsharp masking to sharpen the image
        image = unsharp_mask(image, kernel_size=(5, 5), sigma=1.0, amount=1.5, threshold=5)

        # Convert OpenCV image (BGR) to PIL Image (RGB)
        if isinstance(image, np.ndarray):
            pil_image = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
        else:
            pil_image = image

        # Remove background using rembg
        output_image = remove(pil_image,
                              session=session,
                              alpha_matting=True,
                              alpha_matting_foreground_threshold=240,
                              alpha_matting_background_threshold=10,
                              alpha_matting_erode_size=10,
                              post_process_mask=True)
        output_array = np.array(output_image)
        del output_image

        # Without an alpha channel there is no mask to composite with:
        # hand back the pre-processed RGB image. pil_image must still be
        # alive here (it used to be deleted before this point).
        if output_array.ndim != 3 or output_array.shape[2] != 4:
            return np.array(pil_image)
        del pil_image

        rgb_image = output_array[:, :, :3]

        # Binarize the alpha channel, close small holes, then dilate once to
        # recover body parts the segmentation may have clipped.
        binary_mask = (output_array[:, :, 3] / 255.0 > 0.5).astype(np.uint8)
        kernel = np.ones((5, 5), np.uint8)
        binary_mask = cv2.morphologyEx(binary_mask, cv2.MORPH_CLOSE, kernel)
        binary_mask = cv2.dilate(binary_mask, kernel, iterations=1)
        mask = binary_mask.astype(float)
        del binary_mask, kernel

        # Desaturate pixels that fall inside a basic HSV skin-tone range
        # (a rough heuristic; the range may need adjustment).
        hsv_image = cv2.cvtColor(rgb_image, cv2.COLOR_RGB2HSV)
        lower_skin = np.array([0, 20, 70], dtype=np.uint8)
        upper_skin = np.array([20, 150, 255], dtype=np.uint8)
        skin_mask = cv2.inRange(hsv_image, lower_skin, upper_skin)
        hsv_image[skin_mask > 0, 1] = 30  # Lower saturation value
        rgb_image = cv2.cvtColor(hsv_image, cv2.COLOR_HSV2RGB)
        del hsv_image, skin_mask

        # Pick the replacement background
        if background_images:
            bg_path = random.choice(background_images)
            height, width = rgb_image.shape[:2]
            try:
                background = _load_scene_background(bg_path, width, height)
            except Exception as e:
                print(f"Error loading background image ({bg_path}): {str(e)}")
                # Fallback to a solid color background if loading fails
                background = _random_solid_background(rgb_image)
        else:
            print("Warning: No background images available, using solid color.")
            background = _random_solid_background(rgb_image)

        # Merge foreground and background; the mask broadcasts over channels.
        mask_3d = mask[:, :, np.newaxis]
        segmented_image = (rgb_image * mask_3d + background * (1 - mask_3d)).astype(np.uint8)

        del mask, mask_3d, rgb_image, background, output_array
        return segmented_image

    except Exception as e:
        print(f"Error in remove_background: {str(e)}")
        # Return the original image, converted to RGB so it matches the
        # colour order of the success path (the caller converts RGB -> BGR
        # before saving; returning BGR here swapped red and blue on disk).
        if isinstance(original, np.ndarray) and original.ndim == 3 and original.shape[2] == 3:
            return cv2.cvtColor(original, cv2.COLOR_BGR2RGB)
        return original

# Augmentation pipeline applied to each segmented image in process_class_batch().
# Every transform carries its own probability `p`, so one call applies a
# random subset of them.
# NOTE(review): several argument names used below (e.g. var_limit, alpha_affine)
# may differ between albumentations releases - confirm against the installed version.
augmentation = A.Compose([
    # Color transformations
    A.RandomBrightnessContrast(brightness_limit=0.2, contrast_limit=0.2, p=0.4),
    A.HueSaturationValue(hue_shift_limit=10, sat_shift_limit=15, val_shift_limit=10, p=0.3),
    A.CLAHE(clip_limit=2.0, tile_grid_size=(8, 8), p=0.2),
    A.ToGray(p=0.05),  # Rare grayscale conversion

    # Additional color manipulations
    A.RGBShift(r_shift_limit=15, g_shift_limit=15, b_shift_limit=15, p=0.2),
    A.ChannelShuffle(p=0.05),  # Rare channel permutation
    A.ChannelDropout(channel_drop_range=(1, 1), fill_value=0, p=0.05),  # Rarely zero out exactly one channel

    # Advanced color transforms with low probabilities
    A.RandomGamma(gamma_limit=(80, 120), p=0.1),  # Random gamma adjustment
    A.FancyPCA(alpha=0.1, p=0.05),  # Add color perturbations along principal components

    # Noise and blur effects (kept mild)
    A.GaussNoise(var_limit=(1.0, 8.0), p=0.1),  # Low-variance Gaussian noise
    A.GaussianBlur(blur_limit=(1, 3), p=0.1),  # Small-kernel Gaussian blur
    A.MotionBlur(blur_limit=3, p=0.05),  # Small-kernel motion blur
    A.ISONoise(color_shift=(0.01, 0.02), intensity=(0.05, 0.2), p=0.1),  # Mild sensor-style noise
    A.PixelDropout(dropout_prob=0.001, p=0.05),  # Drops roughly 0.1% of pixels when applied

    # Additional blur and texture effects
    A.AdvancedBlur(blur_limit=(3, 5), p=0.05),  # More sophisticated blur
    A.Defocus(radius=(2, 4), alias_blur=(0.1, 0.5), p=0.05),  # Defocus blur
    A.Superpixels(p_replace=0.1, n_segments=100, p=0.05),  # Superpixel segmentation
    A.UnsharpMask(blur_limit=(3, 5), p=0.05),  # Sharpening through unsharp mask
    A.RingingOvershoot(blur_limit=(3, 5), p=0.05),  # Ringing / overshoot artifacts

    # Geometric transformations
    A.ShiftScaleRotate(shift_limit=0.1, scale_limit=0.15, rotate_limit=15, 
                      border_mode=cv2.BORDER_CONSTANT, p=0.4),  # Constant-colour border where the image is moved away
    A.Perspective(scale=(0.05, 0.15), p=0.3),  # Perspective warp
    A.GridDistortion(num_steps=5, distort_limit=0.2, p=0.15),  # Grid-based warp
    A.ElasticTransform(alpha=1, sigma=50, alpha_affine=50, p=0.1),  # Elastic deformation

    # Lens distortion simulation
    A.OpticalDistortion(distort_limit=0.2, shift_limit=0.1, p=0.15),  # Lens-style distortion
])

# Number of augmented copies written per source image, in addition to the
# segmented original (see process_class_batch).
num_augmentations = 4

# Function to clean temporary files to free up space
def clean_temp_files():
    """Delete every regular file directly inside ``temp_dir``.

    Sub-directories are left alone. A missing ``temp_dir`` is not an error;
    the completion message is printed in either case.
    """
    if os.path.exists(temp_dir):
        candidates = (os.path.join(temp_dir, entry) for entry in os.listdir(temp_dir))
        for candidate in candidates:
            if os.path.isfile(candidate):
                os.remove(candidate)
    print("Temporary files cleaned.")

# Batch processing function to handle subsets of classes
def process_class_batch(class_batch, batch_index, total_batches):
    """Segment and augment every image of the given class folders.

    For each class folder under ``input_dir`` this writes, into the matching
    folder under ``output_dir``, one segmented image ``<name>.png`` plus
    ``num_augmentations`` augmented copies ``<name>_aug<k>.png``.

    Args:
        class_batch: Names of class sub-directories of ``input_dir`` to process.
        batch_index: Zero-based index of this batch (used for the progress label only).
        total_batches: Total number of batches (used for the progress label only).

    Returns:
        None. Returns early, abandoning the rest of the batch, when less than
        100 MB of disk space remains after cleaning temporary files.

    Side effects:
        Increments the module-level ``processed_count``, which must already
        exist (main() initialises it) before this function is called.
    """
    global processed_count
    
    for class_folder in tqdm(class_batch, desc=f"Processing batch {batch_index+1}/{total_batches}"):
        class_path = os.path.join(input_dir, class_folder)
        
        # Skip if not a directory
        if not os.path.isdir(class_path):
            continue
        
        # Create output class folder
        output_class_dir = os.path.join(output_dir, class_folder)
        os.makedirs(output_class_dir, exist_ok=True)
        
        # Get all images in this class
        image_files = [f for f in os.listdir(class_path) 
                      if f.lower().endswith(('.png', '.jpg', '.jpeg'))]
        
        # Cap the work per class: only the first 100 names, in os.listdir
        # order, are processed.
        max_images_per_class = 100  # Limit of images per class
        if len(image_files) > max_images_per_class:
            image_files = image_files[:max_images_per_class]
        
        # Process each image
        for img_file in tqdm(image_files, desc=f"Processing {class_folder}", leave=False):
            # Check disk space before processing (this prints the free space
            # for every image)
            if not check_disk_space(min_required_mb=200):  # Need at least 200MB free
                print("WARNING: Low disk space. Cleaning temporary files...")
                clean_temp_files()
                clean_memory()
                # Still below 100 MB after cleaning: give up on this batch.
                if not check_disk_space(min_required_mb=100):
                    print("CRITICAL: Extremely low disk space. Skipping further processing.")
                    return
            
            img_path = os.path.join(class_path, img_file)
            
            # Skip images whose segmented output already exists. Its
            # augmented copies are not checked.
            base_name, _ = os.path.splitext(img_file)
            original_output_path = os.path.join(output_class_dir, base_name + ".png")
            if os.path.exists(original_output_path):
                continue
            
            # Read the image (BGR); cv2.imread returns None on failure
            image = cv2.imread(img_path)
            if image is None:
                print(f"Warning: Could not read {img_path}")
                continue
            
            try:
                # Remove background and add a new one; the result is treated as RGB below
                segmented_image = remove_background(image, background_images)
                
                # Free original image memory. After this point the except
                # branch below can no longer fall back to the source image.
                del image
                
                # Save segmented original image as PNG (cv2.imwrite defaults)
                cv2.imwrite(original_output_path, cv2.cvtColor(segmented_image, cv2.COLOR_RGB2BGR))
                
                # Generate the augmented versions
                for i in range(num_augmentations):
                    augmented = augmentation(image=segmented_image)
                    aug_image = augmented['image']
                    
                    # Create augmented filename (1-based suffix)
                    aug_filename = f"{base_name}_aug{i+1}.png"
                    aug_output_path = os.path.join(output_class_dir, aug_filename)
                    
                    # Save augmented image as PNG (cv2.imwrite defaults)
                    cv2.imwrite(aug_output_path, cv2.cvtColor(aug_image, cv2.COLOR_RGB2BGR))
                    
                    # Free augmented image memory
                    del aug_image
                
                # Free segmented image memory
                del segmented_image
                
                processed_count += 1
                
                # Collect garbage and report memory every 10 processed images
                if processed_count % 10 == 0:
                    clean_memory()
                    
            except Exception as e:
                print(f"Error processing {img_path}: {str(e)}")
                # If the failure happened before `del image`, save the
                # unmodified source image (already BGR) in place of the
                # segmented one.
                if 'image' in locals():
                    cv2.imwrite(original_output_path, image)
                    del image
        
        # Clean up after each class to free space
        clean_temp_files()
        clean_memory()

# Function to zip processed files in smaller batches
def create_zip_in_batches(source_dir, zip_path, batch_size=100):
    """
    Move the contents of a directory tree into a zip archive, file by file.

    Each file is deleted from disk right after it has been added to the
    archive, so the archive and the source tree never both exist in full.
    A file is only ever deleted once it is in the archive; files that could
    not be added are reported and left in place.

    Args:
        source_dir: Directory to zip. Archive member names are relative to
            its parent, so they start with the directory's own name.
        zip_path: Path to save the zip file (overwritten if it exists)
        batch_size: Number of files between progress reports / garbage
            collection passes. Values below 1 are treated as 1.

    Returns:
        True if every file was archived, False if some files failed or the
        run stopped early because less than 50 MB of disk space was left.
    """
    print(f"Creating zip archive of {source_dir} in batches...")

    batch_size = max(1, batch_size)  # guard the modulo below against 0
    total_files = sum(len(files) for _, _, files in os.walk(source_dir))
    archive_base = os.path.dirname(source_dir)

    file_count = 0   # files successfully archived
    attempted = 0    # files visited, archived or not
    batch_count = 0
    stopped_early = False

    with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for root, _, files in os.walk(source_dir):
            for file in files:
                # Batches are counted on attempted files so a failed write
                # cannot make the same batch header repeat.
                if attempted % batch_size == 0:
                    batch_count += 1
                    print(f"Processing batch {batch_count} ({file_count}/{total_files} files)")
                    # Force garbage collection between batches
                    clean_memory()

                    # Only warn here. Every file still on disk has NOT been
                    # archived yet, so deleting any of them to gain space
                    # would silently lose data.
                    if not check_disk_space(min_required_mb=100):
                        print("WARNING: Low disk space during zip creation.")
                attempted += 1

                file_path = os.path.join(root, file)
                arc_path = os.path.relpath(file_path, archive_base)

                try:
                    # Add file to zip, then delete the original to save space
                    zipf.write(file_path, arc_path)
                    os.remove(file_path)
                    file_count += 1
                except Exception as e:
                    # Best effort: report and carry on with the next file.
                    print(f"Error adding {file_path} to zip: {str(e)}")

                # Check if we need to stop due to space issues
                if attempted % 10 == 0 and not check_disk_space(min_required_mb=50):
                    print("CRITICAL: Extremely low disk space. Stopping zip creation.")
                    stopped_early = True
                    break

            if stopped_early:
                # Leave the whole walk, not just the current directory.
                break

            # Clean empty directories
            if root != source_dir and not os.listdir(root):
                os.rmdir(root)

    return not stopped_early and file_count == total_files

# Main processing function
def _output_has_files():
    """Return True if any file is still present anywhere under ``output_dir``."""
    return any(files for _, _, files in os.walk(output_dir))


def main():
    """Run the whole pipeline: process all classes, then archive the results.

    Class folders of ``input_dir`` are processed five at a time. Whenever
    less than 500 MB of disk space is left after a batch, the output
    produced so far is moved into its own numbered partial archive. At the
    end the remaining output is moved into the final archive and the
    working directories are removed.

    The output directory is only deleted when no files are left in it, so
    images that could not be archived are never discarded.

    Any exception is reported and followed by a best-effort cleanup of
    temporary files; it is not re-raised.
    """
    global processed_count
    processed_count = 0

    try:
        # Get all class folders
        class_folders = [d for d in os.listdir(input_dir) if os.path.isdir(os.path.join(input_dir, d))]

        # Process in smaller batches
        batch_size = 5  # Process 5 classes at a time
        num_batches = (len(class_folders) + batch_size - 1) // batch_size

        for i in range(num_batches):
            batch = class_folders[i * batch_size:(i + 1) * batch_size]

            # Process this batch
            process_class_batch(batch, i, num_batches)

            # Check if we should continue
            if not check_disk_space(min_required_mb=500):
                print("Low disk space. Creating zip archive of processed data so far...")
                # One archive per batch: the zip is opened in 'w' mode, so
                # reusing a single name would overwrite the previous partial
                # archive after its source images were already deleted.
                partial_zip = f"/kaggle/working/augmented_isl_dataset_partial_{i + 1}.zip"
                create_zip_in_batches(output_dir, partial_zip)

                if _output_has_files():
                    # Leftovers stay on disk and go into a later archive.
                    print("Some files could not be archived yet; keeping them in place.")
                else:
                    # Clean up the output directory after zipping
                    shutil.rmtree(output_dir)
                    os.makedirs(output_dir, exist_ok=True)

        print(f"Dataset processing complete. Augmented dataset saved to {output_dir}")
        print(f"The dataset maintains the original directory structure with {num_augmentations} augmented copies per image.")
        print(f"Total processed images: {processed_count}")

        # Create a zip archive of the processed dataset in batches
        print("Creating final zip archive...")
        zip_filename = "/kaggle/working/augmented_isl_dataset.zip"
        create_zip_in_batches(output_dir, zip_filename)

        if _output_has_files():
            print(f"Some files could not be archived; keeping {output_dir}.")
        else:
            # Remove the unzipped directory to save space
            print(f"Removing the unzipped directory {output_dir} to save space...")
            shutil.rmtree(output_dir)
            print("Unzipped directory removed successfully.")

        # Clean up temp directory
        clean_temp_files()
        if os.path.exists(temp_dir):
            shutil.rmtree(temp_dir)

        # Final memory cleanup
        clean_memory()

    except Exception as e:
        print(f"Error in main processing: {str(e)}")
        # Try to clean up even if there's an error
        clean_temp_files()
        clean_memory()

# Run the pipeline only when this file is executed as the main module
# (not when it is imported).
if __name__ == "__main__":
    main() 