In [1]:
import os
import random
import zipfile

zip_file_path = "data_archive\CelebA\img_align_celeba.zip"
output_path = "data_archive\CelebA"

train_dir = os.path.join(output_path, "train_gt")
val_dir = os.path.join(output_path, "val_gt")
test_dir = os.path.join(output_path, "test_gt")

for d in [train_dir, val_dir, test_dir]:
    os.makedirs(d, exist_ok=True)

try:
    with zipfile.ZipFile(zip_file_path, 'r', allowZip64=True) as zip_ref:
        images = zip_ref.namelist()
        images = [img for img in images if img.endswith(".jpg")]
        random.shuffle(images)
        total_images = len(images)
        train_size = int(0.8 * total_images)
        val_size = int(0.05 * total_images)
        test_size = total_images - train_size - val_size
        train_images = images[:train_size]
        val_images = images[train_size:train_size + val_size]
        test_images = images[train_size + val_size:]

        def extract_and_copy(image_list, target_dir):
            for img in image_list:
                try:
                    img_data = zip_ref.read(img)
                    output_file_path = os.path.join(target_dir, os.path.basename(img))
                    with open(output_file_path, "wb") as f:
                        f.write(img_data)
                except PermissionError as e:
                    print(f"Permission error while accessing {img}: {e}")

        extract_and_copy(train_images, train_dir)
        extract_and_copy(val_images, val_dir)
        extract_and_copy(test_images, test_dir)

    print("Dataset split completed without full extraction!")

except FileNotFoundError:
    print("Error: The specified ZIP file was not found.")
except PermissionError:
    print("Error: Permission denied while accessing the ZIP file. Try running as administrator.")
except Exception as e:
    print(f"An unexpected error occurred: {e}")

  zip_file_path = "data_archive\CelebA\img_align_celeba.zip"
  output_path = "data_archive\CelebA"


Dataset split completed without full extraction!


In [2]:
import os
from PIL import Image
import torch
import torchvision.transforms as transforms

def resize_images_in_folder(folder_path, target_size=(256, 256)):
    # Check if CUDA is available
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f'Using device: {device}')
    
    # Define the transform
    transform = transforms.Compose([
        transforms.Resize(target_size),
        transforms.ToTensor()
    ])
    
    for filename in os.listdir(folder_path):
        if filename.endswith(('.jpg', '.jpeg', '.png')):
            file_path = os.path.join(folder_path, filename)
            with Image.open(file_path) as img:
                if img.size != target_size:
                    # Convert image to tensor and move to device
                    img_tensor = transform(img).unsqueeze(0).to(device)
                    
                    # Convert tensor back to image and save
                    img_resized = transforms.ToPILImage()(img_tensor.squeeze(0).cpu())
                    img_resized.save(file_path)
                    print(f'Resized and saved: {file_path}')
                else:
                    print(f'Skipped (already {target_size}): {file_path}')

In [3]:
resize_images_in_folder('data_archive/CelebA/test_gt')
resize_images_in_folder('data_archive/CelebA/train_gt')
resize_images_in_folder('data_archive/CelebA/val_gt')
# resize_images_in_folder('data_archive/Masks/train')

Using device: cuda
Resized and saved: data_archive/CelebA/test_gt\000013.jpg
Resized and saved: data_archive/CelebA/test_gt\000017.jpg
Resized and saved: data_archive/CelebA/test_gt\000023.jpg
Resized and saved: data_archive/CelebA/test_gt\000030.jpg
Resized and saved: data_archive/CelebA/test_gt\000041.jpg
Resized and saved: data_archive/CelebA/test_gt\000049.jpg
Resized and saved: data_archive/CelebA/test_gt\000050.jpg
Resized and saved: data_archive/CelebA/test_gt\000053.jpg
Resized and saved: data_archive/CelebA/test_gt\000061.jpg
Resized and saved: data_archive/CelebA/test_gt\000063.jpg
Resized and saved: data_archive/CelebA/test_gt\000078.jpg
Resized and saved: data_archive/CelebA/test_gt\000079.jpg
Resized and saved: data_archive/CelebA/test_gt\000083.jpg
Resized and saved: data_archive/CelebA/test_gt\000086.jpg
Resized and saved: data_archive/CelebA/test_gt\000090.jpg
Resized and saved: data_archive/CelebA/test_gt\000098.jpg
Resized and saved: data_archive/CelebA/test_gt\000111

KeyboardInterrupt: 

In [4]:
import os
import cv2
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from glob import glob

def calculate_mask_coverage(mask_path):
    """
    Calculate what percentage of the mask is covered (non-zero).
    
    Args:
        mask_path: Path to the mask image
    
    Returns:
        float: Percentage of mask coverage (0-100)
    """
    # Read the mask
    mask = cv2.imread(mask_path, cv2.IMREAD_GRAYSCALE)
    
    if mask is None:
        print(f"Error loading mask: {mask_path}")
        return 0.0
    
    # Count non-zero pixels
    non_zero = np.count_nonzero(mask)
    
    # Calculate coverage percentage
    total_pixels = mask.shape[0] * mask.shape[1]
    coverage_percent = (non_zero / total_pixels) * 100
    print(f"Coverage: {coverage_percent:.2f}%")
    
    return coverage_percent


# calculate_mask_coverage('data_archive/Masks/train/00001_test.png')

In [5]:
def process_masks_folder(mask_folder, output_folder, min_coverage=20, visualize=False):
    """
    Process a folder of masks:
    1. Check coverage percentage
    2. Delete masks with coverage < min_coverage
    3. Convert remaining masks to binary and invert them
    
    Args:
        mask_folder: Folder containing mask images
        output_folder: Folder to save processed masks
        min_coverage: Minimum coverage percentage required (default: 20%)
        visualize: If True, visualize a sample of processed masks
    
    Returns:
        tuple: (kept_masks, removed_masks) - Lists of kept and removed mask paths
    """
    # Create output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)
    
    # Get all mask files
    mask_paths = glob(os.path.join(mask_folder, "*.png"))
    kept_masks = []
    removed_masks = []
    
    print(f"Processing {len(mask_paths)} masks...")
    
    # Simple progress tracking
    total = len(mask_paths)
    
    for i, mask_path in enumerate(mask_paths):
        # Print progress periodically
        if (i+1) % 10 == 0 or (i+1) == total:
            print(f"Progress: {i+1}/{total} masks ({(i+1)/total*100:.1f}%)")
        
        # Calculate coverage
        coverage = calculate_mask_coverage(mask_path)
        
        # Get mask filename
        mask_filename = os.path.basename(mask_path)
        
        if (coverage >= 85):
            # Add to removed list
            removed_masks.append(mask_path)
            continue
        
        # Keep this mask
        kept_masks.append(mask_path)
        
        # Read mask
        mask = cv2.imread(mask_path, cv2.IMREAD_GRAYSCALE)
        
        # Save inverted binary mask
        output_path = os.path.join(output_folder, mask_filename)
        cv2.imwrite(output_path, mask)
    
    print(f"Kept {len(kept_masks)} masks with ≥{min_coverage}% coverage")
    print(f"Removed {len(removed_masks)} masks with <{min_coverage}% coverage")
    
    return kept_masks, removed_masks

In [6]:

# Example usage
mask_folder = "data_archive/Masks/train"
output_folder = "data_archive/Masks/processed"

kept, removed = process_masks_folder(mask_folder, output_folder, min_coverage=60, visualize=True)

Processing 60000 masks...
Coverage: 88.64%
Coverage: 92.18%
Coverage: 81.06%
Coverage: 86.24%
Coverage: 77.54%
Coverage: 76.60%
Coverage: 92.78%
Coverage: 91.38%
Coverage: 88.20%
Progress: 10/60000 masks (0.0%)
Coverage: 85.12%
Coverage: 84.74%
Coverage: 90.16%
Coverage: 97.97%
Coverage: 89.18%
Coverage: 78.84%
Coverage: 88.54%
Coverage: 76.01%
Coverage: 85.90%
Coverage: 90.15%
Progress: 20/60000 masks (0.0%)
Coverage: 82.86%
Coverage: 96.03%
Coverage: 90.42%
Coverage: 91.96%
Coverage: 62.73%
Coverage: 85.95%
Coverage: 92.07%
Coverage: 92.90%
Coverage: 87.36%
Coverage: 71.64%
Progress: 30/60000 masks (0.1%)
Coverage: 77.57%
Coverage: 88.45%
Coverage: 83.87%
Coverage: 97.73%
Coverage: 84.61%
Coverage: 93.54%
Coverage: 72.68%
Coverage: 83.15%
Coverage: 95.60%
Coverage: 95.19%
Progress: 40/60000 masks (0.1%)
Coverage: 61.55%
Coverage: 93.87%
Coverage: 94.75%
Coverage: 85.76%
Coverage: 73.17%
Coverage: 85.15%
Coverage: 83.43%
Coverage: 89.49%
Coverage: 84.03%
Coverage: 85.73%
Progress: 50/

In [12]:
def create_input_image(input_path, mask_path, output_path):

    mask = cv2.imread(mask_path, cv2.IMREAD_GRAYSCALE)
    img = cv2.imread(input_path)
    masked_image = cv2.bitwise_and(img, img, mask=mask)

    mask = cv2.cvtColor(mask, cv2.COLOR_GRAY2BGR)
    masked_image = masked_image + (mask == 0) * 255 

    return cv2.imwrite(output_path, masked_image)
    

In [15]:
import os
import cv2
import glob
import numpy as np
from tqdm import tqdm

def process_image_folder(input_folder, mask_folder, output_folder, file_ext='.jpg'):
    """
    Process all images in a folder, applying a random mask from the mask_folder
    
    Args:
        input_folder: Path to folder containing original images
        mask_folder: Path to folder containing mask images
        output_folder: Path to save processed images
        file_ext: File extension of images to process (default: .jpg)
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)
    
    # Get lists of all images and masks
    image_paths = glob.glob(os.path.join(input_folder, f'*{file_ext}'))
    mask_paths = glob.glob(os.path.join(mask_folder, '*.png'))  # Assuming masks are PNGs
    
    if not image_paths:
        print(f"No images with extension {file_ext} found in {input_folder}")
        return
    
    if not mask_paths:
        print(f"No mask images found in {mask_folder}")
        return
    
    print(f"Found {len(image_paths)} images and {len(mask_paths)} masks")
    print(f"Processing images from {input_folder} to {output_folder}")
    
    # Process each image with a randomly selected mask
    successful = 0
    failed = 0

    total_images = len(image_paths)
    
    for i, img_path in enumerate(image_paths):
        # Show progress
        if (i+1) % 100 == 0 or (i+1) == len(image_paths):
            print(f"Progress: {i+1}/{len(image_paths)} images ({(i+1)/total_images*100:.1f}%)")
        
        # Get random mask
        mask_path = np.random.choice(mask_paths)
        
        # Get output filename (preserve original filename)
        filename = os.path.basename(img_path)
        output_path = os.path.join(output_folder, filename)
        
        # Process the image
        success = create_input_image(img_path, mask_path, output_path)
        
        if success:
            successful += 1
        else:
            failed += 1
    
    print(f"Done! {successful} images processed successfully, {failed} failed.")
    return successful, failed

In [16]:
process_image_folder(
    input_folder='data_archive/CelebA/train_gt',
    mask_folder='data_archive/Masks/train',
    output_folder='data_archive/CelebA/train_input'
)

Found 162079 images and 60000 masks
Processing images from data_archive/CelebA/train_gt to data_archive/CelebA/train_input
Progress: 100/162079 images (0.1%)
Progress: 200/162079 images (0.1%)
Progress: 300/162079 images (0.2%)
Progress: 400/162079 images (0.2%)
Progress: 500/162079 images (0.3%)
Progress: 600/162079 images (0.4%)
Progress: 700/162079 images (0.4%)
Progress: 800/162079 images (0.5%)
Progress: 900/162079 images (0.6%)
Progress: 1000/162079 images (0.6%)
Progress: 1100/162079 images (0.7%)
Progress: 1200/162079 images (0.7%)
Progress: 1300/162079 images (0.8%)
Progress: 1400/162079 images (0.9%)
Progress: 1500/162079 images (0.9%)
Progress: 1600/162079 images (1.0%)
Progress: 1700/162079 images (1.0%)
Progress: 1800/162079 images (1.1%)
Progress: 1900/162079 images (1.2%)
Progress: 2000/162079 images (1.2%)
Progress: 2100/162079 images (1.3%)
Progress: 2200/162079 images (1.4%)
Progress: 2300/162079 images (1.4%)
Progress: 2400/162079 images (1.5%)
Progress: 2500/162079 

(162079, 0)

In [None]:
process_image_folder(
    input_folder='data_archive/CelebA/test_gt',
    mask_folder='data_archive/Masks/train',
    output_folder='data_archive/CelebA/test_input'
)

Found 30391 images and 19658 masks
Processing images from data_archive/CelebA/test_gt to data_archive/CelebA/test_input
Progress: 10/30391 images (0.0%)
Progress: 20/30391 images (0.1%)
Progress: 30/30391 images (0.1%)
Progress: 40/30391 images (0.1%)
Progress: 50/30391 images (0.2%)
Progress: 60/30391 images (0.2%)
Progress: 70/30391 images (0.2%)
Progress: 80/30391 images (0.3%)
Progress: 90/30391 images (0.3%)
Progress: 100/30391 images (0.3%)
Progress: 110/30391 images (0.4%)
Progress: 120/30391 images (0.4%)
Progress: 130/30391 images (0.4%)
Progress: 140/30391 images (0.5%)
Progress: 150/30391 images (0.5%)
Progress: 160/30391 images (0.5%)
Progress: 170/30391 images (0.6%)
Progress: 180/30391 images (0.6%)
Progress: 190/30391 images (0.6%)
Progress: 200/30391 images (0.7%)
Progress: 210/30391 images (0.7%)
Progress: 220/30391 images (0.7%)
Progress: 230/30391 images (0.8%)
Progress: 240/30391 images (0.8%)
Progress: 250/30391 images (0.8%)
Progress: 260/30391 images (0.9%)
Progr

(30391, 0)

In [9]:
process_image_folder(
    input_folder='data_archive/CelebA/val_gt',
    mask_folder='data_archive/Masks/train',
    output_folder='data_archive/CelebA/val_input'
)

Found 10129 images and 60000 masks
Processing images from data_archive/CelebA/val_gt to data_archive/CelebA/val_input
Progress: 10/10129 images (0.1%)
Progress: 20/10129 images (0.2%)
Progress: 30/10129 images (0.3%)
Progress: 40/10129 images (0.4%)
Progress: 50/10129 images (0.5%)
Progress: 60/10129 images (0.6%)
Progress: 70/10129 images (0.7%)
Progress: 80/10129 images (0.8%)
Progress: 90/10129 images (0.9%)
Progress: 100/10129 images (1.0%)
Progress: 110/10129 images (1.1%)
Progress: 120/10129 images (1.2%)
Progress: 130/10129 images (1.3%)
Progress: 140/10129 images (1.4%)
Progress: 150/10129 images (1.5%)
Progress: 160/10129 images (1.6%)
Progress: 170/10129 images (1.7%)
Progress: 180/10129 images (1.8%)
Progress: 190/10129 images (1.9%)
Progress: 200/10129 images (2.0%)
Progress: 210/10129 images (2.1%)
Progress: 220/10129 images (2.2%)
Progress: 230/10129 images (2.3%)
Progress: 240/10129 images (2.4%)
Progress: 250/10129 images (2.5%)
Progress: 260/10129 images (2.6%)
Progres

KeyboardInterrupt: 

In [1]:
import torch
print(torch.__version__)
print(torch.cuda.is_available())

2.6.0+cu118
True
