In [2]:
from PIL import Image, ImageFile
import os
import concurrent.futures
# Switch to the project root so that relative paths used below (such as
# data_dir) resolve against it.
# NOTE(review): this is an absolute, machine-specific path; the call raises on
# any host where the directory does not exist — consider making it configurable.
os.chdir('/tf-acno-projects/image-classification/')

In [3]:
# Let Pillow load truncated image files instead of raising an error on them.
ImageFile.LOAD_TRUNCATED_IMAGES = True

# Root folder of the dataset, relative to the current working directory.
data_dir = 'faces_data/'

# Lower-case extensions (leading dot included) that are kept by process_image;
# files with any other extension are deleted.
allowed_extensions = [
    '.jpeg',
    '.jpg',
    '.png',
    '.bmp',
]

In [4]:
def is_valid_image(file_path):
    """Report whether Pillow can open and verify the file at ``file_path``.

    Returns:
        True when ``Image.open`` followed by ``verify()`` completes without
        error; False when either step raises ``IOError`` or ``SyntaxError``.
        Any other exception type is not caught here and propagates.
    """
    try:
        with Image.open(file_path) as candidate:
            candidate.verify()
    except (IOError, SyntaxError):
        return False
    else:
        return True

In [5]:
def get_file_extension(file_path):
    """Return the extension of ``file_path`` in lower case, leading dot included.

    A path without an extension yields an empty string.
    """
    _, extension = os.path.splitext(file_path)
    return extension.lower()

In [6]:
# Function to process a single image
def process_image(image_path, target_size=(128, 128)):
    """Validate one image file and normalise it in place to a resized RGB PNG.

    Steps, in order:
      1. Delete the file if its extension is not in ``allowed_extensions``.
      2. Delete the file if ``is_valid_image`` rejects it.
      3. Otherwise convert it to RGB when needed, resize it to ``target_size``
         and overwrite ``image_path`` with PNG data (no ICC profile).
    If step 3 raises, the error is printed and the file is deleted.

    Args:
        image_path: Path of the file to check and rewrite.
        target_size: ``(width, height)`` handed to ``Image.resize``.
            Defaults to ``(128, 128)``.

    Returns:
        None. All results are side effects on the file system and stdout.
    """
    # Check file extension
    if get_file_extension(image_path) not in allowed_extensions:
        print(f"Removed {image_path} due to invalid extension.")
        os.remove(image_path)
        return

    # Verify image integrity. verify() leaves the image object unusable, which
    # is why the file is opened a second time below.
    if not is_valid_image(image_path):
        print(f"Removed {image_path} due to file corruption or invalid format.")
        os.remove(image_path)
        return

    try:
        with Image.open(image_path) as source:
            # Convert every non-RGB mode, not only P/L/RGBA. Modes such as CMYK
            # or YCbCr cannot be written as PNG, so leaving them unconverted
            # made save() raise and the handler below delete a valid image.
            if source.mode != 'RGB':
                rgb_image = source.convert('RGB')
                print(f"Converting {os.path.splitext(image_path)[0]} to RGB")
            else:
                rgb_image = source
            resized = rgb_image.resize(target_size)
            # Save image without ICC profile to avoid iCCP warnings.
            # NOTE(review): the data is written as PNG under the original file
            # name, so a .jpg/.jpeg/.bmp path holds PNG bytes afterwards.
            resized.save(image_path, format="PNG", icc_profile=None)

    except Exception as e:
        # Best effort: anything that cannot be normalised is dropped from the
        # dataset rather than aborting the whole run.
        print(f"Error processing {image_path}: {str(e)}")
        os.remove(image_path)

In [7]:
# Function to process images in parallel
def process_images_in_parallel():
    """Run ``process_image`` over every file in each class folder of ``data_dir``.

    ``data_dir`` is expected to contain one sub-directory per class; entries of
    ``data_dir`` that are not directories are ignored. Inside each class
    folder only regular files are collected. The files are then handed to a
    ``ProcessPoolExecutor``; the function returns once every task has finished.

    Exceptions that escape ``process_image`` in a worker are printed with the
    offending path instead of being silently dropped, and do not stop the
    remaining files from being processed.

    Returns:
        None.
    """
    image_paths = []
    # Walk through all class directories and collect all image paths
    for image_class in os.listdir(data_dir):
        class_dir = os.path.join(data_dir, image_class)
        if not os.path.isdir(class_dir):
            continue

        for image_name in os.listdir(class_dir):
            image_path = os.path.join(class_dir, image_name)
            # Skip nested directories: they have no allowed extension, so
            # process_image would call os.remove() on them and raise.
            if os.path.isfile(image_path):
                image_paths.append(image_path)

    # Process images in parallel
    with concurrent.futures.ProcessPoolExecutor() as executor:
        # submit + as_completed instead of a bare executor.map(): the iterator
        # returned by map() was never consumed, so worker errors were lost.
        futures = {
            executor.submit(process_image, image_path): image_path
            for image_path in image_paths
        }
        for future in concurrent.futures.as_completed(futures):
            error = future.exception()
            if error is not None:
                print(f"Unhandled error while processing {futures[future]}: {error}")

In [8]:
# Run the full clean-up pass over data_dir. This is destructive: files are
# deleted or overwritten in place, so keep a copy of the raw dataset elsewhere.
process_images_in_parallel()
print("Image preprocessing completed.")

Image preprocessing completed.
