In [5]:
import os
from PIL import Image
from tqdm import tqdm

In [6]:
# Paths (absolute Windows locations; raw strings keep the backslashes literal)
# source_dir  - directory tree whose images main() passes to resize_images()
# target_dir  - directory that receives the resized copies
# labels_file - labels CSV; in this cell it is only echoed in main()'s final message
source_dir = r"E:\IRP_dataset_new\IRP_combined_processed_images"
target_dir = r"E:\IRP_dataset_new\IRP_Final_Images"
labels_file = r"E:\IRP_dataset_new\APTOS_labels_combined.csv"

In [7]:
# Create target directory if it doesn't exist.
# This runs when the cell is executed (not inside main()); exist_ok=True makes
# it a no-op when the directory is already there.
os.makedirs(target_dir, exist_ok=True)

def resize_images(source_directory, target_directory, size=(224, 224)):
    """
    Resize all images from source directory to the specified size and save to target directory.
    Maintains the original filename to preserve ID codes.

    The source tree is walked recursively, but every result is written flat
    into ``target_directory``. Two source files that share a filename would
    therefore map to the same output file; the first one that is resized
    successfully wins and any later one is reported as a failure instead of
    silently overwriting it.

    Args:
        source_directory: Root directory searched recursively for images.
        target_directory: Directory that receives the resized images. It is
            created if it does not exist.
        size: ``(width, height)`` handed to ``Image.resize``.

    Returns:
        A tuple ``(successful, failed, failed_files)`` where ``failed_files``
        is a list of ``(filename, error message)`` pairs.
    """
    # Valid image extensions (a tuple so str.endswith can test them in one call)
    extensions = ('.jpg', '.jpeg', '.png', '.bmp', '.tif', '.tiff')

    # Do not depend on the caller having created the output directory
    os.makedirs(target_directory, exist_ok=True)

    # Get list of files to process
    image_files = []
    for root, _, files in os.walk(source_directory):
        for file in files:
            if file.lower().endswith(extensions):
                image_files.append(os.path.join(root, file))

    print(f"Found {len(image_files)} images to resize")

    # Nothing to process: skip the (empty) progress bar entirely
    if not image_files:
        return 0, 0, []

    # Track progress and any errors
    successful = 0
    failed = 0
    failed_files = []

    # Output name -> source path that produced it, used to detect collisions
    # caused by flattening nested source folders into one target folder.
    written = {}

    # Process each image with a progress bar
    for file_path in tqdm(image_files, desc="Resizing images"):
        # Bound before the try block so the except handler can always report it
        filename = os.path.basename(file_path)

        # normcase so names differing only by case collide on case-insensitive
        # platforms (it lower-cases on Windows and is a no-op on POSIX).
        name_key = os.path.normcase(filename)
        if name_key in written:
            message = f"duplicate filename, output already written from {written[name_key]}"
            failed += 1
            failed_files.append((filename, message))
            print(f"\nError processing {filename}: {message}")
            continue

        try:
            # Open and resize the image
            with Image.open(file_path) as img:
                # Resize with the Lanczos resampling filter
                resized_img = img.resize(size, Image.LANCZOS)

                # Save to target directory with same filename
                target_path = os.path.join(target_directory, filename)

                # No explicit format is given, so Pillow picks it from the
                # filename extension; quality=95 is passed through to the writer.
                resized_img.save(target_path, quality=95)
        except Exception as e:
            # Deliberate best-effort: one bad file must not abort the whole run
            failed += 1
            failed_files.append((filename, str(e)))
            print(f"\nError processing {filename}: {e}")
        else:
            successful += 1
            written[name_key] = file_path

    return successful, failed, failed_files

def verify_resize_results(source_directory, target_directory, expected_size=(224, 224)):
    """
    Verify all images were resized correctly by checking:
    1. All source images have a corresponding target image (matched by filename)
    2. All target images have the expected size

    Args:
        source_directory: Root directory searched recursively for source images.
        target_directory: Root directory searched recursively for resized images.
        expected_size: ``(width, height)`` every target image must have.
            Defaults to ``(224, 224)``.

    Returns:
        A dict with the keys:
        ``total_source`` / ``total_target`` - number of image files found,
        ``missing_files`` / ``missing_count`` - sorted source filenames with no
        target of the same name,
        ``incorrect_size`` / ``incorrect_size_count`` - ``(filename, size)``
        pairs for targets whose size differs from ``expected_size``; ``size``
        is ``None`` when the file could not be opened at all.
    """
    extensions = ('.jpg', '.jpeg', '.png', '.bmp', '.tif', '.tiff')
    # Normalise so a list such as [224, 224] still compares equal to img.size
    expected_size = tuple(expected_size)

    # Get lists of source and target files (os.walk already yields bare filenames)
    source_files = []
    for _, _, files in os.walk(source_directory):
        source_files.extend(f for f in files if f.lower().endswith(extensions))

    target_files = []
    incorrect_size = []

    for root, _, files in os.walk(target_directory):
        for file in files:
            if not file.lower().endswith(extensions):
                continue
            target_files.append(file)

            # Check dimensions
            try:
                with Image.open(os.path.join(root, file)) as img:
                    actual_size = img.size
            except Exception as e:
                print(f"Error checking size of {file}: {e}")
                # An unreadable output is not a valid resize result, so it must
                # make the verification fail rather than only being printed.
                incorrect_size.append((file, None))
                continue

            if actual_size != expected_size:
                incorrect_size.append((file, actual_size))

    # Find missing files; sorted so the reported sample is reproducible
    missing_files = sorted(set(source_files) - set(target_files))

    return {
        'total_source': len(source_files),
        'total_target': len(target_files),
        'missing_files': missing_files,
        'missing_count': len(missing_files),
        'incorrect_size': incorrect_size,
        'incorrect_size_count': len(incorrect_size)
    }

def main():
    """Resize everything under source_dir into target_dir, then verify and report.

    Prints a summary of the resize pass (including up to ten failures), runs
    verify_resize_results() on the two directories, and prints up to ten
    missing files and up to ten wrongly sized files. The labels file is only
    mentioned in the closing note; it is never opened here.
    """
    for banner in (
        "Starting image resize process...",
        f"Source directory: {source_dir}",
        f"Target directory: {target_dir}",
    ):
        print(banner)

    # Resize pass
    ok_count, error_count, error_log = resize_images(source_dir, target_dir)

    print("\nResize completed:")
    print(f"  Successfully resized: {ok_count} images")
    print(f"  Failed to resize: {error_count} images")

    if error_count > 0:
        print("\nFirst 10 failed files:")
        for position, (name, reason) in enumerate(error_log[:10], start=1):
            print(f"  {position}. {name}: {reason}")

    # Verification pass
    print("\nVerifying results...")
    report = verify_resize_results(source_dir, target_dir)
    missing_total = report['missing_count']
    wrong_size_total = report['incorrect_size_count']

    print(f"Source images: {report['total_source']}")
    print(f"Resized images: {report['total_target']}")

    if missing_total > 0:
        print(f"\n{missing_total} images were not resized.")
        print("First 10 missing files:")
        for name in report['missing_files'][:10]:
            print(f"  - {name}")

    if wrong_size_total > 0:
        print(f"\n{wrong_size_total} images were not correctly resized to 224x224.")
        print("First 10 incorrect sizes:")
        for name, dimensions in report['incorrect_size'][:10]:
            print(f"  - {name}: {dimensions}")

    if missing_total == 0 and wrong_size_total == 0:
        print("\nAll images were successfully resized to 224x224!")

    print("\nNOTE: This process did not modify any image IDs or the label file.")
    print(f"Original labels file at {labels_file} remains unchanged.")


# Run the pipeline only when executed directly (or as a notebook cell)
if __name__ == "__main__":
    main()

Starting image resize process...
Source directory: E:\IRP_dataset_new\IRP_combined_processed_images
Target directory: E:\IRP_dataset_new\IRP_Final_Images
Found 10832 images to resize


Resizing images: 100%|██████████| 10832/10832 [43:18<00:00,  4.17it/s] 



Resize completed:
  Successfully resized: 10832 images
  Failed to resize: 0 images

Verifying results...
Source images: 10832
Resized images: 10832

All images were successfully resized to 224x224!

NOTE: This process did not modify any image IDs or the label file.
Original labels file at E:\IRP_dataset_new\APTOS_labels_combined.csv remains unchanged.


In [9]:
import os
import pandas as pd

# Paths
images_dir = r"E:\IRP_dataset_new\IRP_Final_Images"
labels_file = r"E:\IRP_dataset_new\APTOS_labels_combined.csv"

print("Checking if all image IDs in the labels file exist in the images directory...")

# Load the labels file
df = pd.read_csv(labels_file)
print(f"Labels file loaded with {len(df)} entries")

# Column of the labels file that holds the image ID
id_column = 'id_code'
print(f"Using '{id_column}' as the image ID column")

# Get image IDs from the labels file (cast to str so they compare equal to
# the filename stems collected below)
image_ids_in_labels = set(df[id_column].astype(str))
print(f"Found {len(image_ids_in_labels)} unique image IDs in the labels file")

# Get filenames from the images directory (top level only, no recursion)
image_ids_in_directory = set()
for filename in os.listdir(images_dir):
    if filename.lower().endswith(('.jpg', '.jpeg', '.png', '.bmp', '.tif', '.tiff')):
        # Extract the ID from the filename (without extension)
        image_id = os.path.splitext(filename)[0]
        image_ids_in_directory.add(image_id)

print(f"Found {len(image_ids_in_directory)} images in directory")

# Find missing images: IDs that are labelled but have no image file
missing_images = image_ids_in_labels - image_ids_in_directory

# Report results
if not missing_images:
    print("✓ All image IDs from the labels file exist in the images directory!")
else:
    print(f"✗ Found {len(missing_images)} missing images:")
    # Sorted so the sample shown is the same on every run
    for missing in sorted(missing_images)[:10]:
        print(f"  - {missing}")

    if len(missing_images) > 10:
        print(f"  ... and {len(missing_images) - 10} more")

# Calculate match percentage
total_ids = len(image_ids_in_labels)
matched_ids = total_ids - len(missing_images)
if total_ids:
    match_percentage = matched_ids / total_ids * 100
    print(f"Match rate: {match_percentage:.2f}% ({matched_ids}/{total_ids})")
else:
    # An empty labels file would otherwise raise ZeroDivisionError here
    match_percentage = None
    print("Match rate: n/a (the labels file contains no image IDs)")

Checking if all image IDs in the labels file exist in the images directory...
Labels file loaded with 10832 entries
Using 'id_code' as the image ID column
Found 10832 unique image IDs in the labels file
Found 10832 images in directory
✓ All image IDs from the labels file exist in the images directory!
Match rate: 100.00% (10832/10832)
