In [70]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import cv2
import glob
import shutil
from tqdm.notebook import tqdm  # Using notebook version for better display in Jupyter
import random
import sys

In [71]:
# Define paths
ORIGINAL_IMAGES_PATH = r"E:\IRP_dataset_new\APTOS_images"
ORIGINAL_LABELS_PATH = r"E:\IRP_dataset_new\APTOS_labels_cleaned.csv"
AUGMENTED_IMAGES_PATH = r"E:\IRP_dataset_new\augmented_images"
AUGMENTED_LABELS_PATH = r"E:\IRP_dataset_new\augmented_labels.csv"
COMBINED_IMAGES_PATH = r"E:\IRP_dataset_new\APTOS_combined_images"
COMBINED_LABELS_PATH = r"E:\IRP_dataset_new\APTOS_labels_combined.csv"

# Create output directories if they don't exist
os.makedirs(AUGMENTED_IMAGES_PATH, exist_ok=True)
os.makedirs(COMBINED_IMAGES_PATH, exist_ok=True)

In [72]:
# Read the original labels
df_original = pd.read_csv(ORIGINAL_LABELS_PATH)
print(f"Original dataset contains {len(df_original)} entries")
print(f"Columns in the CSV: {df_original.columns.tolist()}")
print(f"Class distribution: \n{df_original['diagnosis'].value_counts()}")

Original dataset contains 3653 entries
Columns in the CSV: ['id_code', 'diagnosis']
Class distribution: 
diagnosis
1    1853
0    1800
Name: count, dtype: int64


In [73]:
# Explore the image directory
print("\nExploring image directory...")
extensions = ['.png', '.jpg', '.jpeg', '.tif', '.tiff']  # Put .png first based on the user's example
available_images = []
for ext in extensions:
    files = glob.glob(os.path.join(ORIGINAL_IMAGES_PATH, f"*{ext}"))
    available_images.extend(files)

print(f"Found {len(available_images)} image files in {ORIGINAL_IMAGES_PATH}")


Exploring image directory...
Found 3653 image files in E:\IRP_dataset_new\APTOS_images


In [74]:
# Show some sample filenames
if available_images:
    print("Sample image filenames:")
    for img_path in available_images[:5]:
        print(f"  {os.path.basename(img_path)}")

Sample image filenames:
  000c1434d8d7.png
  001639a390f0.png
  0024cdab0c1e.png
  002c21358ce6.png
  005b95c28852.png


In [75]:
# Function to find the file by ID (trying different extensions)
def find_image_file(id_code, directory):
    """Find the image file with the given ID in the directory, trying different extensions"""
    for ext in extensions:
        file_path = os.path.join(directory, f"{id_code}{ext}")
        if os.path.exists(file_path):
            return file_path, ext
    return None, None

In [None]:
# Function to perform simple image augmentation using OpenCV
def augment_image(image, augmentation_type):
    """
    Perform image augmentation using OpenCV
    
    Parameters:
    - image: Input image to augment
    - augmentation_type: Integer specifying the type of augmentation
      1: Rotation + horizontal flip
      2: Translation + brightness + slight zoom
    
    Returns:
    - Augmented image
    """
    result = image.copy()
    
    if augmentation_type == 1:  # Rotation + horizontal flip
        # Rotation (more randomized angle)
        angle = random.uniform(-25, 25)
        height, width = result.shape[:2]
        center = (width/2, height/2)
        rotation_matrix = cv2.getRotationMatrix2D(center, angle, 1.0)
        result = cv2.warpAffine(result, rotation_matrix, (width, height), 
                              flags=cv2.INTER_LINEAR, borderMode=cv2.BORDER_REFLECT)
        
        # Horizontal flip (preserves anatomical correctness)
        if random.random() > 0.5:
            result = cv2.flip(result, 1)
            
        # Minor brightness adjustment for added uniqueness
        brightness = random.uniform(0.9, 1.1)
        result = cv2.convertScaleAbs(result, alpha=brightness, beta=0)
    
    elif augmentation_type == 2:  # Translation + brightness + slight zoom
        # Translation (more randomized shift)
        height, width = result.shape[:2]
        tx = random.uniform(-0.15, 0.15) * width
        ty = random.uniform(-0.15, 0.15) * height
        translation_matrix = np.float32([[1, 0, tx], [0, 1, ty]])
        result = cv2.warpAffine(result, translation_matrix, (width, height), 
                              borderMode=cv2.BORDER_REFLECT)
        
        # Brightness adjustment (more randomized)
        brightness = random.uniform(0.7, 1.3)
        result = cv2.convertScaleAbs(result, alpha=brightness, beta=0)
        
        # Add slight zoom effect for uniqueness
        zoom_factor = random.uniform(0.9, 1.1)
        height, width = result.shape[:2]
        center_x, center_y = width // 2, height // 2
        new_width, new_height = int(width * zoom_factor), int(height * zoom_factor)
        
        # Calculate crop coordinates
        x1 = max(0, center_x - new_width // 2)
        y1 = max(0, center_y - new_height // 2)
        x2 = min(width, center_x + new_width // 2)
        y2 = min(height, center_y + new_height // 2)
        
        # Crop and resize
        cropped = result[y1:y2, x1:x2]
        result = cv2.resize(cropped, (width, height), interpolation=cv2.INTER_LINEAR)
    
    return result

# Generate augmented images
augmented_rows = []
print("\nGenerating augmented images...")

# Check extension of first few images to determine the most common extension
extension_counts = {}
for img_path in available_images[:20]:
    ext = os.path.splitext(img_path)[1].lower()
    extension_counts[ext] = extension_counts.get(ext, 0) + 1

# Use the most common extension for saving augmented images
if extension_counts:
    most_common_ext = max(extension_counts.items(), key=lambda x: x[1])[0]
    print(f"Most common extension is {most_common_ext}, using this for augmented images")
else:
    most_common_ext = '.png'  # Default to png based on the user's example
    print(f"Using default extension {most_common_ext} for augmented images")

for index, row in tqdm(df_original.iterrows(), total=len(df_original)):
    id_code = row['id_code']
    diagnosis = row['diagnosis']
    
    # Find the image file
    original_img_path, detected_ext = find_image_file(id_code, ORIGINAL_IMAGES_PATH)
    
    if original_img_path is None:
        print(f"Warning: Original image not found for {id_code}")
        continue
    
    # Read the image
    img = cv2.imread(original_img_path)
    if img is None:
        print(f"Warning: Could not read image for {id_code} at {original_img_path}")
        continue
    
    # Generate 2 distinctly different augmented versions
    for aug_num in range(1, 3):
        # Create new ID for the augmented image
        new_id_code = f"{id_code}_aug_{aug_num}"
        new_img_path = os.path.join(AUGMENTED_IMAGES_PATH, f"{new_id_code}{most_common_ext}")
        
        # Apply augmentation (use aug_num to determine augmentation type)
        aug_img = augment_image(img, aug_num)
        
        # Save the augmented image
        if cv2.imwrite(new_img_path, aug_img):
            # Add to augmented labels
            augmented_rows.append({
                'id_code': new_id_code,
                'diagnosis': diagnosis
            })
        else:
            print(f"Warning: Failed to save augmented image for {id_code} at {new_img_path}")

# Create DataFrame for augmented labels
df_augmented = pd.DataFrame(augmented_rows)
print(f"\nGenerated {len(df_augmented)} augmented images")
if len(df_augmented) > 0:
    print(f"Augmented class distribution: \n{df_augmented['diagnosis'].value_counts()}")
    
    # Save augmented labels
    df_augmented.to_csv(AUGMENTED_LABELS_PATH, index=False)
    print(f"Saved augmented labels to {AUGMENTED_LABELS_PATH}")
else:
    print("WARNING: No augmented images were created!")

In [None]:
# Combine original and augmented datasets
# 1. Copy original images to combined directory
print("\nCopying original images to combined directory...")
for index, row in tqdm(df_original.iterrows(), total=len(df_original)):
    id_code = row['id_code']
    original_img_path, detected_ext = find_image_file(id_code, ORIGINAL_IMAGES_PATH)
    
    if original_img_path is not None:
        dst = os.path.join(COMBINED_IMAGES_PATH, f"{id_code}{detected_ext}")
        shutil.copy2(original_img_path, dst)

# 2. Copy augmented images to combined directory
print("Copying augmented images to combined directory...")
for index, row in tqdm(df_augmented.iterrows(), total=len(df_augmented)):
    id_code = row['id_code']
    src = os.path.join(AUGMENTED_IMAGES_PATH, f"{id_code}{most_common_ext}")
    if os.path.exists(src):
        dst = os.path.join(COMBINED_IMAGES_PATH, f"{id_code}{most_common_ext}")
        shutil.copy2(src, dst)

# 3. Combine labels
df_combined = pd.concat([df_original, df_augmented], ignore_index=True)
print(f"\nCombined dataset contains {len(df_combined)} entries")
print(f"Combined class distribution: \n{df_combined['diagnosis'].value_counts()}")

# Calculate the expansion factor
if len(df_original) > 0:
    expansion_factor = len(df_combined) / len(df_original)
    print(f"Dataset has been expanded by a factor of {expansion_factor:.2f}x")
    
    # Verify we have the expected 3x expansion
    if abs(expansion_factor - 3.0) < 0.1:
        print("Successfully created a dataset 3x the original size!")
    else:
        print(f"WARNING: Dataset expansion factor ({expansion_factor:.2f}x) differs from the expected 3x")

# Save combined labels
df_combined.to_csv(COMBINED_LABELS_PATH, index=False)
print(f"Saved combined labels to {COMBINED_LABELS_PATH}")

In [None]:
# Verify data consistency
print("\nVerifying data consistency...")
image_files = set()
for ext in extensions:
    files = glob.glob(os.path.join(COMBINED_IMAGES_PATH, f"*{ext}"))
    image_files.update([os.path.splitext(os.path.basename(f))[0] for f in files])

label_ids = set(df_combined['id_code'])

# Find inconsistencies
missing_images = label_ids - image_files
orphaned_images = image_files - label_ids

print(f"Found {len(missing_images)} labels without corresponding images")
print(f"Found {len(orphaned_images)} images without corresponding labels")

In [None]:
# Clean up the dataset if necessary
if missing_images or orphaned_images:
    print("Cleaning up inconsistencies...")
    
    # Remove labels without images
    if missing_images:
        df_combined = df_combined[~df_combined['id_code'].isin(missing_images)]
    
    # Remove images without labels
    if orphaned_images:
        for img_id in orphaned_images:
            # Try to find and remove the image file with any extension
            found = False
            for ext in extensions:
                img_path = os.path.join(COMBINED_IMAGES_PATH, f"{img_id}{ext}")
                if os.path.exists(img_path):
                    os.remove(img_path)
                    found = True
                    break
            
            if not found:
                print(f"Warning: Could not find image file for orphaned ID {img_id}")
    
    # Save the cleaned dataset
    df_combined.to_csv(COMBINED_LABELS_PATH, index=False)
    print(f"Saved cleaned combined labels to {COMBINED_LABELS_PATH}")

print(f"\nFinal dataset contains {len(df_combined)} entries")
print(f"Final class distribution: \n{df_combined['diagnosis'].value_counts()}")

In [None]:
# Display 5 samples from each class
def display_samples(samples, title):
    plt.figure(figsize=(15, 10))
    for i, (index, row) in enumerate(samples.iterrows(), 1):
        id_code = row['id_code']
        
        # Try to find the image with any extension
        img_path = None
        for ext in extensions:
            temp_path = os.path.join(COMBINED_IMAGES_PATH, f"{id_code}{ext}")
            if os.path.exists(temp_path):
                img_path = temp_path
                break
        
        if img_path is not None:
            img = cv2.imread(img_path)
            if img is not None:
                # Convert BGR to RGB for proper display
                img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
                
                plt.subplot(1, 5, i)
                plt.imshow(img)
                plt.title(f"ID: {id_code}\nClass: {row['diagnosis']}")
                plt.axis('off')
            else:
                print(f"Warning: Could not read image {img_path}")
        else:
            print(f"Warning: Image not found for {id_code}")
    
    plt.suptitle(title, fontsize=16)
    plt.tight_layout()
    plt.show()

In [None]:
import hashlib
from collections import defaultdict

In [None]:
# Define paths
COMBINED_IMAGES_PATH = r"E:\IRP_dataset_new\APTOS_combined_images"
COMBINED_LABELS_PATH = r"E:\IRP_dataset_new\APTOS_labels_combined.csv"

In [None]:
# Extensions to check
extensions = ['.png', '.jpg', '.jpeg', '.tif', '.tiff']

In [None]:
# Function to compute the hash of an image file
def get_file_hash(file_path):
    """Calculate SHA-256 hash of a file"""
    try:
        with open(file_path, 'rb') as f:
            file_hash = hashlib.sha256(f.read()).hexdigest()
        return file_hash
    except Exception as e:
        print(f"Error calculating hash for {file_path}: {str(e)}")
        return None

# Function to verify image can be opened and is valid
def verify_image(file_path):
    """Check if an image file can be opened and is valid"""
    try:
        img = cv2.imread(file_path)
        if img is None:
            return False, "Cannot be read by OpenCV"
        
        # Check for completely black or white images
        if img.size == 0:
            return False, "Empty image"
        
        # Check if image is all black or all white
        if np.mean(img) < 1 or np.mean(img) > 254:
            return False, "Image is all black or all white"
        
        # Additional check: make sure image is not too small
        h, w = img.shape[:2]
        if h < 10 or w < 10:
            return False, f"Image too small ({w}x{h})"
            
        return True, None
    except Exception as e:
        return False, str(e)

print("\n1. Reading labels file...")
try:
    df_labels = pd.read_csv(COMBINED_LABELS_PATH)
    print(f"Found {len(df_labels)} entries in the labels file")
    print(f"Columns: {df_labels.columns.tolist()}")
    
    # Check for required columns
    if 'id_code' not in df_labels.columns:
        print("ERROR: 'id_code' column not found in labels file")
        id_column = df_labels.columns[0]  # Assume first column is ID
        print(f"Using '{id_column}' as ID column")
    else:
        id_column = 'id_code'
        
    if 'diagnosis' not in df_labels.columns:
        print("ERROR: 'diagnosis' column not found in labels file")
        
    # Check for duplicate IDs in labels
    duplicate_ids = df_labels[df_labels.duplicated(subset=[id_column], keep=False)]
    if len(duplicate_ids) > 0:
        print(f"WARNING: Found {len(duplicate_ids)} duplicate IDs in labels file")
        print("First few duplicates:")
        print(duplicate_ids.head())
    else:
        print("No duplicate IDs found in labels file")
        
except Exception as e:
    print(f"ERROR reading labels file: {str(e)}")
    raise

print("\n2. Scanning image directory...")
# Get all image files
all_image_files = []
for ext in extensions:
    files = glob.glob(os.path.join(COMBINED_IMAGES_PATH, f"*{ext}"))
    all_image_files.extend(files)

print(f"Found {len(all_image_files)} image files")

# Extract IDs from filenames
image_ids = []
for img_path in all_image_files:
    img_id = os.path.splitext(os.path.basename(img_path))[0]
    image_ids.append((img_id, img_path))

# Check if all image IDs from labels exist in the directory
print("\n3. Checking for missing images...")
label_ids = set(df_labels[id_column])
found_image_ids = {img_id for img_id, _ in image_ids}

missing_images = label_ids - found_image_ids
if missing_images:
    print(f"WARNING: Found {len(missing_images)} IDs in labels without corresponding images")
    print("First few missing IDs:")
    print(list(missing_images)[:5])
else:
    print("No missing images found")

# Check if all image files have corresponding labels
orphaned_images = found_image_ids - label_ids
if orphaned_images:
    print(f"WARNING: Found {len(orphaned_images)} images without corresponding labels")
    print("First few orphaned images:")
    print(list(orphaned_images)[:5])
else:
    print("No orphaned images found")
    
# Check for invalid or corrupted images
print("\n4. Checking for invalid or corrupted images...")
invalid_images = []

print("Processing images to check validity...")
for img_id, img_path in tqdm(image_ids):
    is_valid, error_msg = verify_image(img_path)
    if not is_valid:
        invalid_images.append((img_id, img_path, error_msg))

if invalid_images:
    print(f"WARNING: Found {len(invalid_images)} invalid or corrupted images")
    print("First few invalid images:")
    for img_id, img_path, error_msg in invalid_images[:5]:
        print(f"  {img_id}: {error_msg}")
else:
    print("No invalid images found")

# Check for duplicate images (same content but different filenames)
print("\n5. Checking for duplicate image content...")
print("Computing image hashes (this may take some time)...")

# Compute hashes for all files
file_hashes = {}
hash_to_files = defaultdict(list)

for img_id, img_path in tqdm(image_ids):
    file_hash = get_file_hash(img_path)
    if file_hash:
        file_hashes[img_path] = file_hash
        hash_to_files[file_hash].append((img_id, img_path))

# Find duplicates (files with the same hash)
duplicates = {h: files for h, files in hash_to_files.items() if len(files) > 1}

if duplicates:
    print(f"WARNING: Found {len(duplicates)} sets of duplicate images (same content, different names)")
    print("First few duplicate sets:")
    for i, (file_hash, files) in enumerate(list(duplicates.items())[:3]):
        print(f"Duplicate set {i+1}:")
        for img_id, img_path in files:
            print(f"  {img_id}: {img_path}")
else:
    print("No duplicate image content found")

In [None]:
# Identifies and removes duplicate images while preserving dataset integrity

import os
import pandas as pd
import hashlib
import glob
from tqdm.notebook import tqdm
from collections import defaultdict
import shutil

# Define paths
COMBINED_IMAGES_PATH = r"E:\IRP_dataset_new\APTOS_combined_images"
COMBINED_LABELS_PATH = r"E:\IRP_dataset_new\APTOS_labels_combined.csv"
BACKUP_DIR = r"E:\IRP_dataset_new\duplicates_backup"  # Directory to back up removed files

# Extensions to check
extensions = ['.png', '.jpg', '.jpeg', '.tif', '.tiff']

print("APTOS Dataset Duplicate Removal")
print("=" * 50)

# Create backup directory if it doesn't exist
os.makedirs(BACKUP_DIR, exist_ok=True)
print(f"Created backup directory at {BACKUP_DIR}")

# Function to compute the hash of an image file
def get_file_hash(file_path):
    """Calculate SHA-256 hash of a file"""
    try:
        with open(file_path, 'rb') as f:
            file_hash = hashlib.sha256(f.read()).hexdigest()
        return file_hash
    except Exception as e:
        print(f"Error calculating hash for {file_path}: {str(e)}")
        return None

# Read the labels file
print("\nReading labels file...")
df_labels = pd.read_csv(COMBINED_LABELS_PATH)
print(f"Loaded {len(df_labels)} entries from labels file")

# Get all image files
print("\nScanning for image files...")
all_image_files = []
for ext in extensions:
    files = glob.glob(os.path.join(COMBINED_IMAGES_PATH, f"*{ext}"))
    all_image_files.extend(files)

print(f"Found {len(all_image_files)} image files")

# Compute hashes for all files
print("\nComputing file hashes to identify duplicates...")
file_hashes = {}
hash_to_files = defaultdict(list)

for img_path in tqdm(all_image_files):
    file_hash = get_file_hash(img_path)
    if file_hash:
        file_hashes[img_path] = file_hash
        hash_to_files[file_hash].append(img_path)

# Find duplicates (files with the same hash)
duplicates = {h: files for h, files in hash_to_files.items() if len(files) > 1}

print(f"\nFound {len(duplicates)} sets of duplicate images")

if duplicates:
    print("\nRemoving duplicates while preserving one copy of each image...")
    
    # Track files to be removed
    files_to_remove = []
    files_to_keep = []
    id_mapping = {}  # Map from removed ID to kept ID
    
    # Process each set of duplicates
    for hash_val, duplicate_files in tqdm(duplicates.items()):
        # Sort files so that we keep original images (without "_aug_") if possible
        sorted_files = sorted(duplicate_files, 
                              key=lambda x: "_aug_" in os.path.basename(x))
        
        # Keep the first file (preferentially an original, non-augmented image)
        file_to_keep = sorted_files[0]
        files_to_keep.append(file_to_keep)
        
        keep_id = os.path.splitext(os.path.basename(file_to_keep))[0]
        
        # Mark the rest for removal
        for file_to_remove in sorted_files[1:]:
            files_to_remove.append(file_to_remove)
            
            # Create ID mapping for label updates
            remove_id = os.path.splitext(os.path.basename(file_to_remove))[0]
            id_mapping[remove_id] = keep_id
    
    # Backup and remove duplicate files
    print(f"\nBacking up and removing {len(files_to_remove)} duplicate files...")
    
    for file_path in tqdm(files_to_remove):
        filename = os.path.basename(file_path)
        backup_path = os.path.join(BACKUP_DIR, filename)
        
        # Backup the file
        shutil.copy2(file_path, backup_path)
        
        # Remove the file
        os.remove(file_path)
    
    # Update the labels file
    print("\nUpdating labels file to reflect removed duplicates...")
    
    # Map to track if we've already updated a label to point to a particular ID
    updated_ids = set()
    rows_to_drop = []
    
    # First pass: update IDs that need mapping
    for i, row in tqdm(df_labels.iterrows(), total=len(df_labels)):
        id_code = row['id_code']
        
        # If this ID was removed, update it to point to the kept ID
        if id_code in id_mapping:
            # If we haven't already updated a row to use this kept ID
            if id_mapping[id_code] not in updated_ids:
                df_labels.at[i, 'id_code'] = id_mapping[id_code]
                updated_ids.add(id_mapping[id_code])
            else:
                # Mark this row for removal since we already have a row pointing to the kept ID
                rows_to_drop.append(i)
    
    # Drop duplicate rows
    if rows_to_drop:
        print(f"Removing {len(rows_to_drop)} redundant label entries...")
        df_labels = df_labels.drop(rows_to_drop)
    
    # Save the updated labels file
    backup_labels_path = os.path.join(BACKUP_DIR, "labels_backup.csv")
    shutil.copy2(COMBINED_LABELS_PATH, backup_labels_path)
    
    df_labels.to_csv(COMBINED_LABELS_PATH, index=False)
    print(f"Updated labels file saved to {COMBINED_LABELS_PATH}")
    
    # Verify the results
    print("\nVerifying results...")
    print(f"Original image count: {len(all_image_files)}")
    
    # Count remaining images
    remaining_images = []
    for ext in extensions:
        files = glob.glob(os.path.join(COMBINED_IMAGES_PATH, f"*{ext}"))
        remaining_images.extend(files)
    
    print(f"Remaining image count: {len(remaining_images)}")
    print(f"Removed duplicate count: {len(files_to_remove)}")
    print(f"Updated label count: {len(df_labels)}")
    
    print("\nDuplicate removal process completed successfully!")
    print(f"Removed {len(files_to_remove)} duplicate images")
    print(f"Updated labels file now contains {len(df_labels)} entries")
    print(f"Backup of all removed files saved to {BACKUP_DIR}")
else:
    print("No duplicates found. No changes were made.")

# Final verification to ensure all labels have corresponding images
print("\nPerforming final verification...")
missing_images = 0

for id_code in tqdm(df_labels['id_code']):
    found = False
    for ext in extensions:
        img_path = os.path.join(COMBINED_IMAGES_PATH, f"{id_code}{ext}")
        if os.path.exists(img_path):
            found = True
            break
    
    if not found:
        missing_images += 1

if missing_images > 0:
    print(f"WARNING: Found {missing_images} label entries without corresponding images")
    print("You may need to run the data validation script again to clean these up")
else:
    print("All label entries have corresponding images. Dataset is clean!")

print("\nDuplicate removal process complete!")

In [None]:
# APTOS Dataset Simple Excel Cleanup
# Removes entries from Excel file that don't have corresponding images

import os
import pandas as pd
import glob
import shutil
from tqdm.notebook import tqdm

# Define paths
COMBINED_IMAGES_PATH = r"E:\IRP_dataset_new\APTOS_combined_images"
COMBINED_LABELS_PATH = r"E:\IRP_dataset_new\APTOS_labels_combined.csv"
BACKUP_DIR = r"E:\IRP_dataset_new\backups"  # Directory for backups

# Extensions to check
extensions = ['.png', '.jpg', '.jpeg', '.tif', '.tiff']

print("APTOS Dataset Simple Excel Cleanup")
print("=" * 50)

# Create backup directory if it doesn't exist
os.makedirs(BACKUP_DIR, exist_ok=True)

# Backup the original labels file
backup_filename = "APTOS_labels_combined_backup.csv"
backup_path = os.path.join(BACKUP_DIR, backup_filename)
shutil.copy2(COMBINED_LABELS_PATH, backup_path)
print(f"Created backup of labels file at {backup_path}")

# Read the labels file
print("\nReading labels file...")
df_labels = pd.read_csv(COMBINED_LABELS_PATH)
original_count = len(df_labels)
print(f"Loaded {original_count} entries from labels file")

# Get all image filenames without extensions
print("\nCollecting all image filenames...")
image_ids = set()
for ext in extensions:
    pattern = os.path.join(COMBINED_IMAGES_PATH, f"*{ext}")
    for img_path in glob.glob(pattern):
        filename = os.path.basename(img_path)
        image_id = os.path.splitext(filename)[0]  # Remove extension
        image_ids.add(image_id)

print(f"Found {len(image_ids)} unique image IDs")

# Check each Excel entry and mark those with missing images
print("\nChecking each Excel entry for corresponding image...")
missing_image_mask = ~df_labels['id_code'].isin(image_ids)
missing_entries = df_labels[missing_image_mask]

print(f"Found {len(missing_entries)} entries without corresponding images")

# Remove entries with missing images
if len(missing_entries) > 0:
    # Save the list of entries to be removed
    missing_entries_path = os.path.join(BACKUP_DIR, "removed_entries.csv")
    missing_entries.to_csv(missing_entries_path, index=False)
    print(f"Saved list of removed entries to {missing_entries_path}")
    
    # Create updated DataFrame with entries removed
    df_updated = df_labels[~missing_image_mask].copy()
    
    # Save the updated Excel file
    df_updated.to_csv(COMBINED_LABELS_PATH, index=False)
    print(f"Updated labels file saved to {COMBINED_LABELS_PATH}")
    
    # Print results
    print("\nCleanup Results:")
    print(f"Original entry count: {original_count}")
    print(f"Entries removed: {len(missing_entries)}")
    print(f"Remaining entries: {len(df_updated)}")
    
    # Print class distribution before and after if 'diagnosis' column exists
    if 'diagnosis' in df_labels.columns:
        print("\nClass distribution before cleanup:")
        print(df_labels['diagnosis'].value_counts().to_dict())
        
        print("\nClass distribution after cleanup:")
        print(df_updated['diagnosis'].value_counts().to_dict())
else:
    print("No entries need to be removed. All Excel entries have corresponding images.")

print("\nExcel cleanup complete!")

In [77]:
# Simple script to remove duplicate entries from the CSV file
import pandas as pd
import shutil
import os

# Define file paths
csv_path = r"E:\IRP_dataset_new\APTOS_labels_combined.csv"
backup_path = r"E:\IRP_dataset_new\APTOS_labels_combined_backup.csv"

# Create a backup of the original file
print(f"Creating backup at {backup_path}")
shutil.copy2(csv_path, backup_path)

# Read the CSV file
print(f"Reading CSV file from {csv_path}")
df = pd.read_csv(csv_path)
original_count = len(df)
print(f"Original file contains {original_count} entries")

# Identify duplicates
duplicate_mask = df.duplicated(subset=['id_code'], keep=False)
duplicate_entries = df[duplicate_mask]
unique_duplicated_ids = duplicate_entries['id_code'].nunique()

print(f"Found {len(duplicate_entries)} duplicate entries")
print(f"These represent {unique_duplicated_ids} unique ID codes that appear multiple times")

# Remove duplicates, keeping the first occurrence
df_cleaned = df.drop_duplicates(subset=['id_code'], keep='first')
removed_count = original_count - len(df_cleaned)

print(f"Removed {removed_count} duplicate entries")
print(f"Cleaned file contains {len(df_cleaned)} entries")

# Save the cleaned dataframe back to the CSV file
print(f"Saving cleaned file to {csv_path}")
df_cleaned.to_csv(csv_path, index=False)

print("Duplicate removal complete!")
print(f"The original file with duplicates is backed up at: {backup_path}")

Creating backup at E:\IRP_dataset_new\APTOS_labels_combined_backup.csv
Reading CSV file from E:\IRP_dataset_new\APTOS_labels_combined.csv
Original file contains 10954 entries
Found 244 duplicate entries
These represent 122 unique ID codes that appear multiple times
Removed 122 duplicate entries
Cleaned file contains 10832 entries
Saving cleaned file to E:\IRP_dataset_new\APTOS_labels_combined.csv
Duplicate removal complete!
The original file with duplicates is backed up at: E:\IRP_dataset_new\APTOS_labels_combined_backup.csv


In [78]:
# Simple script to verify that Excel IDs match image folder contents
import pandas as pd
import glob
import os

def verify_dataset_consistency():
    # Define file paths
    excel_path = r"E:\IRP_dataset_new\APTOS_labels_combined.csv"
    images_path = r"E:\IRP_dataset_new\APTOS_combined_images"
    
    # Extensions to check
    extensions = ['.png', '.jpg', '.jpeg', '.tif', '.tiff']
    
    print("Verifying dataset consistency...")
    
    # Read the Excel file
    df = pd.read_csv(excel_path)
    excel_count = len(df)
    unique_ids_count = df['id_code'].nunique()
    print(f"Excel file contains {excel_count} entries")
    print(f"Number of unique ID codes in Excel: {unique_ids_count}")
    
    # Count image files
    image_files = []
    for ext in extensions:
        files = glob.glob(os.path.join(images_path, f"*{ext}"))
        image_files.extend(files)
    
    image_count = len(image_files)
    print(f"Image folder contains {image_count} files")
    
    # Check for match
    if excel_count == image_count and excel_count == unique_ids_count:
        print("\n✓ SUCCESS: Excel file and image folder are perfectly synchronized!")
        print(f"Both contain exactly {excel_count} items.")
    else:
        print("\n⚠ WARNING: Excel file and image folder counts don't match.")
        print(f"Excel entries: {excel_count}")
        print(f"Unique ID codes: {unique_ids_count}")
        print(f"Image files: {image_count}")
    
    return {
        'excel_count': excel_count,
        'unique_ids_count': unique_ids_count,
        'image_count': image_count,
        'is_synchronized': excel_count == image_count and excel_count == unique_ids_count
    }

# Execute the function
if __name__ == "__main__":
    verify_dataset_consistency()
    
#122 duplicates were found after augmenting

Verifying dataset consistency...
Excel file contains 10832 entries
Number of unique ID codes in Excel: 10832
Image folder contains 10832 files

✓ SUCCESS: Excel file and image folder are perfectly synchronized!
Both contain exactly 10832 items.


In [79]:
# Simple script to check if Excel IDs exist in image folder
import os
import pandas as pd
import glob

# Define paths
COMBINED_IMAGES_PATH = r"E:\IRP_dataset_new\APTOS_combined_images"
COMBINED_LABELS_PATH = r"E:\IRP_dataset_new\APTOS_labels_combined.csv"

# Extensions to check
extensions = ['.png', '.jpg', '.jpeg', '.tif', '.tiff']

print("Simple ID Verification")
print("=" * 30)

# Read the Excel file
print("Reading Excel file...")
df = pd.read_csv(COMBINED_LABELS_PATH)
excel_ids = df['id_code'].tolist()
print(f"Found {len(excel_ids)} IDs in Excel file")

# Get all image files in the directory
print("Scanning image directory...")
all_images = []
for ext in extensions:
    images = glob.glob(os.path.join(COMBINED_IMAGES_PATH, f"*{ext}"))
    all_images.extend(images)

# Extract IDs from image filenames
image_ids = [os.path.splitext(os.path.basename(img))[0] for img in all_images]
print(f"Found {len(image_ids)} images in directory")

# Find IDs in Excel that don't have images
missing_images = []
for excel_id in excel_ids:
    if excel_id not in image_ids:
        missing_images.append(excel_id)

print(f"\nFound {len(missing_images)} IDs in Excel without corresponding images")
if missing_images:
    print("First 10 missing IDs:")
    for missing_id in missing_images[:10]:
        print(f"  {missing_id}")
    
    # Option to save missing IDs to file
    print("\nDo you want to save the complete list of missing IDs to a file? (yes/no)")
    save_option = input().strip().lower()
    if save_option == 'yes':
        output_path = r"E:\IRP_dataset_new\missing_ids.csv"
        pd.DataFrame({'id_code': missing_images}).to_csv(output_path, index=False)
        print(f"Saved missing IDs to {output_path}")

# Find images that don't have IDs in Excel
extra_images = []
for image_id in image_ids:
    if image_id not in excel_ids:
        extra_images.append(image_id)

print(f"\nFound {len(extra_images)} images without corresponding IDs in Excel")
if extra_images:
    print("First 10 extra images:")
    for extra_id in extra_images[:10]:
        print(f"  {extra_id}")
    
    # Option to save extra image IDs to file
    print("\nDo you want to save the complete list of extra image IDs to a file? (yes/no)")
    save_option = input().strip().lower()
    if save_option == 'yes':
        output_path = r"E:\IRP_dataset_new\extra_images.csv"
        pd.DataFrame({'id_code': extra_images}).to_csv(output_path, index=False)
        print(f"Saved extra image IDs to {output_path}")

print("\nVerification complete!")

Simple ID Verification
Reading Excel file...
Found 10832 IDs in Excel file
Scanning image directory...
Found 10832 images in directory

Found 0 IDs in Excel without corresponding images

Found 0 images without corresponding IDs in Excel

Verification complete!
