In [2]:
import os
import shutil
from PIL import Image
import random

# Root of the dataset and its two target class folders
base_path = "./dataset_binary"
photo_folder, no_photos_folder = (
    os.path.join(base_path, class_name) for class_name in ("Photo", "No_photos")
)

# Non-photo categories whose images are merged into No_photos
other_folders = [
    os.path.join(base_path, category)
    for category in ("Painting", "Schematics", "Sketch", "Text")
]

# Start every run from a fresh, empty No_photos folder:
# any previous content is deleted before the folder is recreated.
if os.path.exists(no_photos_folder):
    shutil.rmtree(no_photos_folder)
os.makedirs(no_photos_folder)

# Function to check if a file is corrupted (image verification only)
def is_file_corrupted(filepath):
    """Return True if the file cannot be opened and verified as an image.

    The file is opened with ``Image.open`` and checked with ``img.verify()``.
    Any ordinary exception raised by either call (missing file, unreadable
    file, unrecognised or broken image data, ...) marks the file as corrupted.

    Args:
        filepath: Path of the file to check.

    Returns:
        False if the file opened and verified without error, True otherwise.
    """
    try:
        with Image.open(filepath) as img:
            img.verify()
    except Exception:
        # Deliberately broad: every failure to open/verify counts as
        # "corrupted". Unlike a bare ``except:``, this lets KeyboardInterrupt
        # and SystemExit propagate so a long scan can still be interrupted.
        return True
    return False

# Collect the files that pass image verification in each source folder.
# Corrupted files are only skipped here; nothing is deleted from the sources.
valid_files = {}
for folder in other_folders:
    all_files = [
        os.path.join(folder, name)
        for name in os.listdir(folder)
        if os.path.isfile(os.path.join(folder, name))
    ]
    valid_files[folder] = [path for path in all_files if not is_file_corrupted(path)]

# Number of files in the Photo folder: the target size for No_photos
photo_files = [
    name for name in os.listdir(photo_folder)
    if os.path.isfile(os.path.join(photo_folder, name))
]
n_photo_files = len(photo_files)

# Total valid files across the source folders
total_valid = sum(len(files) for files in valid_files.values())

if total_valid == 0:
    # Without this guard the ratio below would raise ZeroDivisionError.
    print("⚠️ No valid files found in the source folders; No_photos was left empty.")
else:
    # Each folder contributes a share of the target proportional to its number
    # of valid files. Shares are rounded per folder, so the final total can
    # differ slightly from n_photo_files.
    for folder, files in valid_files.items():
        ratio = len(files) / total_valid
        n_to_copy = round(ratio * n_photo_files)
        # A folder can never contribute more files than it actually holds.
        selected_files = random.sample(files, min(n_to_copy, len(files)))

        source_name = os.path.basename(os.path.normpath(folder))
        for file_path in selected_files:
            file_name = os.path.basename(file_path)
            destination = os.path.join(no_photos_folder, file_name)
            if os.path.exists(destination):
                # Another source folder already supplied a file with this
                # name; prefix the source folder name instead of silently
                # overwriting the earlier copy.
                destination = os.path.join(
                    no_photos_folder, f"{source_name}_{file_name}"
                )
            shutil.copy(file_path, destination)

    print("✅ No_photos folder created with matching proportions and cleaned of corrupted files.")


✅ No_photos folder created with matching proportions and cleaned of corrupted files.
