### Training and Validation Split

In [None]:
import os
import shutil
import random

# Define paths.
# Raw strings keep the Windows backslashes literal: sequences such as "\A",
# "\H" or "\C" inside a normal string are invalid escapes that newer Python
# versions warn about. The resulting path values are unchanged.
source_dir = r'D:\Ashutosh\Herbs\Cleanede_Data\All700'
validation_dir = r'D:\Ashutosh\Herbs\Cleanede_Data\Val'
training_dir = r'D:\Ashutosh\Herbs\Cleanede_Data\Train'

# Define split ratios
validation_split = 0.2  # 20% for validation
training_split = 1 - validation_split  # 80% for training (informational; the
# training set below is simply "everything that is not validation")

# Ensure target directories exist
os.makedirs(validation_dir, exist_ok=True)
os.makedirs(training_dir, exist_ok=True)

# Collect every non-hidden file found anywhere below source_dir
all_files = []
for subdir, _, files in os.walk(source_dir):
    for file_name in files:
        if file_name.startswith('.'):  # Ignore hidden files
            continue
        all_files.append(os.path.join(subdir, file_name))

total_files = len(all_files)
print("Total files collected:", total_files)

# Shuffle in place so the split is random. No seed is set here, so each run
# produces a different split.
random.shuffle(all_files)

# The first `validation_count` shuffled files become the validation set
# (rounded down); the rest become the training set.
validation_count = int(total_files * validation_split)
validation_files = all_files[:validation_count]
training_files = all_files[validation_count:]

# Function to move files while preserving directory structure
def move_files(file_list, target_dir, base_dir=None):
    """Move files into ``target_dir`` while keeping their relative layout.

    Each file's path relative to ``base_dir`` is recreated underneath
    ``target_dir`` (intermediate directories are created as needed) and the
    file is then moved there with ``shutil.move``.

    Args:
        file_list: Iterable of paths of the files to move.
        target_dir: Directory under which the relative structure is rebuilt.
        base_dir: Root against which relative paths are computed. When
            omitted, the module-level ``source_dir`` is used, which matches
            the previous behavior.
    """
    if base_dir is None:
        # Resolved at call time so the existing two-argument calls keep working.
        base_dir = source_dir

    for file_path in file_list:
        # Create the same directory structure in the target directory
        rel_path = os.path.relpath(file_path, base_dir)
        target_path = os.path.join(target_dir, rel_path)
        os.makedirs(os.path.dirname(target_path), exist_ok=True)
        shutil.move(file_path, target_path)

# Move files to their respective directories
move_files(validation_files, validation_dir)
move_files(training_files, training_dir)

print(f"Moved {len(validation_files)} files to validation.")
print(f"Moved {len(training_files)} files to training.")

# Count what is actually left under the source directory instead of reporting
# a hard-coded zero; hidden files are skipped during collection and therefore
# may still be present.
remaining_count = sum(len(names) for _, _, names in os.walk(source_dir))
if remaining_count == 0:
    print("Remaining files: 0 (All files have been moved)")
else:
    print(f"Remaining files: {remaining_count} (still present in {source_dir})")

### Check and remove corrupted files

In [None]:
from PIL import Image
import os

def remove_corrupted_images(directory):
    """Delete every file under ``directory`` that PIL cannot open and verify.

    The tree is walked twice: once to count the files, then once to check
    them. A file is deleted when ``Image.open`` or ``Image.verify`` raises
    ``IOError`` or ``SyntaxError``. Progress and a final summary are printed.

    NOTE(review): every file in the tree is checked, not only files with image
    extensions, so non-image files that fail to open are presumably deleted
    too. Confirm the directory contains images only.

    Args:
        directory: Root directory that is scanned recursively.
    """
    # First pass to count total files
    total_files = sum(len(files) for _, _, files in os.walk(directory))
    print(f"Total files to check: {total_files}")

    checked_files = 0
    corrupted_files = 0

    # Second pass to check and remove corrupted images
    for subdir, _, files in os.walk(directory):
        for file in files:
            file_path = os.path.join(subdir, file)
            # Only the open/verify calls are guarded: an unrelated OSError
            # (for example from printing) must never cause a valid image to
            # be deleted.
            try:
                with Image.open(file_path) as img:
                    img.verify()  # Verify image file
            except (IOError, SyntaxError) as e:
                print(f"Corrupted image {file_path}: {e}")
                os.remove(file_path)
                corrupted_files += 1
            else:
                checked_files += 1
                print(f"Checked image {file_path}")

    print(f"Checked {checked_files} images.")
    print(f"Removed {corrupted_files} corrupted images.")
    print(f"Total images processed: {total_files}")

# Scan the dataset directory and delete any image that fails verification.
# NOTE(review): this path differs from the Cleanede_Data folders used for the
# train/validation split; confirm it is the intended target.
dataset_dir = r"D:\Ashutosh\Herbs\AllCombinedClasses"
remove_corrupted_images(dataset_dir)
