In [42]:
## Process jobs to filter image/annotation pairs using SSIM similarity threshold.
## Simplified now to just do one job 
    
import os
import cv2
import shutil
import pandas as pd
from skimage.metrics import structural_similarity as ssim

# Function to load thresholds from CSV
#def load_thresholds_from_csv(csv_file):
    #df = pd.read_csv(csv_file)
    #return {row['Job']: row['New threshold'] for _, row in df.iterrows() if str(row['Job']).startswith('Job_')}, df

# Function to check if images are similar
def is_similar(image1, image2, threshold):
    if image1.shape != image2.shape:
        return False
    gray1 = cv2.cvtColor(image1, cv2.COLOR_BGR2GRAY)
    gray2 = cv2.cvtColor(image2, cv2.COLOR_BGR2GRAY)
    score, _ = ssim(gray1, gray2, full=True)
    return score > threshold

# Function to extract numeric part from filenames for sorting
def extract_numeric_part(filename):
    return int(''.join(filter(str.isdigit, filename)))

# Function to process jobs
def process_jobs(parent_folder, dest_folder, job, threshold):
    """
    Process jobs to filter image/annotation pairs using SSIM similarity threshold.
    
    Args:
        parent_folder (str): Path to the parent folder containing job subfolders.
        threshold: threshold
        dest_folder (str): Path to the destination folder.
        job (str): job to process. 
    """
    _, csv_df = load_thresholds_from_csv(csv_file)
    copied_counts = []

    suffix = "_SSIM"

    jobs_to_process = [job] if job else thresholds.keys()

    for job in jobs_to_process:
        if job not in thresholds:
            print(f"Error: Job '{job}' not found in thresholds. Skipping.")
            continue

        threshold = thresholds[job]
        job_folder = os.path.join(parent_folder, job, 'obj_train_data')
        output_folder = os.path.join(dest_folder, job + suffix, 'obj_train_data')

        print(f"Processing {job}:  threshold: {threshold}")

        # Skip job if target folder already contains images
        if os.path.exists(output_folder) and len(os.listdir(output_folder)) > 0:
            print(f"Skipping {job} as images already exist in the target folder.")
            continue

        os.makedirs(output_folder, exist_ok=True)

        # Get sorted list of image files in obj_train_data
        image_files = sorted(
            [f for f in os.listdir(job_folder) if f.lower().endswith(('.jpg', '.png'))],
            key=extract_numeric_part
        )

        prev_image = None
        copied_images_count = 0
        for image_file in image_files:
            image_path = os.path.join(job_folder, image_file)
            text_path = os.path.splitext(image_path)[0] + '.txt'
            image = cv2.imread(image_path)

            # Copy the first image and text file
            if prev_image is None:
                prev_image = image
                shutil.copy(image_path, os.path.join(output_folder, image_file))
                if os.path.exists(text_path):
                    shutil.copy(text_path, os.path.join(output_folder, os.path.basename(text_path)))
                copied_images_count += 1
                continue

            # Handle copy without comparison if threshold is NaN
            if pd.isna(threshold):
                shutil.copy(image_path, os.path.join(output_folder, image_file))
                if os.path.exists(text_path):
                    shutil.copy(text_path, os.path.join(output_folder, os.path.basename(text_path)))
                copied_images_count += 1
            else:
                if not is_similar(prev_image, image, threshold):
                    prev_image = image
                    shutil.copy(image_path, os.path.join(output_folder, image_file))
                    if os.path.exists(text_path):
                        shutil.copy(text_path, os.path.join(output_folder, os.path.basename(text_path)))
                    copied_images_count += 1

        # Log the job and image counts
        total_images_count = len(image_files)
        print(f"From Job '{job}':")
        print(f" - Before processing : {total_images_count}")
        print(f" - Filtered to {job + suffix} after processing: {copied_images_count}")
        retainedpc = round((copied_images_count / total_images_count) * 100, 2)
        print(f" - Retained %: {retainedpc:.2f}")
       
        # Update the copied image count for each job
        copied_counts.append({'Job': job, 'Copied Images': copied_images_count})

    # Merge counts into the original CSV and save it
    counts_df = pd.DataFrame(copied_counts)
    updated_df = csv_df.merge(counts_df, on='Job', how='left')
    updated_df.to_csv(csv_file, index=False)

# Run it 
csv_file = 'ThresholdForSSIM_single.csv'
parent_folder = r'D:\FlagDetectionDatasets\ExportedDatasetsReduced'
destination_folder = r'D:\FlagDetectionDatasets\ExportedDatasetsReduced'

# Load thresholds and process a specific job
# thresholds, _ = load_thresholds_from_csv(csv_file)
job_name = "Job_119"  # These were the flags that are separated in this instance 
process_jobs(parent_folder, destination_folder, job_name, 0.96)


Error: Job 'Job_119' not found in thresholds. Skipping.


KeyError: 'Job'