In [23]:
## Compare and filter frames, Round 2 v2
## Explanation of Updates
## Adding copied_counts: Each job's copied image count is stored in a dictionary list called copied_counts.
## Writing to CSV: After processing all jobs, copied_counts is converted to a DataFrame and merged with the original CSV on the Job column, then saved back to the CSV file with the new column.
## This will update the CSV with the new copied image counts for each job after processing.
## Started at 23:41, Nov 1st
import os
import cv2
import shutil
import pandas as pd
from skimage.metrics import structural_similarity as ssim

# TODO: copy jobs that shouldn't be filtered but are still needed for testing, and update the code

# Function to load thresholds from CSV
def load_thresholds_from_csv(csv_file):
    """Read the threshold CSV and return ``(thresholds, dataframe)``.

    ``thresholds`` maps every value of the 'Job' column whose string form
    starts with 'Job_' to the value in the same row's 'New threshold'
    column (which may be NaN). ``dataframe`` is the complete, unfiltered
    table as read by pandas.
    """
    table = pd.read_csv(csv_file)
    job_thresholds = {}
    # Walk both columns in lockstep; rows that are not jobs are ignored.
    for job_name, job_threshold in zip(table['Job'], table['New threshold']):
        if str(job_name).startswith('Job_'):
            job_thresholds[job_name] = job_threshold
    return job_thresholds, table

# Function to check if images are similar
def is_similar(image1, image2, threshold):
    """Return whether two images score above ``threshold`` on SSIM.

    Images whose shapes differ are never considered similar. Otherwise
    both are converted from BGR to grayscale and compared with
    structural similarity; the result is ``score > threshold``.
    """
    if image1.shape == image2.shape:
        first_gray, second_gray = (
            cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY) for frame in (image1, image2)
        )
        # With full=True ssim returns (score, similarity_map); only the score is used.
        similarity = ssim(first_gray, second_gray, full=True)[0]
        return similarity > threshold
    return False

# Function to extract numeric part from filenames for sorting
def extract_numeric_part(filename):
    """Return the integer formed by concatenating all decimal digits in ``filename``.

    Used as a sort key so that frames are ordered numerically rather than
    lexicographically (e.g. 'frame_10' after 'frame_9').

    Returns -1 when ``filename`` contains no digits, so such files sort
    before every numbered file instead of raising ``ValueError`` from
    ``int('')``.
    """
    # str.isdecimal matches exactly the characters int() can parse;
    # str.isdigit also accepts e.g. superscript digits, which int() rejects.
    digits = ''.join(ch for ch in filename if ch.isdecimal())
    return int(digits) if digits else -1

# Helper: copy one image together with its same-named .txt file (if any)
def _copy_image_with_label(image_path, output_folder):
    """Copy ``image_path`` and, when it exists, its sibling '.txt' file into ``output_folder``."""
    shutil.copy(image_path, os.path.join(output_folder, os.path.basename(image_path)))
    text_path = os.path.splitext(image_path)[0] + '.txt'
    if os.path.exists(text_path):
        shutil.copy(text_path, os.path.join(output_folder, os.path.basename(text_path)))


# Helper: filter the frames of a single job
def _filter_job_images(job_folder, output_folder, image_files, threshold):
    """Copy the retained images of one job and return how many were copied.

    With a NaN ``threshold`` every image is copied without being decoded.
    Otherwise an image is copied only when it is not similar (per
    ``is_similar``) to the most recently copied readable image; the first
    image is always copied.
    """
    compare = not pd.isna(threshold)
    prev_image = None
    copied_images_count = 0

    for image_file in image_files:
        image_path = os.path.join(job_folder, image_file)

        if compare:
            image = cv2.imread(image_path)
            if image is None:
                # cv2.imread returns None for unreadable files. Similarity cannot
                # be judged, so keep the file rather than silently dropping it,
                # and leave the reference frame unchanged.
                print(f"Warning: could not read {image_path}; copying without comparison.")
            elif prev_image is not None and is_similar(prev_image, image, threshold):
                continue
            else:
                prev_image = image

        _copy_image_with_label(image_path, output_folder)
        copied_images_count += 1

    return copied_images_count


# Function to process jobs
def process_jobs(parent_folder, thresholds, dest_folder, csv_file):
    """Filter near-duplicate frames for every job and record the copied counts.

    For each ``job -> threshold`` pair in ``thresholds``, images (.jpg/.png)
    in ``<parent_folder>/<job>/obj_train_data`` are copied, with their
    same-named .txt files, to ``<dest_folder>/<job>/obj_train_data``.
    A NaN threshold copies everything; otherwise consecutive images that
    are similar to the last copied one are dropped.

    A job is skipped when its destination folder already contains files or
    when its source folder does not exist.

    Finally the 'Copied Images' column of ``csv_file`` is created or
    updated for the jobs processed in this run and the CSV is rewritten in
    place. Counts already stored for jobs that were skipped are preserved.
    """
    # Only the table is needed here; thresholds are supplied by the caller.
    csv_df = pd.read_csv(csv_file)
    copied_counts = {}

    for job, threshold in thresholds.items():
        # The threshold is expected as a fraction (e.g. 0.95), not a percentage.
        job_folder = os.path.join(parent_folder, job, 'obj_train_data')
        output_folder = os.path.join(dest_folder, job, 'obj_train_data')

        print(f"Processing {job}:  threshold: {threshold}")

        # Skip job if target folder already contains images
        if os.path.exists(output_folder) and len(os.listdir(output_folder)) > 0:
            print(f"Skipping {job} as images already exist in the target folder.")
            continue

        # Check the source before creating the target so that a missing job
        # neither aborts the run nor leaves an empty output folder behind.
        if not os.path.isdir(job_folder):
            print(f"Skipping {job} as source folder was not found: {job_folder}")
            continue

        os.makedirs(output_folder, exist_ok=True)

        # Get list of image files in obj_train_data, ordered by frame number
        image_files = sorted(
            [f for f in os.listdir(job_folder) if f.lower().endswith(('.jpg', '.png'))],
            key=extract_numeric_part
        )

        copied_images_count = _filter_job_images(job_folder, output_folder, image_files, threshold)
        total_images_count = len(image_files)

        # Print summary for the current child folder
        print(f"Job '{job}':")
        print(f" - Before processing : {total_images_count}")
        print(f" - Filtered after processing: {copied_images_count}")
        # Guard against jobs with no images (avoids ZeroDivisionError).
        if total_images_count:
            retainedpc = round((copied_images_count / total_images_count) * 100, 2)
        else:
            retainedpc = 0.0
        print(f" - Retained %:{retainedpc:.2f}")

        # Update the copied image count for each job
        copied_counts[job] = copied_images_count

    # Write the counts into the 'Copied Images' column. Mapping (instead of
    # merging) works when no job was processed and, on a re-run, updates the
    # existing column rather than producing '_x'/'_y' duplicates.
    new_counts = csv_df['Job'].map(copied_counts)
    if 'Copied Images' in csv_df.columns:
        # Fresh counts win; jobs not processed this run keep their old value.
        new_counts = new_counts.combine_first(csv_df['Copied Images'])
    csv_df['Copied Images'] = new_counts
    csv_df.to_csv(csv_file, index=False)

# Active configuration: the CSV holding the per-job thresholds, the folder
# containing the source job folders, and the folder that receives the filtered copies.
csv_file = 'Thresholds_for_SSIM_f_.csv'
parent_folder = r'D:\FlagDetectionDatasets\ExportedDatasetsExtracted'
destination_folder = r'D:\FlagDetectionDatasets\ExportedDatasetsExtractedStage2'

# Alternative configuration, currently disabled (appears to be a test run setup).
#csv_file = 'Thresholds_for_SSIM_Round2Test.csv'
#parent_folder = r'D:\FlagDetectionDatasets\ExportedDatasetsExtractedTest'
#destination_folder = r'D:\FlagDetectionDatasets\ExportedDatasetsExtractedStage2'

# Load the thresholds for each job, then compare and filter the images using SSIM.
# Note: process_jobs also rewrites csv_file with the copied image counts.
thresholds, _ = load_thresholds_from_csv(csv_file)
process_jobs(parent_folder, thresholds, destination_folder, csv_file)


Processing Job_97:  threshold: nan
Skipping Job_97 as images already exist in the target folder.
Processing Job_96:  threshold: 0.9
Job 'Job_96':
 - Before processing : 300
 - Filtered after processing: 1
 - Retained %:0.33
Processing Job_95:  threshold: nan
Skipping Job_95 as images already exist in the target folder.
Processing Job_88:  threshold: 0.97
Job 'Job_88':
 - Before processing : 708
 - Filtered after processing: 339
 - Retained %:47.88
Processing Job_76:  threshold: 0.97
Job 'Job_76':
 - Before processing : 540
 - Filtered after processing: 535
 - Retained %:99.07
Processing Job_73:  threshold: 0.97
Job 'Job_73':
 - Before processing : 601
 - Filtered after processing: 601
 - Retained %:100.00
Processing Job_70:  threshold: nan
Skipping Job_70 as images already exist in the target folder.
Processing Job_69:  threshold: 0.96
Job 'Job_69':
 - Before processing : 297
 - Filtered after processing: 158
 - Retained %:53.20
Processing Job_65:  threshold: 0.965
Job 'Job_65':
 - Bef