In [30]:
# Step 3a
# Remove similar frames 
# Ran this first as a test - used a constant threshold of 97.5%
# Filtered images based on SSIM; then, in a further step after analysing the results, filtered them based on a new threshold for each job
# Need to run this in a different environment: flags2env
# Switch using Anaconda prompt / launch in Anaconda Navigator
import os
import shutil

import cv2
import pandas as pd
from skimage.metrics import structural_similarity as ssim

# Per-job summary columns, filled in while the job folders are processed
jobs = []
tgtimgs = []      # number of images kept after filtering, per job
srcimgs = []      # number of images found before filtering, per job
retainedpcs = []  # percentage of images kept, per job

# The first pass uses a constant 97.5% threshold; a later stage refines it
# further depending on the results for each job.
#SIMILARITY_THRESHOLD = 0.95
SIMILARITY_THRESHOLD = 0.975
#SIMILARITY_THRESHOLD = 0.98
# Used in the CSV filename. The "g" format keeps all significant digits
# (0.975 -> "0.975"); a fixed two-decimal format would render 0.975 as "0.97"
# and mislabel the output file.
threshold_str = f"{SIMILARITY_THRESHOLD:g}"

parent_folder = r'D:\FlagDetectionDatasets\ExportedDatasetsExtracted'
destination_folder = r'D:\FlagDetectionDatasets\ExportedDatasetsExtractedStage2'
print(f"Similarity threshold: '{SIMILARITY_THRESHOLD}':")

def is_similar(image1, image2, threshold=SIMILARITY_THRESHOLD):
    """Compare two images using SSIM and check if they are similar.

    Both images are converted from BGR to grayscale before the structural
    similarity score is computed.

    Args:
        image1: First image as a BGR array (e.g. the result of cv2.imread).
        image2: Second image, same format.
        threshold: SSIM score above which the images count as similar.

    Returns:
        True if the SSIM score is strictly greater than ``threshold``.
        False if either image is None or the two shapes differ.
    """
    # cv2.imread returns None for files it cannot decode; there is nothing to
    # compare, so report "not similar" instead of failing on `.shape`.
    if image1 is None or image2 is None:
        return False

    # SSIM requires both images to have the same dimensions
    if image1.shape != image2.shape:
        return False  # Treat as not similar if dimensions differ

    gray1 = cv2.cvtColor(image1, cv2.COLOR_BGR2GRAY)
    gray2 = cv2.cvtColor(image2, cv2.COLOR_BGR2GRAY)
    # Only the mean score is needed, so skip building the full SSIM map.
    score = ssim(gray1, gray2)
    return bool(score > threshold)

# File extensions treated as image frames (matched case-sensitively).
_IMAGE_EXTENSIONS = ('.jpg', '.png', '.PNG')


def _copy_frame(image_path, dest_dir):
    """Copy an image and, if it exists, its same-named .txt file into dest_dir."""
    shutil.copy(image_path, os.path.join(dest_dir, os.path.basename(image_path)))
    text_path = os.path.splitext(image_path)[0] + '.txt'
    if os.path.exists(text_path):
        shutil.copy(text_path, os.path.join(dest_dir, os.path.basename(text_path)))


# Iterate over each child folder (one per job) in the parent folder
for child_folder in os.listdir(parent_folder):
    obj_train_data_path = os.path.join(parent_folder, child_folder, 'obj_train_data')
    if not os.path.isdir(obj_train_data_path):
        continue  # Not a job folder with exported frames

    # Prepare the output folder structure
    dest_subfolder = os.path.join(destination_folder, child_folder, 'obj_train_data')
    os.makedirs(dest_subfolder, exist_ok=True)

    # Numerically sorted list of image files: the sort key is the integer
    # between the first underscore and the following dot (e.g. "x_12.jpg" -> 12).
    image_files = sorted(
        (f for f in os.listdir(obj_train_data_path) if f.endswith(_IMAGE_EXTENSIONS)),
        key=lambda name: int(name.split('_')[1].split('.')[0]),
    )

    total_images_count = len(image_files)
    copied_images_count = 0
    prev_image = None  # Last frame that was kept; candidates are compared to it

    for image_file in image_files:
        image_path = os.path.join(obj_train_data_path, image_file)
        image = cv2.imread(image_path)

        if image is None:
            # cv2.imread returns None when a file cannot be decoded. Skip it so
            # it neither crashes the comparison nor resets the baseline frame.
            print(f"Warning: could not read '{image_path}' - skipped")
            continue

        # Keep the first readable frame, then every frame that is not similar
        # to the most recently kept one.
        if prev_image is None or not is_similar(prev_image, image):
            _copy_frame(image_path, dest_subfolder)
            prev_image = image
            copied_images_count += 1

    # An empty folder has nothing to retain; avoid dividing by zero.
    if total_images_count:
        retainedpc = round(copied_images_count / total_images_count * 100, 2)
    else:
        retainedpc = 0.0

    # Append data to respective lists
    jobs.append(child_folder)
    srcimgs.append(total_images_count)
    tgtimgs.append(copied_images_count)
    retainedpcs.append(retainedpc)

    # Print summary for the current child folder
    print(f"Job '{child_folder}':")
    print(f" - Exported : {total_images_count}")
    print(f" - Filtered: {copied_images_count}")
    print(f"Retained %:{retainedpc:.2f}")
  
# Assemble the per-job results into a table and write it out as CSV.
# The threshold is a single scalar, which pandas repeats on every row.
summary_columns = {
    'Job': jobs,
    'Exported from CVAT': srcimgs,
    'Filtered': tgtimgs,
    'Retained %': retainedpcs,
    'Similarity threshold': SIMILARITY_THRESHOLD,
}
df = pd.DataFrame(summary_columns)

# The output file name carries the threshold used for this run.
filename = f'SSIM_Summary_Threshold_{threshold_str}.csv'
df.to_csv(filename, index=False)

print(f"CSV file {filename} has been created.")
print("Processing complete. Summary for all jobs is above.")



Similarity threshold: '0.95':
Job 'Job_123':
 - Exported : 300
 - Filtered: 188
Retained %:62.67
CSV file SSIM_Summary_Threshold_0.95.csv has been created.
Processing complete. Summary for all jobs is above.
