In [2]:
# Step 3 a_Step_x_ReduceDataset_by_pattern.jpynb
# Dataset preparation - Delete patterns or sequences of files to reduce the dataset size - mitigate against overfitting, reduce training time, etc.

import os
import csv
import re

def _collect_image_annotation_pairs(path):
    """
    Return the (image, annotation) file-name pairs found directly in ``path``.

    An image (.png/.jpg/.jpeg, matched case-insensitively) is paired only when a
    file named ``<image base name>.txt`` exists in the same folder. Unpaired
    images and annotations are ignored. Pairs are ordered by image file name.
    """
    files = os.listdir(path)
    # A set keeps the membership test O(1) per image instead of scanning a list.
    annotation_files = {f for f in files if f.lower().endswith('.txt')}
    image_files = sorted(f for f in files if f.lower().endswith(('.png', '.jpg', '.jpeg')))

    paired_files = []
    for image_file in image_files:
        annotation_file = f"{os.path.splitext(image_file)[0]}.txt"
        if annotation_file in annotation_files:
            paired_files.append((image_file, annotation_file))
    return paired_files


def delete_by_pattern(whichfolder, patternInt, folder,
                      sub_folder_train='obj_train_data',
                      csv_file="dataset_reduced_by_pattern.csv"):
    """
    Thin out a dataset folder by keeping one image/annotation pair in every ``patternInt``.

    Pairs are sorted by image file name. The pairs at index 0, patternInt,
    2 * patternInt, ... are kept and every other pair is deleted from disk.
    A summary row ``[folder, pairs before, pairs deleted, pairs remaining]`` is
    appended to ``csv_file``.

    The operation is destructive and NOT idempotent: the folder is re-listed on
    every call, so running it twice with the same arguments thins the data again.

    Args:
        whichfolder (str): The base directory containing the job folders.
        patternInt (int): Keep 1 pair out of every ``patternInt`` (1 keeps everything).
        folder (str): The specific job folder to process.
        sub_folder_train (str | None): Sub-folder inside ``folder`` that holds the
            files. Pass None or '' when the files sit directly in ``folder``.
        csv_file (str): Path of the CSV file the summary row is appended to.

    Raises:
        ValueError: If ``patternInt`` is smaller than 1. Raised before any file
            is touched.
    """
    # Validate first: 0 would otherwise crash with ZeroDivisionError mid-run.
    if patternInt < 1:
        raise ValueError(f"patternInt must be >= 1, got {patternInt!r}")

    print(f"Starting 1/{patternInt} pattern for {folder}")

    # Build the path to the target folder
    path = os.path.join(whichfolder, folder)
    if sub_folder_train:
        path = os.path.join(path, sub_folder_train)

    # Ensure the folder exists (and really is a folder, so listdir cannot fail on a file)
    if not os.path.isdir(path):
        print(f"Error: The folder {path} does not exist.")
        return

    paired_files = _collect_image_annotation_pairs(path)

    # Record the initial count of image-annotation pairs
    total_pairs_before = len(paired_files)
    kept_count = 0
    deleted_count = 0

    # Keep one pair, delete the next patternInt - 1 pairs
    for i, (image_file, annotation_file) in enumerate(paired_files):
        if i % patternInt == 0:
            kept_count += 1
            continue
        try:
            os.remove(os.path.join(path, image_file))
            os.remove(os.path.join(path, annotation_file))
        except OSError as e:
            print(f"Error deleting files {image_file} or {annotation_file}: {e}")
        else:
            # Only count the pair once both files are really gone.
            deleted_count += 1

    # Re-scan instead of assuming kept_count: a failed deletion may leave a pair
    # (or half of one) behind, and the summary should reflect what is on disk.
    total_pairs_after = len(_collect_image_annotation_pairs(path))

    # Write the summary to the CSV file
    try:
        with open(csv_file, mode='a', newline='') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow([folder, total_pairs_before, deleted_count, total_pairs_after])
        print(f"Summary written to {csv_file}")
    except (OSError, csv.Error) as e:
        print(f"Error writing to CSV file: {e}")

    print(f"Process complete. Kept {kept_count} images, deleted {deleted_count} image/annotation pairs from {folder}")


def delete_by_pattern_flat(whichfolder, patternInt, folder,
                           csv_file="dataset_reduced_by_pattern.csv"):
    """
    Same as ``delete_by_pattern`` for folders without an ``obj_train_data`` sub-folder.

    The image/annotation pairs are expected directly inside
    ``<whichfolder>/<folder>``. One pair in every ``patternInt`` is kept, the
    rest are deleted, and a summary row is appended to ``csv_file``.

    Args:
        whichfolder (str): The base directory containing the job folders.
        patternInt (int): Keep 1 pair out of every ``patternInt`` (1 keeps everything).
        folder (str): The specific folder to process.
        csv_file (str): Path of the CSV file the summary row is appended to.

    Raises:
        ValueError: If ``patternInt`` is smaller than 1.
    """
    delete_by_pattern(whichfolder, patternInt, folder,
                      sub_folder_train=None, csv_file=csv_file)


def delete_every_nth_flat(whichfolder, folder, deleteEveryInt):
    """
    Delete every nth image/annotation pair found directly in ``<whichfolder>/<folder>``.

    Images (.png/.jpg/.jpeg, matched case-insensitively) are paired with the
    ``<image base name>.txt`` file in the same folder; unpaired files are
    ignored. Pairs are sorted by image file name and the pairs at 1-based
    positions deleteEveryInt, 2 * deleteEveryInt, ... are removed from disk.

    The operation is destructive and NOT idempotent: the folder is re-listed on
    every call, so running it again deletes a further share of the remainder.

    Args:
        whichfolder (str): The base directory containing the folders.
        folder (str): The specific folder to process.
        deleteEveryInt (int): Delete every nth file pair (1 deletes every pair).

    Raises:
        ValueError: If ``deleteEveryInt`` is smaller than 1. Raised before any
            file is touched.
    """
    # Validate first: 0 would otherwise crash with ZeroDivisionError mid-run.
    if deleteEveryInt < 1:
        raise ValueError(f"deleteEveryInt must be >= 1, got {deleteEveryInt!r}")

    print(f"Starting deletion of every {deleteEveryInt}th file pair in {folder}")

    # Build the path to the target folder
    path = os.path.join(whichfolder, folder)

    # Ensure the folder exists (and really is a folder, so listdir cannot fail on a file)
    if not os.path.isdir(path):
        print(f"Error: The folder {path} does not exist.")
        return

    files = os.listdir(path)
    # A set keeps the membership test O(1) per image instead of scanning a list.
    annotation_files = {f for f in files if f.lower().endswith('.txt')}
    image_files = sorted(f for f in files if f.lower().endswith(('.png', '.jpg', '.jpeg')))

    # Ensure only paired files are considered
    paired_files = []
    for image_file in image_files:
        annotation_file = f"{os.path.splitext(image_file)[0]}.txt"
        if annotation_file in annotation_files:
            paired_files.append((image_file, annotation_file))

    deleted_count = 0

    # start=1 so that position n, 2n, ... (1-based) is the one that gets deleted
    for position, (image_file, annotation_file) in enumerate(paired_files, start=1):
        if position % deleteEveryInt != 0:
            continue
        try:
            os.remove(os.path.join(path, image_file))
            os.remove(os.path.join(path, annotation_file))
        except OSError as e:
            print(f"Error deleting pair {image_file} and {annotation_file}: {e}")
        else:
            # Only report and count the pair once both files are really gone.
            print(f"Deleted: {image_file} and {annotation_file}")
            deleted_count += 1

    print(f"Process complete. Deleted {deleted_count} file pairs from {folder}.")


                          
# Delete files by pattern in the selected set, e.g. delete every 2nd file pair in the set.
# base_path_tgt is the base directory passed as the first argument (whichfolder)
# in the commented-out calls below; switch datasets by (un)commenting one of these lines.
#base_path_tgt = 'D:/FlagDetectionDatasets/ExportedDatasetsReduced'
base_path_tgt = 'D:/FlagDetectionDatasets/ExportedDatasetsSelected'
#delete_every_nth_flat(base_path_tgt, 'Job_21', 3)
#delete_every_nth_flat(base_path_tgt, 'Job_22', 3)
#delete_every_nth_flat(base_path_tgt, 'Job_23', 3)
#delete_every_nth_flat(base_path_tgt, 'Job_23_filter', 3)
#delete_every_nth_flat(base_path_tgt, 'Job_28', 3)
#delete_every_nth_flat(base_path_tgt, 'Job_29', 3)
#delete_every_nth_flat(base_path_tgt, 'Job_30', 3)
#delete_every_nth_flat(base_path_tgt, 'Job_31', 3)
#delete_every_nth_flat(base_path_tgt, 'Job_32', 3)
#delete_every_nth_flat(base_path_tgt, 'Job_36', 3)
#delete_every_nth_flat(base_path_tgt, 'Job_41', 2)
#delete_every_nth_flat(base_path_tgt, 'Job_43', 2)
#delete_every_nth_flat(base_path_tgt, 'Job_51', 2)
#delete_every_nth_flat(base_path_tgt, 'Job_51_Aug', 2)
#delete_every_nth_flat(base_path_tgt, 'Job_52_Aug', 2)

#delete_every_nth_flat(base_path_tgt, 'Job_54', 3) # ran twice 
#delete_every_nth_flat(base_path_tgt, 'Job_54', 3) 
#delete_every_nth_flat(base_path_tgt, 'Job_60', 3) 
#delete_every_nth_flat(base_path_tgt, 'Job_61', 3) 
#delete_every_nth_flat(base_path_tgt, 'Job_71', 2) 
#delete_every_nth_flat(base_path_tgt, 'Job_72', 2) 
#delete_every_nth_flat(base_path_tgt, 'Job_78', 2) 
#delete_every_nth_flat(base_path_tgt, 'Job_87', 2) 
#delete_every_nth_flat(base_path_tgt, 'Job_88', 2) 
#delete_every_nth_flat(base_path_tgt, 'Job_89', 2) 
#delete_every_nth_flat(base_path_tgt, 'Job_104', 2) 
#delete_every_nth_flat(base_path_tgt, 'Job_105', 2) 
#delete_every_nth_flat(base_path_tgt, 'Job_106', 2) 
#delete_every_nth_flat(base_path_tgt, 'Job_118', 2) 
#delete_every_nth_flat(base_path_tgt, 'Job_130', 2) 


##==========================================================================================#

#delete_by_pattern(base_path_tgt, 3, 'Job_106') 
#delete_by_pattern(base_path_tgt, 5, 'Job_108') 
#delete_by_pattern(base_path_tgt, 4, 'Job_70') 

#delete_by_pattern(base_path_tgt, 2, 'Job_7') # frame step of 5 left 0, 5, 10, ...; this reduces it to 0, 10, 20, ...
#delete_by_pattern(base_path_tgt, 2, 'Job_11') # frame step of 5 left 1, 6, 11, ...; this reduces it to 1, 11, 21, ...
#delete_by_pattern(base_path_tgt, 4, 'Job_65') 

#delete_by_pattern(base_path_tgt, 3, 'Job_116')  # DID TWICE 
#delete_by_pattern(base_path_tgt, 3, 'Job_114') 
#Starting 1/3 pattern for Job_114
#Summary written to dataset_reduced_by_pattern.csv
#Process complete. Kept 96 images, deleted 192 image/annotation pairs from Job_114

#Starting 1/3 pattern for Job_116
#Summary written to dataset_reduced_by_pattern.csv
#Process complete. Kept 176 images, deleted 352 image/annotation pairs from Job_116

# Delete sequence - keep 1, delete 4 -  keep 1/5
#delete_by_pattern(base_path_tgt,  folder="Job_15", sub_folder_train = r'obj_train_data',5)
#delete_by_pattern(base_path_tgt,  folder="Job_16", sub_folder_train = r'obj_train_data',5)
#delete_by_pattern(base_path_tgt,  folder="Job_17", sub_folder_train = r'obj_train_data',5)
#delete_by_pattern(base_path_tgt,  folder="Job_18", sub_folder_train = r'obj_train_data',5)

# Keep sequence is keep 1, delete 4, keep 1, delete 4... 
#delete_by_pattern(base_path_tgt, 5, 'Job_29')
#delete_by_pattern(base_path_tgt, 5, 'Job_41')
#delete_by_pattern(base_path_tgt, 4, 'Job_121_filter') 
#delete_by_pattern(base_path_tgt, 4, 'Job_121_a') # kept 2 deleted 8 


#delete_by_pattern_flat(base_path_tgt, 2, 'Job_48') 
#delete_by_pattern_flat(base_path_tgt, 2, 'Job_128') 

#base_path_tgt = 'D:/FlagDetectionDatasets/Augmentation/scaled/augmented'
#base_path_tgt = 'D:/FlagDetectionDatasets/Augmentation'
#delete_by_pattern_flat(base_path_tgt, 10, 'Job_30') 
#base_path_tgt = 'D:/FlagDetectionDatasets/Augmentation'
#delete_by_pattern_flat(base_path_tgt, 10, 'Switch_flag_into') 

#delete_by_pattern_flat(base_path_tgt, 5, 'Job_123') 
#base_path_tgt = 'D:/FlagDetectionDatasets/Augmentation/scaled/augmented'
#delete_by_pattern_flat(base_path_tgt, 5, 'Job_120') 
#delete_by_pattern_flat(base_path_tgt, 5, 'Job_119') 
#delete_by_pattern_flat(base_path_tgt, 5, 'Job_118') 

#delete_by_pattern(base_path_tgt, 5, 'Job_115') 
#Starting 1/5 pattern for Job_115
#Summary written to dataset_reduced_by_pattern.csv
#Process complete. Kept 63 images, deleted 249 image/annotation pairs from Job_115

# delete_by_pattern(base_path_tgt, 5, 'Job_126') # RAN TWICE 
#delete_by_pattern(base_path_tgt, 5, 'Job_128')
#delete_by_pattern(base_path_tgt, 5, 'Job_130')
#delete_by_pattern(base_path_tgt, 5, 'Job_131')  # did this twice 
#delete_by_pattern(base_path_tgt, 5, 'Job_142') # did this twice 
#delete_by_pattern(base_path_tgt, 5, 'Job_143') ## REDO 
#delete_by_pattern(base_path_tgt, 5, 'Job_147')  # Did this twice so 1/10 
#delete_by_pattern(base_path_tgt, 5, 'Job_160')  # WAS IN 5S ALREADY 
#delete_by_pattern(base_path_tgt, 5, 'Job_98')  # WAS IN 5S ALREADY 
#delete_by_pattern(base_path_tgt, 2, 'Job_98')  # then keep 1 in 2 so end result is 1 / 10 

## Example Log from above

#Starting 1/5 pattern for Job_143
#Summary written to dataset_reduced_by_pattern.csv
#Process complete. Kept 27 images, deleted 105 image/annotation pairs from Job_143

#Starting 1/5 pattern for Job_126
#Summary written to dataset_reduced_by_pattern.csv
#Process complete. Kept 156 images, deleted 624 image/annotation pairs from Job_126
#Starting 1/5 pattern for Job_128
#Summary written to dataset_reduced_by_pattern.csv
#Process complete. Kept 107 images, deleted 425 image/annotation pairs from Job_128
#Starting 1/5 pattern for Job_130
#Summary written to dataset_reduced_by_pattern.csv
#Process complete. Kept 111 images, deleted 441 image/annotation pairs from Job_130
#Starting 1/5 pattern for Job_131
#Summary written to dataset_reduced_by_pattern.csv
#Process complete. Kept 108 images, deleted 432 image/annotation pairs from Job_131
#Starting 1/5 pattern for Job_142
#Summary written to dataset_reduced_by_pattern.csv
#Process complete. Kept 106 images, deleted 422 image/annotation pairs from Job_142
#Starting 1/5 pattern for Job_160
#Summary written to dataset_reduced_by_pattern.csv
#Process complete. Kept 59 images, deleted 234 image/annotation pairs from Job_160
#Starting 1/5 pattern for Job_160
#Summary written to dataset_reduced_by_pattern.csv
#Process complete. Kept 59 images, deleted 234 image/annotation pairs from Job_160

# REDOO *********************************************
# Starting 1/5 pattern for Job_143
# Error: The folder D:/FlagDetectionDatasets/ExportedDatasetsReduced\Job_143\obj_train_data does not exist.

# Starting 1/5 pattern for Job_147
# Summary written to dataset_reduced_by_pattern.csv
# Process complete. Kept 106 images, deleted 422 image/annotation pairs from Job_147


# Keep 1, delete 9 pattern  - keep 1/10
#delete_by_pattern(base_path_tgt, 10, 'Job_30') ## CHECK PATTERN CORRECT?
#delete_by_pattern(base_path_tgt, 10, 'Job_31')
#delete_by_pattern(base_path_tgt, 10, 'Job_32')

#delete_by_pattern(base_path_tgt, 8, 'Job_72')  
#delete_by_pattern(base_path_tgt, 10, 'Job_73')  

#delete_by_pattern(base_path_tgt, 10, 'Job_73')
#Starting 1/10 pattern for Job_73
#Summary written to dataset_reduced_by_pattern.csv
#Process complete. Kept 88 images, deleted 788 image/annotation pairs from Job_73

# Delete 
# delete_by_pattern(base_path_tgt,  2, 'Job_21')  # Keep all as frame step of 5 was effective 

# Keep 1/3 sequence 
#delete_by_pattern(base_path_tgt, 3, 'Job_118')

#delete_by_pattern(base_path_tgt, 3, 'Job_117')  

# Keep 1/4 sequence 
#delete_by_pattern(base_path_tgt, 4, 'Job_22')
#delete_by_pattern(base_path_tgt, 10, 'Job_25')
#delete_by_pattern_flat(base_path_tgt, 10, 'Job_27')  
#delete_by_pattern_flat(base_path_tgt, 10, 'Job_30')  
#delete_by_pattern_flat(base_path_tgt, 10, 'Job_31')  
#delete_by_pattern_flat(base_path_tgt, 10, 'Job_43')  
#delete_by_pattern_flat(base_path_tgt, 10, 'Job_51')  
#delete_by_pattern_flat(base_path_tgt, 10, 'Job_60')  

#delete_by_pattern(base_path_tgt, 4, 'Job_55')
#delete_by_pattern(base_path_tgt, 4, 'Job_56')
#delete_by_pattern(base_path_tgt, 4, 'Job_57')
#delete_by_pattern(base_path_tgt, 4, 'Job_59')
#delete_by_pattern(base_path_tgt, 4, 'Job_125')

#delete_by_pattern(base_path_tgt, 5, 'Job_78')  # could delete 1/5 again
#delete_by_pattern_flat(base_path_tgt, 10, 'Job_74')
#delete_by_pattern_flat(base_path_tgt, 10, 'Job_75')
#delete_by_pattern_flat(base_path_tgt, 10, 'Job_77')
#delete_by_pattern(base_path_tgt, 10, 'Job_88')
#delete_by_pattern(base_path_tgt, 5, 'Job_95')  # handheld
#delete_by_pattern(base_path_tgt, 10, 'Job_36')  

#delete_by_pattern(base_path_tgt, 5, 'Job_89')
#delete_by_pattern(base_path_tgt, 5, 'Job_88')

#delete_by_pattern_flat(base_path_tgt, 10, 'Job_98')  

#delete_by_pattern_flat(base_path_tgt, 5, 'Job_109') 
#delete_by_pattern_flat(base_path_tgt, 5, 'Job_105')  

# Example output: 
# Starting 1/4 pattern for Job_22
# Summary written to dataset_reduced_by_pattern.csv
# Process complete. Kept 569 images, deleted 1705 image/annotation pairs from Job_22
#Starting 1/4 pattern for Job_125
#Summary written to dataset_reduced_by_pattern.csv
#Process complete. Kept 96 images, deleted 288 image/annotation pairs from Job_125

# Example for Job_15
# Starting amt: 400. Deleted:  320 Remaining: 80
#sub_folder_train = r'obj_train_data'

## CHECK 
#Starting 1/10 pattern for Job_36
#Summary written to dataset_reduced_by_pattern.csv
#Process complete. Kept 191 images, deleted 1710 image/annotation pairs from Job_36

#Starting 1/5 pattern for Job_120
#Summary written to dataset_reduced_by_pattern.csv
#Process complete. Kept 58 images, deleted 230 image/annotation pairs from Job_120

#Starting 1/5 pattern for Job_120
#Summary written to dataset_reduced_by_pattern.csv
#Process complete. Kept 76 images, deleted 301 image/annotation pairs from Job_120


Deleted from folderToSplice2: Job_121_000000_scaled_3_0.PNG and Job_121_000000_scaled_3_0.txt
Deleted from folderToSplice1: Job_121_000008.PNG and Job_121_000008.txt
Deleted from folderToSplice2: Job_121_000016_scaled_3_0.PNG and Job_121_000016_scaled_3_0.txt
Deleted from folderToSplice1: Job_121_000024.PNG and Job_121_000024.txt
Deleted from folderToSplice2: Job_121_000032_scaled_3_0.PNG and Job_121_000032_scaled_3_0.txt
Deleted from folderToSplice1: Job_121_000040.PNG and Job_121_000040.txt
Deleted from folderToSplice2: Job_121_000048_scaled_3_0.PNG and Job_121_000048_scaled_3_0.txt
Deleted from folderToSplice1: Job_121_000056.PNG and Job_121_000056.txt
Deleted from folderToSplice2: Job_121_000064_scaled_3_0.PNG and Job_121_000064_scaled_3_0.txt
Deleted from folderToSplice1: Job_121_000072.PNG and Job_121_000072.txt
Deleted from folderToSplice2: Job_121_000080_scaled_3_0.PNG and Job_121_000080_scaled_3_0.txt
Deleted from folderToSplice1: Job_121_000088.PNG and Job_121_000088.txt
Dele